Facebook#
Data can be collected from Facebook using facebook-scraper.
Options and arguments for the tool can be found in the official documentation.
CATLISM, 227
Installing the tool (CATLISM, 227)
#
pip install facebook-scraper
[c5.26]
CATLISM, 227
Using the tool (CATLISM, 227)
#
facebook-scraper --filename OUTPUT.json --format json --comments --pages N PROFILE_NAME
Extracting the data#
CATLISM, 237-242
Extract data from posts3CATLISM, 237-242
#
# Import modules for: regular expressions; reading JSON files; locating files
# through wildcard patterns; building XML trees; parsing timestamps into date objects
import re
import json
import glob
from lxml import etree
from dateutil import parser

# Regular expression matching runs of two or more commas between JSON objects,
# compiled once and written as a raw string so the backslashes reach the regex
# engine unchanged (a plain string here triggers invalid-escape warnings).
# It works around a bug in facebook-scraper (v0.2.59) whereby more than one comma
# may be inserted between JSON objects, rendering the contents unparsable by Python
EXTRA_COMMAS = re.compile(r"\},{2,}\{")

# Process every file with a .json extension in the current folder
for single_file in glob.glob("*.json"):
    # Strip the extension from the filename; the result names the output XML file
    filename = single_file.replace(".json", "")

    # Read the whole file; the 'with' block guarantees the file is closed afterwards
    with open(single_file, encoding="utf-8") as f:
        contents = f.read()
    # Collapse runs of commas between objects into a single one (see EXTRA_COMMAS above)
    contents = EXTRA_COMMAS.sub("},{", contents)

    # Load the repaired contents as JSON
    data = json.loads(contents)
    # Create the <text> root element tag where the post contents and details are stored
    text_tag = etree.Element("text")

    # For each object in the JSON, corresponding to one post, do:
    for post in data:
        # Create the <post> element tag to enclose one single post
        post_tag = etree.SubElement(text_tag, "post")
        # Assign a set of attributes to <post> using the extracted metadata
        # data-points, and store the textual content of the post as the tag's text
        post_tag.attrib["id"] = post["post_id"]
        post_tag.attrib["author"] = post["username"]
        post_tag.attrib["author_id"] = str(post["user_id"])
        post_tag.attrib["comments"] = str(post["comments"])
        post_tag.attrib["shares"] = str(post["shares"])
        post_tag.text = post["text"]

        # Parse the timestamp into a datetime object, then store its day, month and
        # year numbers in the attributes 'date_d', 'date_m', and 'date_y' respectively
        post_date = parser.parse(post["time"])
        post_tag.attrib["date_d"] = str(post_date.day)
        post_tag.attrib["date_m"] = str(post_date.month)
        post_tag.attrib["date_y"] = str(post_date.year)

        # If the number of 'likes' or 'reactions' is missing (None), store 0 instead
        post_tag.attrib["likes"] = str(post["likes"] if post["likes"] is not None else 0)
        post_tag.attrib["reactions_count"] = str(
            post["reaction_count"] if post["reaction_count"] is not None else 0
        )

        # Check if details concerning reactions are present, i.e. if the dictionary
        # for reactions exists and it contains a non-zero 'sad' entry
        reactions = post.get("reactions")
        if isinstance(reactions, dict) and reactions.get("sad"):
            # If present, store the total number of 'sad' reactions
            post_tag.attrib["reaction_sad"] = str(reactions.get("sad"))
        else:
            # If not, store the value 0
            post_tag.attrib["reaction_sad"] = "0"

        # Check whether the array of comments is present: if the data was collected
        # without '--comments' the array is always empty; if '--comments' was used,
        # it may still be empty when no comments were made to the post.
        # If missing, proceed with the next post
        if post["comments_full"] is None:
            continue

        # For each found comment, do:
        for comment in post["comments_full"]:
            # Create the <comment> element tag to enclose the contents of the comment
            comment_tag = etree.SubElement(post_tag, "comment")
            # Assign a set of attributes to <comment>, including 'type' with value 'c'
            # indicating this is a comment and not a reply to a comment (which would
            # be identified by the value 'r'); the textual content of the comment
            # becomes the tag's text
            comment_tag.attrib["type"] = "c"
            comment_tag.attrib["comment_to"] = post["post_id"]
            comment_tag.attrib["id"] = comment["comment_id"]
            comment_tag.attrib["author"] = comment["commenter_name"]
            comment_tag.attrib["author_id"] = comment["commenter_id"]

            # Parse the comment timestamp into a datetime object if present; day,
            # month and year go into 'date_d', 'date_m' and 'date_y', or 'na' when
            # no timestamp is available
            comment_date = (
                parser.parse(comment["comment_time"])
                if comment["comment_time"] is not None
                else None
            )

            comment_tag.attrib["date_d"] = str(
                comment_date.day if comment_date is not None else "na"
            )
            comment_tag.attrib["date_m"] = str(
                comment_date.month if comment_date is not None else "na"
            )
            comment_tag.attrib["date_y"] = str(
                comment_date.year if comment_date is not None else "na"
            )
            comment_tag.text = comment["comment_text"]

            # Check if the array of replies exists; if it does not, proceed with the next item
            if not comment["replies"]:
                continue

            # For each reply found, do:
            for reply in comment["replies"]:
                # Create a <comment> element tag to enclose the contents of the reply
                reply_tag = etree.SubElement(post_tag, "comment")

                # Assign a set of attributes to <comment>, including 'type' with value
                # 'r' indicating this is a reply to a comment and not a direct comment
                # to the post (identified by the value 'c'); the textual content of the
                # reply becomes the tag's text
                reply_tag.attrib["type"] = "r"
                reply_tag.attrib["comment_to"] = comment["comment_id"]
                reply_tag.attrib["id"] = reply["comment_id"]
                reply_tag.attrib["author"] = reply["commenter_name"]
                reply_tag.attrib["author_id"] = reply["commenter_id"]
                reply_date = (
                    parser.parse(reply["comment_time"])
                    if reply["comment_time"] is not None
                    else None
                )
                reply_tag.attrib["date_d"] = str(
                    reply_date.day if reply_date is not None else "na"
                )
                reply_tag.attrib["date_m"] = str(
                    reply_date.month if reply_date is not None else "na"
                )
                reply_tag.attrib["date_y"] = str(
                    reply_date.year if reply_date is not None else "na"
                )
                reply_tag.text = reply["comment_text"]

    # Build the XML structure with all the elements collected so far
    tree = etree.ElementTree(text_tag)
    # Write the resulting XML structure to a file named after the input file, using
    # utf-8 encoding, adding the XML declaration at the start of the file and
    # graphically formatting the layout ('pretty_print')
    tree.write(
        filename + ".xml", pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
How to use script [s5.10]
#
Copy/download the file s5.10_extract_facebookscraper-posts_json.py
inside the folder where the data downloaded through facebook-scraper
(e.g. through c5.27
) resides; then navigate into the folder using the terminal, e.g.
cd Downloads/facebook_data/
At last, run the script from the terminal:
python s5.10_extract_facebookscraper-posts_json.py
CATLISM, 242-245
Extract data from profiles4CATLISM, 242-245
#
1# Import facebook_scraper module get_profile to get the profile details
2from facebook_scraper import get_profile
3
4# Define the function 'get_profile_details'
def get_profile_details(profile_id, profiles_db):
    """Extract details from a Facebook profile.

    Inputs are the profile ID and the pandas dataframe in which previously
    downloaded details are cached (so each profile is downloaded only once).
    Returns a tuple with the following elements:

    friend_count = profile_details[0]
    follower_count = profile_details[1]
    following_count = profile_details[2]
    basic_info = profile_details[3]
    about = profile_details[4]

    Assumes the existence of:
    - a file named 'cookies.json' in the current path, containing the Facebook cookies
    - a dataframe with 'profile_id' as index, e.g.:
      profiles_db = pd.DataFrame(
          columns=[
              "profile_id",
              "friend_count",
              "follower_count",
              "following_count",
              "basic_info",
              "about",
          ],
      ).set_index("profile_id")
    """

    # If details for the profile ID have already been downloaded:
    if profile_id in profiles_db.index:
        # Get the details from the already-downloaded data stored in the 'profiles_db' dataframe
        friend_count = profiles_db.loc[profile_id, "friend_count"]
        follower_count = profiles_db.loc[profile_id, "follower_count"]
        following_count = profiles_db.loc[profile_id, "following_count"]
        basic_info = profiles_db.loc[profile_id, "basic_info"]
        about = profiles_db.loc[profile_id, "about"]
        # Assign all the extracted details to a tuple labelled 'collected_details'
        collected_details = (
            friend_count,
            follower_count,
            following_count,
            basic_info,
            about,
        )
        # Output the tuple with the details
        return collected_details

    # If the details for the selected profile ID are not present in the 'profiles_db' dataframe:
    else:
        # Download the details using the 'get_profile' function from facebook-scraper; this
        # requires the Facebook cookies to be stored in the 'cookies.json' file (the same
        # filename stated in the docstring above) in the folder where the script is run,
        # passed to the function through the 'cookies=' argument. Details on how to export
        # cookies from the web browser are available in the tool's official documentation
        profile_details = get_profile(profile_id, cookies="cookies.json")
        # From the data downloaded by facebook-scraper extract only a number of details,
        # saving each one of them to a separate variable
        friend_count = str(profile_details["Friend_count"])
        follower_count = str(profile_details["Follower_count"])
        following_count = str(profile_details["Following_count"])
        basic_info = profile_details["Basic info"]
        # Extract the contents of the data-point 'About' if it exists; if it does not,
        # assign the string 'None' instead
        about = profile_details.get("About", "None")
        # Assign all the extracted details to a tuple labelled 'collected_details'
        collected_details = (
            friend_count,
            follower_count,
            following_count,
            basic_info,
            about,
        )
        # Add the downloaded details to the 'profiles_db' dataframe so that subsequent
        # calls for the same profile reuse them instead of downloading again
        profiles_db.loc[profile_id] = collected_details
        # Output the tuple with the details
        return collected_details
CATLISM, 245-246
Implement the collection of profile details ([s5.11]
) into [s5.10]
5CATLISM, 245-246
#
# Assuming that script [s5.11] has been saved locally to a file named 'get_profile_details.py',
# it can be implemented into script [s5.10] by adding the following lines where indicated in the comments

# Add the following imports at the bottom of the 'import' section, to use pandas dataframes
# and the get_profile_details function
import pandas as pd
from get_profile_details import get_profile_details

# Add the following lines after the import section to create a pandas dataframe for
# storing the downloaded details, indexed by profile ID (these lines belong at the
# top level of the script, i.e. without any extra indentation)
profiles_db = pd.DataFrame(
    columns=[
        "profile_id",
        "friend_count",
        "follower_count",
        "following_count",
        "basic_info",
        "about",
    ],
).set_index("profile_id")

# Whenever details from a profile need to be extracted, the function 'get_profile_details'
# can be applied to the extracted profile ID; the 'profiles_db' dataframe must be passed
# as the second argument, since the function requires it to cache already-downloaded
# profiles. For example, to download details regarding the author of a comment:
commenter_details = get_profile_details(comment["commenter_id"], profiles_db)

# Each detail can then be extracted to an XML attribute using e.g.:
comment_tag.attrib["author_friends"] = commenter_details[0]
comment_tag.attrib["author_current_place"] = commenter_details[3]
CATLISM, 236-237
Example of data extracted with [s5.10]
6CATLISM, 236-237
#
1<?xml version='1.0' encoding='UTF-8'?>
2<text>
3 <post id="UNIQUE_POST_ID" author="USER_FULL_NAME" author_id="UNIQUE_AUTHOR_ID" comments="NUMBER" shares="NUMBER" date_d="NUMBER" date_m="NUMBER" date_y="NUMBER" likes="NUMBER" reactions_count="NUMBER" reaction_sad="NUMBER">
4 POST TEXTUAL CONTENTS
5 <comment type="c" comment_to="UNIQUE_POST_ID" id="UNIQUE_COMMENT_ID" author="USER_FULL_NAME" author_id="UNIQUE_AUTHOR_ID" date_d="NUMBER" date_m="NUMBER" date_y="NUMBER">COMMENT TEXTUAL CONTENTS</comment>
6 <comment type="r" comment_to="UNIQUE_COMMENT_ID" id="UNIQUE_COMMENT_ID" author="USER_FULL_NAME" author_id="UNIQUE_AUTHOR_ID" date_d="NUMBER" date_m="NUMBER" date_y="NUMBER">REPLY TEXTUAL CONTENTS</comment>
7 </post>
8</text>