BeautifulSoup#
Data collection can be achieved through the creation of ad-hoc scripts employing BeautifulSoup, a Python library that coordinates several modules and further libraries for “pulling data out of HTML and XML files” [].
Scripts included here (and in the book) represent a simplified version of the ones included in [].
Extracting the data#
CATLISM, 162-163
Extract links from HTML pages1CATLISM, 162-163
#
# Import modules for: reading/writing CSV files; splitting filenames from their extension; regular expressions;
# loading files using wildcard patterns; using BeautifulSoup
import csv
import os
import re
from glob import glob

from bs4 import BeautifulSoup

# Header (i.e. the first row containing the column names) of every output CSV file
csv_header = ["link", "downloaded"]
# Regular expression matching any 'href' value that contains 'theses/available'; compiled once, outside the loop
link_pattern = re.compile(r".*?theses/available.*?")

# List all filenames with the .html extension found in the current folder
files = glob("*.html")

# For each found HTML file do:
for file in files:
    # From the original filename, strip the extension (only the final one, whatever its letter case)
    filename = os.path.splitext(file)[0]
    # Read the contents of the file through BeautifulSoup; the 'with' statement closes the file once it is parsed
    with open(file, encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")
    # Create the CSV output file (named after the original HTML one). The file is opened in write mode ('w') so
    # that running the script again replaces the previous output instead of appending a second header and a
    # second copy of every link; newline="" is the setting recommended by the csv module
    with open(filename + "_links.csv", "w", newline="") as file_output:
        writer = csv.writer(file_output)
        # Write the header
        writer.writerow(csv_header)
        # Find all <a> element tags whose link matches the regular expression, and for each one do:
        for link in soup.find_all("a", {"href": link_pattern}):
            # Write the found URL to the output file, marking it as not yet downloaded ('n')
            writer.writerow([link["href"], "n"])
CATLISM, 164-166
Download HTML pages2CATLISM, 164-166
#
# Import modules for: reading/writing CSV files; using regular expressions; working with SSL certificates;
# loading files using wildcard patterns; pausing the script; using BeautifulSoup; controlling Firefox
import csv
import re
import ssl
from glob import glob
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver

# SECURITY NOTE: the assignment below switches off HTTPS certificate verification for Python's default HTTPS
# context, so that pages served with invalid certificates can still be fetched. Adapted from:
# https://stackoverflow.com/questions/50236117/scraping-ssl-certificate-verify-failed-error-for-http-en-wikipedia-org
# NOTE(review): this patches Python's own 'ssl' module; presumably the Firefox instance driven by Selenium
# validates certificates by itself - confirm whether this line is still needed when only the webdriver is used.
ssl._create_default_https_context = ssl._create_unverified_context

# Define the options to pass to Firefox webdriver (i.e. the Selenium mechanisms that will control Firefox through the
# instructions imparted by this script)
options = webdriver.FirefoxOptions()
options.add_argument("--headless")

# Find all the filenames containing the string 'links.csv' preceded by any character(s)
files = glob("*links.csv")

# HTTP headers describing a desktop browser.
# NOTE: this dictionary is not passed to the webdriver anywhere in this script; it is kept for variants of the
# script that download the pages through an HTTP library instead of Firefox.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}

# Regular expression capturing the URN, i.e. the path segment following 'available/' in the link
urn_pattern = re.compile(r".*?available/(.*?)/")

# Create the Firefox webdriver
driver = webdriver.Firefox(options=options)
# The 'try/finally' pair guarantees that the browser is shut down even if the script stops because of an error
try:
    # For each file found, do:
    for file in files:
        # Open and read the file as CSV; the 'with' statement closes it once all the rows have been read
        with open(file, "r", newline="") as csv_file:
            reader_csv = csv.reader(csv_file)
            # Skip the first line of the CSV containing the header
            next(reader_csv, None)
            # Create a list containing all the rows of the CSV
            rows = list(reader_csv)
        # For each row (i.e. each link) in the list do:
        for row in rows:
            # Skip empty rows
            if not row:
                continue
            # Search for the string corresponding to the URN in the link
            urn_search = urn_pattern.search(row[0])
            # Skip rows whose first column does not contain a URN (e.g. a stray header row)
            if urn_search is None:
                continue
            # Extract the URN and store it in the variable 'urn'
            urn = urn_search.group(1)
            # Use the webdriver to read the page indicated by the link
            driver.get(row[0])
            # Read the source code of the page using BeautifulSoup, and store it in the variable 'soup'
            soup = BeautifulSoup(driver.page_source, "lxml")
            # Open the output file inside the subfolder 'downloaded' (which must already exist), using the URN as
            # its filename, followed by '.html'. Write mode ('w') replaces a previously downloaded copy instead of
            # appending a second page to it; UTF-8 is the encoding used by the scripts that later read these files
            with open("downloaded/" + urn + ".html", "w", encoding="utf-8") as file_output:
                # Write the source code of the page into the output file
                file_output.write(str(soup))
            # Wait 4 seconds before restarting the loop
            sleep(4)
finally:
    # Close the browser and end the webdriver session
    driver.quit()
CATLISM, 166
Use requests in script [s5.02a]
3CATLISM, 166
#
# Use 'requests' to download the page from the URL appearing in column 1 of the row. This fragment replaces the
# webdriver call inside the row loop and requires 'import requests' at the top of the script. The timeout (in
# seconds) prevents the script from waiting forever when the server does not answer.
r = requests.get(row[0], timeout=30)
# Read the contents of the HTML page and store them inside of the variable 'soup'
soup = BeautifulSoup(r.text, "lxml")
CATLISM, 166; 168-171
Extract metadata from the downloaded HTML pages4CATLISM, 166; 168-171
#
# Import modules for: reading/writing CSV files; loading files using wildcard patterns; checking whether a file
# exists; using BeautifulSoup
import csv
import glob
import os

from bs4 import BeautifulSoup

# Set the filename of the CSV file where metadata will be/is stored
metadata_file = "metadata_all.csv"

# Names of the columns, in the order in which the values are written to each row
metadata_columns = [
    "doc_id",
    "tipo_tesi",
    "autore",
    "urn",
    "titolo_it",
    "titolo_en",
    "struttura",
    "corso_di_studi",
    "keywords",
    "data",
    "disponibilità",
    "abstract",
]


def _field_text(tbody, label):
    """Return the stripped text of the <td> that follows the <th> whose text is exactly 'label'.

    Raises AttributeError when no <th> carrying that label exists inside 'tbody'.
    """
    header = tbody.find("th", string=label)
    if header is None:
        raise AttributeError("no <th> element labelled " + repr(label) + " was found")
    return header.find_next("td").text.strip()


# The header row is needed only when the metadata file does not exist yet
write_header = not os.path.isfile(metadata_file)

# Create a sorted list of all the filenames with '.html' extension contained in the subfolder 'downloaded'.
# NOTE: the pattern does not descend into subfolders; that would require a '**' component together with
# 'recursive=True'.
files = sorted(glob.glob("./downloaded/*.html"))

# Open the file in 'appending' mode ('a') - so that every time a new content is written to it, it is added to the
# end of the file (the file is created if it does not exist) -, and initiate a 'writer' to write the contents in
# CSV format, using the tab character as delimiter. The 'with' statement closes the file when the script ends;
# newline="" is the setting recommended by the csv module.
with open(metadata_file, "a", encoding="utf-8", newline="") as metadata_handle:
    metadata_writer = csv.writer(metadata_handle, delimiter="\t")
    # Write as first row the names of the columns, if the file has just been created
    if write_header:
        metadata_writer.writerow(metadata_columns)

    # For each filename found do:
    for file in files:
        # Remove the '.html' extension from the filename; the result is used as document identifier
        filename = file.replace(".html", "")
        # Read the contents of the file with BeautifulSoup and store them inside of the variable 'soup'
        with open(file, encoding="utf8") as f:
            soup = BeautifulSoup(f, "lxml")
        # Find the first <table> element tag, and inside it the <tbody> element tag
        table = soup.find("table")
        tbody = table.find("tbody")
        # Find metadata elements by searching for the <th> element tag containing the relevant label, and extract
        # the text from the next <td> element tag (where the metadata value is stored)
        tipotesi = _field_text(tbody, "Tipo di tesi")
        autore = _field_text(tbody, "Autore")
        urn = _field_text(tbody, "URN")
        titolo_it = _field_text(tbody, "Titolo")
        titolo_en = _field_text(tbody, "Titolo in inglese")
        corso_di_studi = _field_text(tbody, "Corso di studi")
        keywords = _field_text(tbody, "Parole chiave")
        data = _field_text(tbody, "Data inizio appello")
        disponibilita = _field_text(tbody, "Disponibilità")
        # The abstract is read from the second <td> element tag spanning two columns
        abstract = tbody.find_all("td", {"colspan": "2"})[1].text

        # Some catalogue cards contain a field named 'Settore scientifico disciplinare' (Disciplinary scientific
        # area), while others have 'Struttura' (Facility) instead. Either way, the resulting value is stored
        # inside of a metadata attribute labelled 'struttura'.
        if tbody.find("th", string="Settore scientifico disciplinare") is not None:
            struttura = _field_text(tbody, "Settore scientifico disciplinare")
        else:
            struttura = _field_text(tbody, "Struttura")

        # Write the extracted metadata elements as one row in the CSV file, in the same order as the column names
        metadata_writer.writerow(
            [
                filename,
                tipotesi,
                autore,
                urn,
                titolo_it,
                titolo_en,
                struttura,
                corso_di_studi,
                keywords,
                data,
                disponibilita,
                abstract,
            ]
        )
CATLISM, 174-175
Download PDF files linked in HTML pages5CATLISM, 174-175
#
# Import modules for: working with paths and folders; regular expressions; downloading data from URLs; loading
# files using wildcard patterns; letting the script wait a number of seconds before proceeding; using BeautifulSoup
import os
import re
import urllib.request
from glob import glob
from time import sleep

from bs4 import BeautifulSoup

# Address of the website, to which the partial links found in the pages are appended
base_url = "https://morethesis.unimore.it"
# Folder where the PDF files are saved; it is created if it does not already exist
output_folder = "pdfs"
os.makedirs(output_folder, exist_ok=True)

# Regular expressions, compiled once: links containing the string 'pdf', and the URN included in a download link
pdf_link_pattern = re.compile(r"pdf")
urn_pattern = re.compile(r"(etd.*?)/")

# Create a list of all the filenames with '.html' extension contained in the subfolder 'downloaded'
files = glob("./downloaded/*.html")

# For each filename found do:
for file in files:
    # Read the contents of the file with BeautifulSoup; the 'with' statement closes the file once it is parsed
    with open(file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")
    # Find every <a> element tag with the string 'pdf' in its link. For each link found, use a counter to preserve
    # the order in which the files appear in the catalogue card, starting from 0 (the file appearing at the top).
    # When the page contains no such link the loop body is never executed, and the next catalogue card is read
    for counter, link in enumerate(soup.find_all("a", {"href": pdf_link_pattern})):
        # Construct the download link by appending the partial link found in the 'href' attribute to the website path
        file_link = base_url + link["href"]
        # Extract the URN from 'file_link'; links that do not include one are skipped
        urn_match = urn_pattern.search(file_link)
        if urn_match is None:
            continue
        urn_code = urn_match.group(1)
        # Extract the original filename from the text of the link, removing surrounding whitespace and any folder
        # component so that the file is always saved inside 'output_folder'; if the link has no usable text, the
        # last part of the link itself is used
        filename = os.path.basename(link.get_text().strip()) or os.path.basename(link["href"])
        # Download the PDF file, assigning it a name according to the structure
        # URN_PROGRESSIVE-NUMBER_ORIGINAL-FILENAME.pdf
        urllib.request.urlretrieve(
            file_link,
            os.path.join(output_folder, urn_code + "_" + str(counter) + "_" + filename),
        )
        # Wait 4 seconds before downloading any other file
        sleep(4)
CATLISM, 176
Extract the contents of PDF files as plain-text6CATLISM, 176
#
# Import modules for: splitting filenames from their extension; loading files using wildcard patterns;
# using 'textract' functionalities
import os
from glob import glob

import textract

# List all filenames with the .pdf extension found in the current folder
files = glob("*.pdf")

# For each filename in the list, do:
for file in files:
    # Remove the extension and save the resulting filename to the variable 'filename'. Only the final extension
    # is removed, so that a '.pdf' string appearing in the middle of the name is preserved
    filename = os.path.splitext(file)[0]
    # Open and process the file through 'textract', using UTF-8 as output encoding
    doc = textract.process(file, output_encoding="utf-8")
    # Create and open the output file, and write the extracted contents as raw bytes ("wb")
    with open(filename + ".txt", "wb") as file_output:
        file_output.write(doc)
CATLISM, 177-180
Create an XML corpus combining the metadata from HTML pages and the contents of PDF files7CATLISM, 177-180
#
1# Import modules for: loading files using regular expression; using regular expressions; using dataframes;
2# reading/writing XML files
3import glob
4import re
5import pandas as pd
6from lxml import etree
7
# Function used to filter out illegal XML characters. In XML 1.0 the only control characters allowed are tab, line
# feed, and carriage return (often interpreted as whitespaces or line-breaks), represented by Unicode code points
# U+0009, U+000A, U+000D (written in hexadecimal format in the function, e.g. 0x9 for U+0009). Function adapted from:
# https://github.com/faizan170/resume-job-match-nlp/blob/573484a9b180950ddd373615e2f09ae163d7b0ae/main.py
def remove_control_characters(c):
    """Tell whether the single character 'c' falls within the code point ranges listed below.

    Despite its name, the function does not remove anything: it returns True when the character is to be kept
    and False when it is to be discarded, so it is meant to be used as a filter, e.g.
    ''.join(c for c in text if remove_control_characters(c)).
    """
    # Read the Unicode code point of the character, and store it into the 'codepoint' variable
    codepoint = ord(c)
    # True if the code point belongs to one of the allowed ranges, False otherwise
    return (
        0x20 <= codepoint <= 0xD7FF
        or codepoint in (0x9, 0xA, 0xD)
        or 0xE000 <= codepoint <= 0xFFFD
        or 0x10000 <= codepoint <= 0x10FFFF
    )
23
# Wildcard pattern (glob syntax, not a regular expression) matching the text files whose name starts with a URN
# followed by the string '_0' - indicating that they are the first file for each single URN
first_file_pattern = (
    "etd-[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9]_0*.txt"
)
# List, in alphabetical order, all the filenames matching the pattern
files = sorted(glob.glob(first_file_pattern))

# Regular expressions, compiled once: the URN at the start of a filename, and a date in the format YYYY-MM-DD
# with each part captured into a group
urn_pattern = re.compile(r"(etd-[0-9]{8}-[0-9]{6})_[0-9]{1,2}.*")
date_pattern = re.compile(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")

# Create a metadata database (mdb) using the metadata csv file; set the urn as index, and remove duplicates.
# This is needed since the same thesis can appear more than once if it is catalogued under different categories on MoreThesis.
mdb = pd.read_csv("metadata_all.csv", sep="\t", encoding="utf-8")
mdb = mdb.set_index("urn")
mdb = mdb.groupby(mdb.index).first()


def _metadata_value(urn, column):
    """Return the value stored in 'mdb' for the given URN and column as a string, or 'na' if the cell is empty.

    Empty cells are read by pandas as missing values, which cannot be assigned to an XML attribute.
    Raises KeyError when the URN is not present in the metadata database.
    """
    value = mdb.loc[urn, column]
    return "na" if pd.isna(value) else str(value)


# For each filename in the found ones:
for file in files:
    # Extract the URN from the filename
    urn = urn_pattern.search(file).group(1)
    # Create the output filename by appending '.xml' to the URN
    output_file = urn + ".xml"
    # Create the root tag element <doc> to include all the generated XML contents
    doc = etree.Element("doc")
    # Assign a number of attributes to <doc>, extracting their values from 'mdb' using the URN as key to find them -
    # except for the URN itself
    doc.attrib["urn"] = urn
    doc.attrib["type"] = _metadata_value(urn, "tipo_tesi")
    # Remove the comma between the author's surname and name
    doc.attrib["author"] = _metadata_value(urn, "autore").replace(",", "")
    doc.attrib["title"] = _metadata_value(urn, "titolo_it")
    # Not all the theses have an English title: '_metadata_value' assigns the value 'na' when it is not available
    doc.attrib["title_en"] = _metadata_value(urn, "titolo_en")
    doc.attrib["department"] = _metadata_value(urn, "struttura")
    doc.attrib["degree"] = _metadata_value(urn, "corso_di_studi")
    # Extract the three parts of the date; when no date in the expected format is found, use 'na' for each part
    date = date_pattern.search(_metadata_value(urn, "data"))
    date_parts = date.groups() if date is not None else ("na", "na", "na")
    doc.attrib["date_y"], doc.attrib["date_m"], doc.attrib["date_d"] = date_parts

    # Create an empty list to contain the cleaned contents of the thesis
    all_thesis_texts = []

    # For each .txt file whose name starts with the processed URN, in alphabetical order, do:
    for f in sorted(glob.glob(urn + "*.txt")):
        # Open the file and read its contents; the 'with' statement closes the file afterwards
        with open(f, "r", encoding="utf8") as text_file:
            one_file = text_file.read()
        # Using the function 'remove_control_characters', keep only the characters that are legal in XML
        cleaned_text = "".join(c for c in one_file if remove_control_characters(c))
        # Add 'cleaned_text' to the list 'all_thesis_texts'
        all_thesis_texts.append(cleaned_text)

    # Assign all the texts in 'all_thesis_texts', separated by a space, as text of the <doc> element tag
    doc.text = " ".join(all_thesis_texts)
    # Build the XML structure with all the elements collected so far
    tree = etree.ElementTree(doc)
    # Write the resulting XML structure to the output file, using utf-8 encoding, adding the XML declaration
    # at the start of the file and graphically formatting the layout ('pretty_print')
    tree.write(output_file, pretty_print=True, xml_declaration=True, encoding="utf-8")
CATLISM, 171-173
Basic structure of the metadata table included in MoreThesis pages8CATLISM, 171-173
#
1<table border="3" cellpadding="5" cellspacing="5" class="metadata_table">
2 <tbody>
3 <tr>
4 <th width="30%">Tipo di tesi</th>
5 <td width="70%">TYPE OF THESIS</td>
6 </tr>
7 <tr>
8 <th>Autore</th>
9 <td>SURNAME, NAME</td>
10 </tr>
11 [...]
12 <tr>
13 <th>Commissione</th>
14 <td>
15 <table>
16 <tbody>
17 <tr>
18 <th align="left">Nome Commissario</th>
19 <th align="left">Qualifica</th>
20 </tr>
21 <tr>
22 <td align="left">SURNAME NAME</td>
23 <td align="left">Primo relatore</td>
24 </tr>
25 <tr>
26 <td align="left">SURNAME NAME</td>
27 <td align="left">Coordinatore Dott Ric</td>
28 </tr>
29 </tbody>
30 </table>
31 </td>
32 </tr>
33 <tr>
34 <th>Parole chiave</th>
35 <td>
36 <ul>
37 <li>KEYWORD1
38 </li>
39 <li>KEYWORD2
40 </li>
41 <li>KEYWORD3
42 </li>
43 <li>KEYWORD4
44 </li>
45 <li>KEYWORD5
46 </li>
47 </ul>
48 </td>
49 </tr>
50 [...]
51 <th>File</th>
52 <td>
53 <table border="2" cellpadding="3" cellspacing="3">
54 <tbody>
55 [...]
56 <tr align="center">
57 <td> </td>
58 <td align="left"><a
59 href="/theses/available/etd-NNNNNNNN-NNNNNN/unrestricted/FILENAME.pdf"><b>FILENAME.pdf</b></a>
60 </td>
61 <td>5.93 Mb</td>
62 <td bgcolor="#cccccc">00:27:28</td>
63 <td>00:14:07</td>
64 <td bgcolor="#cccccc">00:12:21</td>
65 <td>00:06:10</td>
66 <td bgcolor="#cccccc">00:00:31</td>
67 </tr>
68 [...]
69 </tbody>
70 </table>
71 </td>
72 </tbody>
73</table>