BeautifulSoup#

Data collection can be achieved by writing ad-hoc scripts that employ BeautifulSoup, a Python library that works on top of different parsers and related libraries for “pulling data out of HTML and XML files” [Richardson, 2023].
The scripts included here (and in the book) are simplified versions of the ones included in [Bondi and Di Cristofaro, 2023].
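As a minimal, self-contained sketch of what “pulling data out of HTML” looks like in practice (the HTML fragment below is invented for demonstration and is not part of the book's scripts), BeautifulSoup can parse a string of markup and extract the text of an element or the value of one of its attributes:

# A minimal sketch: parse a short, invented HTML fragment and extract a tag's text and an attribute value
from bs4 import BeautifulSoup

html = '<html><body><p class="title">An example thesis</p><a href="/files/thesis.pdf">Download</a></body></html>'
soup = BeautifulSoup(html, "lxml")
# Print the text contained in the <p> element tag
print(soup.find("p").get_text())
# Print the value of the 'href' attribute of the <a> element tag
print(soup.find("a")["href"])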

Extracting the data#


Download HTML pages (CATLISM, 164-166)#

Script [s5.02a] #
# Import modules for: reading/writing CSV files; using regular expressions; working with SSL certificates;
# listing files matching a wildcard pattern; pausing the script; using BeautifulSoup
import csv
import re
import ssl
from glob import glob
from time import sleep
from bs4 import BeautifulSoup

# The variable and the import below allow the crawler to browse https pages when invalid certificates are used in the
# website to be scraped. Adapted from:
# https://stackoverflow.com/questions/50236117/scraping-ssl-certificate-verify-failed-error-for-http-en-wikipedia-org
ssl._create_default_https_context = ssl._create_unverified_context
from selenium import webdriver

# Define the options to pass to the Firefox webdriver (i.e. the Selenium mechanism that will control Firefox through the
# instructions imparted by this script)
options = webdriver.FirefoxOptions()
# Run Firefox in 'headless' mode, i.e. without opening a visible browser window
options.add_argument("--headless")
# Create the Firefox webdriver
driver = webdriver.Firefox(options=options)

# Find all the filenames ending with the string 'links.csv', preceded by any character(s)
files = glob("*links.csv")

# Define a set of browser-like HTTP headers (note: this dictionary is not used by the Selenium-driven Firefox in this
# script; it may be passed to 'requests' when using the variant shown in script [s5.02b])
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
}

# For each file found, do:
for file in files:
    # Open and read the file as CSV
    reader_csv = csv.reader(open(file, "r"))
    # Skip the first line of the CSV containing the header
    next(reader_csv, None)
    # Create a list containing all the rows of the CSV
    rows = [r for r in reader_csv]
    # For each row (i.e. each link) in the list do:
    for row in rows:
        # Search for the string corresponding to the URN in the link
        urn_search = re.search(r".*?available/(.*?)/", row[0])
        # Extract the URN and store it in the variable 'urn'
        urn = urn_search.group(1)
        # Use the webdriver to read the page indicated by the link
        driver.get(row[0])
        # Read the source code of the page using BeautifulSoup, and store it in the variable 'soup'
        soup = BeautifulSoup(driver.page_source, "lxml")
        # Open the output file inside the subfolder 'downloaded', using the URN as its filename, followed by '.html'
        with open("downloaded/" + urn + ".html", "a") as file_output:
            # Write the source code of the page into the output file
            file_output.write(str(soup))
        # Wait 4 seconds before restarting the loop
        sleep(4)

Use requests in script [s5.02a] (CATLISM, 166)#

Script [s5.02b] #
# Note: these lines replace the Selenium-based download (driver.get() and driver.page_source) inside the loop of
# script [s5.02a], and require 'import requests' to be added to its imports
# Use 'requests' to download the page from the URL appearing in column 1 of the row
r = requests.get(row[0])
# Read the contents of the HTML page and store them inside of the variable 'soup'
soup = BeautifulSoup(r.text, "lxml")
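As a self-contained illustration of the same requests-based step (not part of the book's scripts; the URL and output filename below are placeholders, and the 'downloaded' subfolder is assumed to exist):

# Self-contained sketch of the requests-based download step; the URL and the output filename are placeholders
import requests
from bs4 import BeautifulSoup

url = "https://morethesis.unimore.it/theses/available/etd-NNNNNNNN-NNNNNN/"  # placeholder URL
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
# Save the downloaded page source inside the (pre-existing) 'downloaded' subfolder
with open("downloaded/etd-NNNNNNNN-NNNNNN.html", "w", encoding="utf-8") as file_output:
    file_output.write(str(soup))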

Extract metadata from the downloaded HTML pages (CATLISM, 166; 168-171)#

Script [s5.03] #
# Import modules for: checking whether a file exists; listing files matching a wildcard pattern; reading/writing CSV files;
# using BeautifulSoup
import os
import glob
import csv
from bs4 import BeautifulSoup

# Set the filename of the CSV file where metadata will be/is stored
metadata_file = "metadata_all.csv"

# Check if the file already exists; if it does, then:
if os.path.isfile(metadata_file):
    # Open the file in 'appending' mode ('a') - so that every time new content is written to it, it is added to the end of
    # the file - and initiate a 'writer' to write the contents in CSV format, using the tab character as delimiter
    metadata_writer = csv.writer(
        open(metadata_file, "a", encoding="utf-8"), delimiter="\t"
    )
# If the file does not exist:
else:
    # Create the file in 'appending' mode ('a') - so that every time new content is written to it, it is added to the end of
    # the file - and initiate a 'writer' to write the contents in CSV format, using the tab character as delimiter
    metadata_writer = csv.writer(
        open(metadata_file, "a", encoding="utf-8"), delimiter="\t"
    )
    # Write as first row the names of the columns
    metadata_writer.writerow(
        [
            "doc_id",
            "tipo_tesi",
            "autore",
            "urn",
            "titolo_it",
            "titolo_en",
            "struttura",
            "corso_di_studi",
            "keywords",
            "data",
            "disponibilità",
            "abstract",
        ]
    )

# Create a sorted list of all the filenames with '.html' extension contained in the subfolder 'downloaded'
# (note: with this pattern 'recursive=True' has no effect, as no '**' wildcard is used)
files = sorted(glob.glob("./downloaded/*.html", recursive=True))

# For each filename found do:
for file in files:
    # Open the file
    f = open(file, encoding="utf8")
    # Remove the '.html' extension from the filename
    filename = file.replace(".html", "")
    # Read the contents of the file with BeautifulSoup and store them inside of the variable 'soup'
    soup = BeautifulSoup(f, "lxml")
    # Find the <table> element tag and assign its contents to the variable 'table'
    table = soup.find("table")
    # Inside <table>, find the <tbody> element tag and store its contents inside the variable 'tbody'
    tbody = table.find("tbody")
    # Find metadata elements by searching for the <th> element tag containing the relevant label (indicated by 'text="LABEL"'),
    # and extract the text from the next adjacent <td> element tag (where the metadata value is stored)
    tipotesi = tbody.find("th", text="Tipo di tesi").find_next("td").text.strip()
    autore = tbody.find("th", text="Autore").find_next("td").text.strip()
    urn = tbody.find("th", text="URN").find_next("td").text.strip()
    titolo_it = tbody.find("th", text="Titolo").find_next("td").text.strip()
    titolo_en = tbody.find("th", text="Titolo in inglese").find_next("td").text.strip()
    corso_di_studi = (
        tbody.find("th", text="Corso di studi").find_next("td").text.strip()
    )
    keywords = tbody.find("th", text="Parole chiave").find_next("td").text.strip()
    data = tbody.find("th", text="Data inizio appello").find_next("td").text.strip()
    disponibilita = tbody.find("th", text="Disponibilità").find_next("td").text.strip()
    abstract = tbody.find_all("td", {"colspan": "2"})[1].text

    # The following verification is required as some catalogue cards contain a field named 'Settore scientifico disciplinare'
    # (Disciplinary scientific area), while others have 'Struttura' (Facility) instead. Either way, the resulting value is stored
    # inside of a metadata attribute labelled 'struttura'.
    # Check if a <th> tag with value 'Settore scientifico disciplinare' exists; if so:
    if tbody.find("th", text="Settore scientifico disciplinare") is not None:
        # Extract the value and save it to the variable 'struttura'
        struttura = (
            tbody.find("th", text="Settore scientifico disciplinare")
            .find_next("td")
            .text.strip()
        )
    # If it does not exist, extract the value from the <th> element tag with value 'Struttura'
    else:
        struttura = tbody.find("th", text="Struttura").find_next("td").text.strip()

    # Save all the extracted metadata elements to a list called 'metadata_line', in the order they are to be written in the output CSV
    metadata_line = [
        filename,
        tipotesi,
        autore,
        urn,
        titolo_it,
        titolo_en,
        struttura,
        corso_di_studi,
        keywords,
        data,
        disponibilita,
        abstract,
    ]
    # Write the values stored in 'metadata_line' as one row in the CSV file
    metadata_writer.writerow(metadata_line)

Download PDF files linked in HTML pages (CATLISM, 174-175)#

Script [s5.04] #
# Import modules for: using regular expressions; downloading data from URLs; listing files matching a wildcard pattern;
# pausing the script for a number of seconds before proceeding; using BeautifulSoup
import re
import urllib.request
from glob import glob
from time import sleep
from bs4 import BeautifulSoup

# Create a list of all the filenames with '.html' extension contained in the subfolder 'downloaded'
files = glob("./downloaded/*.html")

# For each filename found do:
for file in files:
    # Open the file
    f = open(file, "r", encoding="utf-8")
    # Read the contents of the file with BeautifulSoup and store them inside of the variable 'soup'
    soup = BeautifulSoup(f, "lxml")
    # Check if at least one <a> element tag with the string 'pdf' in its link is present in the HTML code
    # (i.e. if the page contains at least one link to a PDF file); if so do:
    if soup.find_all("a", {"href": re.compile("pdf")}):
        # For each link found, initiate a counter to preserve the order in which the files appear in the
        # catalogue card, starting from 0 (the file appearing at the top), and do:
        for counter, link in enumerate(soup.find_all("a", {"href": re.compile("pdf")})):
            # Construct the download link by prepending the website address to the relative link found in the 'href' attribute
            file_link = "https://morethesis.unimore.it" + (link["href"])
            # Extract the URN from 'file_link', and assign it to the variable 'urn_code'
            urn_code = re.search("(etd.*?)/", file_link).group(1)
            # Extract the original filename from 'link'
            filename = link.get_text()
            # Download the PDF file(s) to the sub-folder 'pdfs' (it must be created manually if it does not already exist),
            # assigning each file a name according to the structure URN_PROGRESSIVE-NUMBER_ORIGINAL-FILENAME.pdf
            urllib.request.urlretrieve(
                file_link, "pdfs/" + urn_code + "_" + str(counter) + "_" + filename
            )
            # Wait 4 seconds before downloading any other file
            sleep(4)
    # If no <a> element tag with the string 'pdf' is found, move to the next catalogue card
    else:
        continue

Extract the contents of PDF files as plain text (CATLISM, 176)#

Script [s5.05] #
# Import modules for: listing files matching a wildcard pattern; using 'textract' functionalities
from glob import glob
import textract

# List all filenames with the .pdf extension
files = glob("*.pdf")

# For each filename in the list, do:
for file in files:
    # Remove the '.pdf' extension and save the resulting filename to the variable 'filename'
    filename = file.replace(".pdf", "")
    # Open and process the file through 'textract', using UTF-8 as output encoding
    doc = textract.process(file, output_encoding="utf-8")
    # Create and open the output file, and write the extracted contents as raw bytes ('wb')
    with open(filename + ".txt", "wb") as file_output:
        file_output.write(doc)

Create an XML corpus combining the metadata from HTML pages and the contents of PDF files (CATLISM, 177-180)#

Script [s5.06] #
# Import modules for: listing files matching a wildcard pattern; using regular expressions; using dataframes;
# reading/writing XML files
import glob
import re
import pandas as pd
from lxml import etree

# Create a function to remove illegal XML characters. These are control characters identified by code points included in the
# ranges defined in the 'return' output. In XML 1.0 the only control characters allowed are tab, line feed, and carriage return (often
# interpreted as whitespaces or line-breaks), represented by Unicode code points U+0009, U+000A, U+000D (written in hexadecimal
# format in the function, e.g. 0x9 for U+0009). Function adapted from:
# https://github.com/faizan170/resume-job-match-nlp/blob/573484a9b180950ddd373615e2f09ae163d7b0ae/main.py
def remove_control_characters(c):
    # Read the Unicode code point of the character, and store it into the 'codepoint' variable
    codepoint = ord(c)
    # Return True if the character is an allowed XML one, and False otherwise
    return (
        0x20 <= codepoint <= 0xD7FF or
        codepoint in (0x9, 0xA, 0xD) or
        0xE000 <= codepoint <= 0xFFFD or
        0x10000 <= codepoint <= 0x10FFFF
    )

# Create an empty list to store the found filenames
list_of_filenames = []
# Compile a regular expression matching the filenames that contain the string '_0' - indicating that they are the first
# file for each single URN
urnRegex = re.compile(
    "etd-[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9]_0*.txt"
)
# List all filenames matching the URN pattern plus the '_0' indicating that they are the first file
# for each single URN
files = sorted(
    glob.glob(
        "etd-[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9]_0*.txt"
    )
)
# Add the found filenames to the list 'list_of_filenames'
list_of_filenames.append(files)


# Create a metadata database (mdb) using the metadata CSV file; set the urn as index, and remove duplicates.
# This is needed since the same thesis can appear more than once if it is catalogued under different categories on MoreThesis.
mdb = pd.read_csv("metadata_all.csv", sep="\t", encoding="utf-8")
mdb = mdb.set_index("urn")
mdb = mdb.groupby(mdb.index).first()

# For each filename in the found ones:
for file in files:
    # Extract the URN from the filename
    urn = re.search("(etd-[0-9]{8}-[0-9]{6})_[0-9]{1,2}.*", file).group(1)
    # Create the output filename by appending '.xml' to the URN
    output_file = urn + ".xml"
    # Create the root tag element <doc> to include all the generated XML contents
    doc = etree.Element("doc")
    # Assign a number of attributes to <doc>, extracting their values from 'mdb' using the URN as key to find them - except
    # for the URN itself
    doc.attrib["urn"] = urn
    doc.attrib["type"] = mdb.loc[urn, "tipo_tesi"]
    # Remove the comma between the author's surname and name
    doc.attrib["author"] = re.sub(",", "", mdb.loc[urn, "autore"])
    doc.attrib["title"] = mdb.loc[urn, "titolo_it"]
    # As not all the theses have an English title, check whether one is present, and assign the value 'na' when it is not
    titolo_en = mdb.loc[urn, "titolo_en"]
    doc.attrib["title_en"] = titolo_en if pd.notna(titolo_en) else "na"
    doc.attrib["department"] = mdb.loc[urn, "struttura"]
    doc.attrib["degree"] = mdb.loc[urn, "corso_di_studi"]
    # Extract the date (in the format YYYY-MM-DD) and capture each part into a group
    date = re.search("([0-9]{4})-([0-9]{2})-([0-9]{2})", mdb.loc[urn, "data"])
    doc.attrib["date_y"] = date.group(1)
    doc.attrib["date_m"] = date.group(2)
    doc.attrib["date_d"] = date.group(3)

    # Create an empty list to contain the cleaned contents of the thesis
    all_thesis_texts = []

    # For each .txt file containing the processed URN, do:
    for f in sorted(glob.glob(urn + "*.txt")):
        # Open the file and read its contents
        one_file = open(f, "r", encoding="utf8").read()
        # Using the function 'remove_control_characters', clean the contents from characters that are illegal in XML, and store the
        # resulting cleaned text in the variable 'cleaned_text'
        cleaned_text = ''.join(c for c in one_file if remove_control_characters(c))
        # Add 'cleaned_text' to the list 'all_thesis_texts'
        all_thesis_texts.append(cleaned_text)

    # Assign all the texts in 'all_thesis_texts' as text of the <doc> element tag
    doc.text = " ".join(all_thesis_texts)
    # Build the XML structure with all the elements collected so far
    tree = etree.ElementTree(doc)
    # Write the resulting XML structure to the output file, using utf-8 encoding, adding the XML declaration
    # at the start of the file and graphically formatting the layout ('pretty_print')
    tree.write(output_file, pretty_print=True, xml_declaration=True, encoding="utf-8")

Basic structure of the metadata table included in MoreThesis pages (CATLISM, 171-173)#

Example [e5.08]#
<table border="3" cellpadding="5" cellspacing="5" class="metadata_table">
    <tbody>
        <tr>
            <th width="30%">Tipo di tesi</th>
            <td width="70%">TYPE OF THESIS</td>
        </tr>
        <tr>
            <th>Autore</th>
            <td>SURNAME, NAME</td>
        </tr>
        [...]
        <tr>
            <th>Commissione</th>
            <td>
                <table>
                    <tbody>
                        <tr>
                            <th align="left">Nome Commissario</th>
                            <th align="left">Qualifica</th>
                        </tr>
                        <tr>
                            <td align="left">SURNAME NAME</td>
                            <td align="left">Primo relatore</td>
                        </tr>
                        <tr>
                            <td align="left">SURNAME NAME</td>
                            <td align="left">Coordinatore Dott Ric</td>
                        </tr>
                    </tbody>
                </table>
            </td>
        </tr>
        <tr>
            <th>Parole chiave</th>
            <td>
                <ul>
                    <li>KEYWORD1
                    </li>
                    <li>KEYWORD2
                    </li>
                    <li>KEYWORD3
                    </li>
                    <li>KEYWORD4
                    </li>
                    <li>KEYWORD5
                    </li>
                </ul>
            </td>
        </tr>
        [...]
        <th>File</th>
        <td>
            <table border="2" cellpadding="3" cellspacing="3">
                <tbody>
                    [...]
                    <tr align="center">
                        <td> </td>
                        <td align="left"><a
                                href="/theses/available/etd-NNNNNNNN-NNNNNN/unrestricted/FILENAME.pdf"><b>FILENAME.pdf</b></a>
                        </td>
                        <td>5.93 Mb</td>
                        <td bgcolor="#cccccc">00:27:28</td>
                        <td>00:14:07</td>
                        <td bgcolor="#cccccc">00:12:21</td>
                        <td>00:06:10</td>
                        <td bgcolor="#cccccc">00:00:31</td>
                    </tr>
                    [...]
                </tbody>
            </table>
        </td>
    </tbody>
</table>
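To illustrate how the lookups in script [s5.03] operate on a table of this kind, the short sketch below (not part of the book's scripts) applies the same find("th", text=...) / find_next("td") pattern to a stripped-down, invented version of the metadata table:

# Sketch: apply the label-based extraction pattern of [s5.03] to a simplified, invented metadata table
from bs4 import BeautifulSoup

html = """
<table class="metadata_table">
    <tbody>
        <tr><th>Tipo di tesi</th><td>TYPE OF THESIS</td></tr>
        <tr><th>Autore</th><td>SURNAME, NAME</td></tr>
        <tr><th>Parole chiave</th><td><ul><li>KEYWORD1</li><li>KEYWORD2</li></ul></td></tr>
    </tbody>
</table>
"""
tbody = BeautifulSoup(html, "lxml").find("table").find("tbody")
# Extract the value stored in the <td> element tag adjacent to each label
tipotesi = tbody.find("th", text="Tipo di tesi").find_next("td").text.strip()
autore = tbody.find("th", text="Autore").find_next("td").text.strip()
# Collect the keywords from the <li> element tags of the 'Parole chiave' cell
keywords = [li.text.strip() for li in tbody.find("th", text="Parole chiave").find_next("td").find_all("li")]
print(tipotesi, autore, keywords)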