File size: 865 Bytes
257dcc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import csv
import requests
from bs4 import BeautifulSoup

# Load the list of page URLs, one per line. Lines starting with '#' are
# comments. Blank lines are dropped so an empty string is never treated as a
# URL, and surrounding whitespace is removed from every entry.
with open("./data/littlejsdocs.txt", encoding="utf-8") as file:
    stripped = (line.strip() for line in file)
    urls = [url for url in stripped if url and not url.startswith('#')]

def parse_webpage(url: str, timeout: float = 30.0):
    """Download a documentation page and extract its title and main text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A three-item list ``[title, text, url]``. ``title`` is the text of
        the page's ``<title>`` element (an empty string if there is none),
        and ``text`` is the text of the ``div.main-wrapper`` element with
        any ``<footer>`` inside it removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``div.main-wrapper`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag is not None else ""

    main = soup.find("div", class_="main-wrapper")
    if main is None:
        raise ValueError(f"no div.main-wrapper element found in {url}")

    # Not every page is guaranteed to carry a footer; only strip it if present.
    footer = main.find("footer")
    if footer is not None:
        footer.extract()

    return [title, main.get_text(), url]

# Fetch and parse every listed page. There is no error handling here, so an
# exception raised for any single page aborts the run before the CSV is written.
docs = [parse_webpage(url) for url in urls]

# Write as UTF-8 explicitly so non-ASCII page text does not depend on the
# platform's default encoding. newline='' lets the csv module control line
# endings, which matters because the text column contains embedded newlines.
with open('./data/littledocs.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    field = ["title", "text", "url"]

    writer.writerow(field)
    # Each entry in docs is already a [title, text, url] row.
    writer.writerows(docs)

print('DONE')