Spaces:
Build error
Build error
File size: 865 Bytes
257dcc1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
import csv
import requests
from bs4 import BeautifulSoup
# Load the list of page URLs to scrape: one URL per line, with '#' marking
# comment lines.
with open("./data/littlejsdocs.txt", encoding="utf-8") as file:
    # Strip surrounding whitespace up front so indented comments and lines
    # with trailing spaces are classified correctly below.
    stripped_lines = [line.strip() for line in file]

# Drop blank lines (e.g. a trailing empty line) and comment lines; a blank
# entry would otherwise be handed to requests.get() as an empty URL.
urls = [line for line in stripped_lines if line and not line.startswith('#')]
def parse_webpage(url: str):
    """Download a page and extract its title and main text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A three-item list ``[title, text, url]``. ``title`` is the text of
        the page's ``<title>`` element (an empty string when the page has
        none) and ``text`` is the text of the ``div.main-wrapper`` element
        with its ``<footer>``, if any, removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the page has no ``div.main-wrapper`` element.
    """
    # A timeout keeps one unresponsive host from hanging the whole run.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx rather than scraping an error page as content.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag is not None else ""

    main = soup.find("div", class_="main-wrapper")
    if main is None:
        raise ValueError(f"no div.main-wrapper element found in {url}")

    # The footer is page chrome, not content; remove it when present.
    footer = main.find("footer")
    if footer is not None:
        footer.extract()

    return [title, main.get_text(), url]
# Scrape every listed page; each entry is a [title, text, url] row.
docs = [parse_webpage(url) for url in urls]

# newline='' leaves line-ending handling to the csv module. The explicit
# UTF-8 encoding keeps non-ASCII page text writable regardless of the
# platform's default encoding.
with open('./data/littledocs.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["title", "text", "url"])  # header row
    writer.writerows(docs)

print('DONE')
|