Spaces:
No application file
No application file
| """ | |
| Install the packages as follows: | |
| $ poetry install | |
| """ | |
| import time | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from trafilatura.sitemaps import sitemap_search | |
| from trafilatura import fetch_url, extract, extract_metadata | |
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Look up the sitemap of ``resource_url`` with Trafilatura and return
    whatever ``sitemap_search`` finds for it.
    """
    return sitemap_search(resource_url)
def extract_article(url: str) -> tuple:
    """
    Download a page and extract its article text and metadata with Trafilatura.

    Args:
        url: Address of the page to download.

    Returns:
        A 2-tuple ``(article, metadata)``: the result of ``extract`` (called
        with ``favor_precision=True`` and ``only_with_metadata=True``) and the
        result of ``extract_metadata``, both computed from the same download.
        NOTE(review): either element is presumably ``None`` when the download
        or the extraction fails -- confirm against the Trafilatura docs.
    """
    downloaded = fetch_url(url)
    article = extract(downloaded, favor_precision=True, only_with_metadata=True)
    metadata = extract_metadata(downloaded)
    return article, metadata
def create_dataset(list_of_websites: list) -> pd.DataFrame:
    """
    Build a pandas DataFrame of URLs and their extracted articles.

    Every website's sitemap is resolved to a list of URLs; each URL is
    downloaded and parsed, and one row per successfully parsed page is
    collected.

    Args:
        list_of_websites: Base URLs of the websites to crawl.

    Returns:
        A DataFrame with the columns ``url``, ``article``, ``title``,
        ``description``, ``author`` and ``date``. Duplicate rows are removed,
        and so is every row that has a missing value in any column.
    """
    columns = ["url", "article", "title", "description", "author", "date"]
    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            article, metadata = extract_article(url)
            # Throttle before any `continue` so skipped pages still count
            # towards the delay between requests.
            time.sleep(0.5)
            # A page without metadata would raise AttributeError below, and a
            # page without article text would be removed by dropna() anyway.
            if article is None or metadata is None:
                continue
            data.append(
                {
                    "url": url,
                    "article": article,
                    "title": metadata.title,
                    "description": metadata.description,
                    "author": metadata.author,
                    "date": metadata.date,
                }
            )
    # Explicit columns keep the schema stable even when nothing was collected.
    df = pd.DataFrame(data, columns=columns)
    df = df.drop_duplicates()
    df = df.dropna()
    return df
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    from pathlib import Path

    list_of_websites = [
        "https://www.diariodiunanalista.it/",
    ]
    df = create_dataset(list_of_websites)

    # to_csv does not create missing directories, so make sure ./data exists.
    output_path = Path("./data/articles.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)