focal / app /scraper.py
michaelkri
Article photos
74e7eda
from ddgs import DDGS
import trafilatura
import logging
def retrieve_article(url: str) -> "str | None":
    """Download the page at *url* and return its extracted main text.

    Retrieval is best-effort: any failure is logged and reported to the
    caller as ``None`` instead of being raised.

    Args:
        url: Address of the article to fetch.

    Returns:
        The text produced by ``trafilatura.extract``, or ``None`` when the
        page could not be downloaded, extraction yielded nothing, or an
        exception occurred along the way.
    """
    try:
        page = trafilatura.fetch_url(url)
        # A failed download yields no page; stop here rather than handing
        # None to the extractor and relying on how it treats that input.
        if page is None:
            logging.error('Error retrieving article')
            return None
        return trafilatura.extract(page)
    except Exception as e:
        # Deliberately broad: one bad article must never abort the caller's
        # loop over search results.
        logging.debug(e.args)
        logging.error('Error retrieving article')
        return None
def get_articles(query, max_results=5, min_article_length=100) -> tuple[list[str], list[str], list[str]]:
    """Search DuckDuckGo news for *query* and fetch the matching articles.

    Args:
        query: Search phrase passed to ``DDGS().news``.
        max_results: Upper bound on the number of search results requested.
        min_article_length: Articles whose extracted text is not strictly
            longer than this many characters are dropped.

    Returns:
        A ``(texts, urls, images)`` tuple of three parallel lists: the
        extracted article text, the article URL, and the image value from
        the search result (an empty string when the result has none).
        All three lists are empty when the search fails or no article
        passes the filter.
    """
    # search DuckDuckGo for news on query
    # NOTE(review): timelimit='d' presumably restricts results to the past
    # day -- confirm against the DDGS documentation.
    try:
        search_results = DDGS().news(query, timelimit='d', max_results=max_results) or []
    except Exception as e:
        # Best-effort: a failed search is logged and treated as "no results".
        logging.debug(e.args)
        logging.error('Error searching for articles')
        search_results = []

    # try to retrieve articles
    texts = []
    urls = []
    images = []
    for result in search_results:
        # assumes each result is a dict -- a result without a URL cannot be
        # fetched, so skip it instead of raising KeyError for the whole call
        article_url = result.get('url')
        if not article_url:
            continue

        article_text = retrieve_article(article_url)

        # filter short and inaccessible articles
        if article_text and len(article_text) > min_article_length:
            texts.append(article_text)
            urls.append(article_url)
            images.append(result.get('image', ''))

    return texts, urls, images