Spaces:
Running
Running
File size: 1,302 Bytes
5d924ac 18cd44b 5d924ac 18cd44b 5d924ac 18cd44b 5d924ac 74e7eda 5d924ac 74e7eda |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import logging
from typing import Optional

import trafilatura
from ddgs import DDGS
def retrieve_article(url: str) -> Optional[str]:
    """Download a web page and return its extracted text.

    Args:
        url: Address of the article to fetch.

    Returns:
        The text produced by ``trafilatura.extract``, or ``None`` when the
        URL is empty, the download returns nothing, or any exception is
        raised while fetching or extracting.
    """
    if not url:
        # Nothing to fetch; avoid a pointless network call.
        return None
    try:
        page = trafilatura.fetch_url(url)
        if page is None:
            # A failed download yields None; there is nothing to extract.
            return None
        return trafilatura.extract(page)
    except Exception as e:
        # Deliberately best-effort: one bad article must not break the
        # caller, so the failure is logged and reported as None.
        logging.debug(e.args)
        logging.error('Error retrieving article')
        return None
def get_articles(query, max_results=5, min_article_length=100) -> tuple[list[str], list[str], list[str]]:
    """Search DuckDuckGo news for ``query`` and fetch the matching articles.

    Args:
        query: Search terms passed to ``DDGS().news``.
        max_results: Maximum number of search results to request.
        min_article_length: An article is kept only if its extracted text is
            strictly longer than this many characters.

    Returns:
        Three parallel lists ``(texts, urls, images)``: the extracted article
        text, the article URL, and the ``image`` value of the search result
        (``None`` when the result has no such key). All three are empty when
        the query is empty, the search fails, or no article passes the
        length filter.
    """
    if not query:
        # No search terms: skip the request, which could only fail.
        return [], [], []

    # search DuckDuckGo for news on query
    search_results = []
    try:
        search_results = DDGS().news(query, timelimit='d', max_results=max_results)
    except Exception as e:
        # Best-effort: a failed search is logged and treated as "no results".
        logging.debug(e.args)
        logging.error('Error searching for articles')

    # try to retrieve articles
    texts = []
    urls = []
    images = []
    for result in search_results or []:
        article_url = result.get('url')
        if not article_url:
            # A result without a URL cannot be fetched; skip it instead of
            # letting a KeyError abort the whole batch.
            continue
        article_text = retrieve_article(article_url)
        # filter short and inaccessible articles
        if article_text and len(article_text) > min_article_length:
            texts.append(article_text)
            urls.append(article_url)
            images.append(result.get('image'))
    return texts, urls, images