"""Search DuckDuckGo news and download the text of the matching articles."""

import logging
from typing import Optional

import trafilatura
from ddgs import DDGS


def retrieve_article(url: str) -> Optional[str]:
    """Download *url* and return its extracted main text.

    Args:
        url: Address of the article to fetch.

    Returns:
        The text produced by ``trafilatura.extract``, or ``None`` when the
        page could not be downloaded, nothing could be extracted, or any
        exception was raised along the way (the exception is logged, not
        propagated).
    """
    try:
        page = trafilatura.fetch_url(url)
        if page is None:
            # Nothing was downloaded, so there is nothing to extract.
            logging.debug('No content downloaded from %s', url)
            return None
        return trafilatura.extract(page)
    except Exception as e:
        # Best-effort: a single failing article must not abort the caller.
        logging.debug(e.args)
        logging.error('Error retrieving article')
        return None


def get_articles(
    query: str,
    max_results: int = 5,
    min_article_length: int = 100,
) -> tuple[list[str], list[str], list[Optional[str]]]:
    """Search DuckDuckGo news for *query* and fetch the resulting articles.

    Args:
        query: Search terms passed to ``DDGS().news``.
        max_results: Maximum number of search results to request.
        min_article_length: Articles whose extracted text is not strictly
            longer than this many characters are discarded.

    Returns:
        Three parallel lists ``(texts, urls, images)``: the extracted text,
        the article URL and the search result's image value for every
        article that was retrieved and passed the length filter. All three
        lists are empty when the search fails or nothing qualifies.
    """
    # search DuckDuckGo for news on query
    search_results = []
    try:
        # NOTE(review): timelimit='d' presumably restricts results to the
        # last day -- confirm against the ddgs documentation.
        search_results = DDGS().news(
            query, timelimit='d', max_results=max_results
        )
    except Exception as e:
        # Best-effort: a failed search yields empty result lists.
        logging.debug(e.args)
        logging.error('Error searching for articles')

    # try to retrieve articles
    texts: list[str] = []
    urls: list[str] = []
    images: list[Optional[str]] = []
    for result in search_results:
        article_url = result['url']
        # Tolerate results without an image instead of raising KeyError and
        # losing every article collected so far.
        article_image = result.get('image')
        article_text = retrieve_article(article_url)

        # filter short and inaccessible articles
        if article_text and len(article_text) > min_article_length:
            texts.append(article_text)
            urls.append(article_url)
            images.append(article_image)

    return texts, urls, images