Spaces:
Running
Running
| from ddgs import DDGS | |
| import trafilatura | |
| import logging | |
def retrieve_article(url: str) -> "str | None":
    """Download ``url`` and return the text trafilatura extracts from it.

    This is a best-effort helper: failures are logged and reported to the
    caller as ``None`` rather than raised.

    Args:
        url: Address of the article page to download.

    Returns:
        The result of ``trafilatura.extract`` for the downloaded page, or
        ``None`` when the download failed or an exception was raised.
    """
    # The return annotation is quoted so the ``|`` union is never evaluated
    # at definition time on interpreters older than Python 3.10.
    try:
        page = trafilatura.fetch_url(url)
        if page is None:
            # fetch_url reports a failed download as None; there is nothing
            # to extract, so report the failure explicitly.
            logging.error('Error retrieving article')
            return None
        return trafilatura.extract(page)
    except Exception as e:
        logging.debug(e.args)
        logging.error('Error retrieving article')
        return None
def get_articles(query, max_results=5, min_article_length=100) -> "tuple[list[str], list[str], list]":
    """Search DuckDuckGo news for ``query`` and download the articles found.

    The search is issued with ``timelimit='d'`` and at most ``max_results``
    hits. Each hit is downloaded with ``retrieve_article``; hits whose text
    is missing or not longer than ``min_article_length`` are dropped.

    Args:
        query: Search string passed to ``DDGS().news``.
        max_results: Maximum number of search hits to request.
        min_article_length: An article is kept only if its extracted text
            has more characters than this.

    Returns:
        A ``(texts, urls, images)`` tuple of three parallel lists: the
        extracted article text, the article URL, and the ``'image'`` value
        of the search hit (``None`` when the hit has no such key). All three
        lists are empty when the search fails or nothing passes the filter.
    """
    # search DuckDuckGo for news on query
    search_results = []
    try:
        # ``or []`` keeps the loop below safe if the search yields None.
        search_results = DDGS().news(query, timelimit='d', max_results=max_results) or []
    except Exception as e:
        logging.debug(e.args)
        logging.error('Error searching for articles')

    # try to retrieve articles
    texts = []
    urls = []
    images = []
    for result in search_results:
        article_url = result.get('url')
        if not article_url:
            # Without a URL there is nothing to download; skip this hit
            # instead of aborting the whole batch with a KeyError.
            continue
        article_text = retrieve_article(article_url)
        # filter short and inaccessible articles
        if article_text and len(article_text) > min_article_length:
            texts.append(article_text)
            urls.append(article_url)
            # A missing image must not discard an otherwise usable article.
            images.append(result.get('image'))
    return texts, urls, images