File size: 1,302 Bytes
5d924ac
 
18cd44b
5d924ac
 
 
 
 
 
 
18cd44b
 
5d924ac
 
 
 
 
 
 
 
 
18cd44b
 
5d924ac
 
 
 
74e7eda
 
 
 
 
 
 
 
5d924ac
74e7eda
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import logging
from typing import Optional

import trafilatura
from ddgs import DDGS


def retrieve_article(url: str) -> Optional[str]:
    """Download *url* and return the main text extracted from the page.

    This is a best-effort helper: it never raises. Any failure is logged
    and reported to the caller as ``None``.

    Args:
        url: Address of the article to download.

    Returns:
        The text produced by ``trafilatura.extract``, or ``None`` when the
        page could not be downloaded, nothing could be extracted, or an
        exception occurred along the way.
    """
    try:
        page = trafilatura.fetch_url(url)
        # fetch_url reports a failed download by returning None instead of
        # raising, so handle it here rather than handing None to extract().
        if page is None:
            logging.debug('No content downloaded from %s', url)
            logging.error('Error retrieving article')
            return None
        return trafilatura.extract(page)
    except Exception as e:
        # Deliberately broad: one bad article must not abort the caller.
        logging.debug(e.args)
        logging.error('Error retrieving article')
        return None
  

def get_articles(
    query, max_results=5, min_article_length=100
) -> tuple[list[str], list[str], list]:
    """Search DuckDuckGo news for *query* and download the matching articles.

    The search is restricted to the last day (``timelimit='d'``). Each
    result URL is fetched once via ``retrieve_article``; articles that
    cannot be retrieved or whose text is too short are dropped.

    Args:
        query: Search string passed to ``DDGS().news``.
        max_results: Maximum number of search results to request.
        min_article_length: An article is kept only if its extracted text
            is strictly longer than this many characters.

    Returns:
        Three parallel lists ``(texts, urls, images)``: entry *i* of each
        list belongs to the same article. ``images`` holds whatever the
        search result carried under ``'image'`` (``None`` if absent). All
        three lists are empty when the search fails or nothing qualifies.
    """
    # search DuckDuckGo for news on query
    search_results = []
    try:
        search_results = DDGS().news(query, timelimit='d', max_results=max_results) or []
    except Exception as e:
        # Best-effort: a failed search yields an empty result, not a crash.
        logging.debug(e.args)
        logging.error('Error searching for articles')

    # try to retrieve articles
    texts = []
    urls = []
    images = []
    seen_urls = set()
    for result in search_results:
        article_url = result.get('url')

        # skip results without a URL and URLs that were already processed,
        # so the same article is never downloaded or returned twice
        if not article_url or article_url in seen_urls:
            continue
        seen_urls.add(article_url)

        article_text = retrieve_article(article_url)

        # filter short and inaccessible articles
        if article_text and len(article_text) > min_article_length:
            texts.append(article_text)
            urls.append(article_url)
            # not every result is guaranteed to carry an image
            images.append(result.get('image'))

    return texts, urls, images