File size: 2,925 Bytes
90fe073
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import trafilatura
import requests
from typing import Optional
import time
import streamlit as st


def get_website_text_content(url: str) -> str:
    """
    Download ``url`` and return the main text that trafilatura extracts.

    The extracted text is raw page content; it is usually better to have an
    LLM summarize it before it is shown to the user.

    Returns an empty string when the download yields nothing, when no text
    can be extracted, or when any exception is raised (in which case a
    Streamlit warning is also displayed).
    """
    try:
        page = trafilatura.fetch_url(url)
        # Nothing was downloaded, so there is nothing to extract.
        if not page:
            return ""
        extracted = trafilatura.extract(page)
    except Exception as e:
        st.warning(f"Failed to scrape {url}: {str(e)}")
        return ""
    # Normalize a missing/empty extraction result to an empty string.
    return extracted or ""


def scrape_article_content(url: str) -> dict:
    """
    Scrape an article's headline and main content from ``url``.

    The page is downloaded exactly once with ``requests`` (browser-like
    User-Agent, 10 second timeout, HTTP error statuses rejected) and the
    downloaded bytes are handed to trafilatura for content and metadata
    extraction. If trafilatura finds no title, the HTML ``<title>`` tag is
    used as a fallback.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dictionary with the keys ``'url'``, ``'title'`` and ``'content'``.
        This function does not raise: on failure, ``'title'`` and
        ``'content'`` hold a short description of what went wrong
        ("Failed to download", "Network error" or "Scraping error").
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Add a small delay to be respectful to servers
        time.sleep(0.5)

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Reuse the body that was just downloaded instead of fetching the
        # page a second time. Raw bytes are passed on so that the HTML
        # parsers do their own charset detection rather than relying on the
        # encoding guessed from the HTTP headers.
        html = response.content
        if not html:
            return {
                'url': url,
                'title': "Failed to download",
                'content': "Could not retrieve content"
            }

        # Extract main content and metadata (including the title)
        content = trafilatura.extract(html)
        metadata = trafilatura.extract_metadata(html)

        title = getattr(metadata, 'title', None) or ""
        if not title:
            # Fallback: try to extract title from HTML
            from bs4 import BeautifulSoup
            title_tag = BeautifulSoup(html, 'html.parser').find('title')
            if title_tag:
                title = title_tag.get_text().strip()

        return {
            'url': url,
            'title': title or "No title found",
            'content': content or "No content extracted"
        }

    except requests.RequestException as e:
        # Connection problems, timeouts and HTTP error statuses end up here.
        return {
            'url': url,
            'title': "Network error",
            'content': f"Failed to fetch due to network error: {str(e)}"
        }
    except Exception as e:
        # Anything raised while parsing/extracting is reported, not raised.
        return {
            'url': url,
            'title': "Scraping error",
            'content': f"Failed to scrape content: {str(e)}"
        }