import trafilatura
import requests
from typing import Optional
import time
import streamlit as st


def get_website_text_content(url: str) -> str:
    """Return the main text content of a web page.

    The page is downloaded and its primary article text extracted with
    trafilatura. The raw result is best summarized by an LLM before being
    shown to an end user.

    Args:
        url: The page URL to fetch.

    Returns:
        The extracted text, or an empty string if the download or
        extraction fails (a Streamlit warning is shown on error).
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            # trafilatura.extract may return None for pages with no
            # recognizable main content — normalize that to "".
            text = trafilatura.extract(downloaded)
            return text if text else ""
        return ""
    except Exception as e:
        # Best-effort scrape: surface the failure in the UI but keep going.
        st.warning(f"Failed to scrape {url}: {str(e)}")
        return ""


def scrape_article_content(url: str) -> dict:
    """Scrape an article's headline and main content.

    Args:
        url: The article URL to fetch.

    Returns:
        A dict with keys 'url', 'title', and 'content'. On failure the
        'title' and 'content' fields carry a human-readable error
        description instead of raising.
    """
    try:
        # Small delay to be respectful to servers.
        time.sleep(0.5)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Reuse the HTML we already downloaded instead of fetching the URL
        # a second time via trafilatura.fetch_url (the original fetched
        # every page twice and discarded the requests response body).
        downloaded = response.text

        if downloaded:
            # Extract main article content.
            content = trafilatura.extract(downloaded)

            # Extract metadata (including the title) from the same HTML.
            metadata = trafilatura.extract_metadata(downloaded)
            title = ""
            if metadata and hasattr(metadata, 'title') and metadata.title:
                title = metadata.title
            else:
                # Fallback: pull the <title> tag out of the raw HTML.
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(downloaded, 'html.parser')
                title_tag = soup.find('title')
                if title_tag:
                    title = title_tag.get_text().strip()

            return {
                'url': url,
                'title': title or "No title found",
                'content': content or "No content extracted"
            }
        else:
            return {
                'url': url,
                'title': "Failed to download",
                'content': "Could not retrieve content"
            }

    except requests.RequestException as e:
        return {
            'url': url,
            'title': "Network error",
            'content': f"Failed to fetch due to network error: {str(e)}"
        }
    except Exception as e:
        return {
            'url': url,
            'title': "Scraping error",
            'content': f"Failed to scrape content: {str(e)}"
        }