Spaces:
Sleeping
Sleeping
| import trafilatura | |
| import requests | |
| from typing import Optional | |
| import time | |
| import streamlit as st | |
def get_website_text_content(url: str) -> str:
    """
    Fetch a web page and return its main text content.

    The text is extracted with trafilatura; the raw result is not meant to
    be shown directly to users and is best summarized by an LLM first.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or an empty string when the page cannot be
        fetched or no text could be extracted.
    """
    try:
        # Download the page; fetch_url returns None on failure.
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return ""
        extracted = trafilatura.extract(downloaded)
        return extracted or ""
    except Exception as exc:
        # Best-effort scrape: surface the failure in the UI, degrade to "".
        st.warning(f"Failed to scrape {url}: {str(exc)}")
        return ""
def scrape_article_content(url: str) -> dict:
    """
    Scrape an article's headline and main content.

    Args:
        url: Address of the article to scrape.

    Returns:
        dict with keys 'url', 'title', and 'content'. This function never
        raises: on failure the 'title'/'content' fields carry a short
        human-readable error note instead.
    """
    try:
        # Small delay to be respectful to the target server.
        time.sleep(0.5)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Fetch the page exactly once. The original implementation downloaded
        # the page twice (requests.get, result discarded, then
        # trafilatura.fetch_url) — wasteful and the two fetches could even see
        # different content. raise_for_status keeps HTTP errors routed to the
        # RequestException handler below.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        downloaded = response.text

        if downloaded:
            # Extract main content and metadata from the already-downloaded HTML.
            content = trafilatura.extract(downloaded)
            metadata = trafilatura.extract_metadata(downloaded)

            title = ""
            if metadata and hasattr(metadata, 'title') and metadata.title:
                title = metadata.title
            else:
                # Fallback: pull the <title> tag straight from the HTML.
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(downloaded, 'html.parser')
                title_tag = soup.find('title')
                if title_tag:
                    title = title_tag.get_text().strip()

            return {
                'url': url,
                'title': title or "No title found",
                'content': content or "No content extracted"
            }
        return {
            'url': url,
            'title': "Failed to download",
            'content': "Could not retrieve content"
        }
    except requests.RequestException as e:
        return {
            'url': url,
            'title': "Network error",
            'content': f"Failed to fetch due to network error: {str(e)}"
        }
    except Exception as e:
        return {
            'url': url,
            'title': "Scraping error",
            'content': f"Failed to scrape content: {str(e)}"
        }