# BrandScanAI — web_scraper.py
# Author: Arun21102003
# Deployment preparation (removed binary files) — commit 90fe073
import trafilatura
import requests
from typing import Optional
import time
import streamlit as st
def get_website_text_content(url: str) -> str:
    """
    Fetch *url* and return the main text content of the page.

    Extraction is delegated to trafilatura; the raw result is not meant to
    be shown directly to the user — summarize it with an LLM first.
    Returns an empty string when the page cannot be fetched or yields no
    extractable text.
    """
    try:
        page = trafilatura.fetch_url(url)
        if not page:
            # Download failed (network error, non-HTML resource, ...).
            return ""
        extracted = trafilatura.extract(page)
        return extracted or ""
    except Exception as e:
        # Best-effort scraper: surface the failure in the UI, never crash.
        st.warning(f"Failed to scrape {url}: {str(e)}")
        return ""
def scrape_article_content(url: str) -> dict:
    """
    Scrape an article page and return its headline and main content.

    Parameters:
        url: Address of the article to fetch.

    Returns:
        dict with keys 'url', 'title', 'content'. This function never
        raises: on failure, 'title' and 'content' carry a human-readable
        error description instead.
    """
    try:
        # Small delay to be respectful to servers.
        time.sleep(0.5)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Reuse the HTML we already downloaded instead of fetching the URL a
        # second time via trafilatura.fetch_url (the original implementation
        # hit the server twice per article). trafilatura.extract and
        # extract_metadata both accept a raw HTML string.
        downloaded = response.text
        if downloaded:
            # Extract main content.
            content = trafilatura.extract(downloaded)
            # Extract metadata including the title.
            metadata = trafilatura.extract_metadata(downloaded)
            title = ""
            if metadata and hasattr(metadata, 'title') and metadata.title:
                title = metadata.title
            else:
                # Fallback: pull the <title> tag out of the HTML directly.
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(downloaded, 'html.parser')
                title_tag = soup.find('title')
                if title_tag:
                    title = title_tag.get_text().strip()
            return {
                'url': url,
                'title': title or "No title found",
                'content': content or "No content extracted"
            }
        else:
            # Server answered 2xx but with an empty body.
            return {
                'url': url,
                'title': "Failed to download",
                'content': "Could not retrieve content"
            }
    except requests.RequestException as e:
        # Connection problems, timeouts, HTTP error statuses.
        return {
            'url': url,
            'title': "Network error",
            'content': f"Failed to fetch due to network error: {str(e)}"
        }
    except Exception as e:
        # Anything else (parser failures, unexpected library errors).
        return {
            'url': url,
            'title': "Scraping error",
            'content': f"Failed to scrape content: {str(e)}"
        }