# BrandScanAI — web_scraper.py
# Author: Arun21102003
# Deployment preparation (removed binary files) — commit 90fe073
import trafilatura
import requests
from typing import Optional
import time
import streamlit as st
def get_website_text_content(url: str) -> str:
    """
    Fetch *url* and return the main text content of the page.

    Extraction is delegated to trafilatura; the raw result is not meant to
    be shown directly to the user — summarize it with an LLM first.
    Returns an empty string when the page cannot be fetched or yields no
    extractable text.
    """
    try:
        page = trafilatura.fetch_url(url)
        if not page:
            # Download failed (network error, non-HTML resource, ...).
            return ""
        extracted = trafilatura.extract(page)
        return extracted or ""
    except Exception as e:
        # Best-effort scraper: surface the failure in the UI, never crash.
        st.warning(f"Failed to scrape {url}: {str(e)}")
        return ""
def scrape_article_content(url: str) -> dict:
    """
    Scrape an article page and return its headline and main content.

    Parameters:
        url: Address of the article to fetch.

    Returns:
        dict with keys 'url', 'title', 'content'. This function never
        raises: on failure, 'title' and 'content' carry a human-readable
        error description instead.
    """
    try:
        # Small delay to be respectful to servers.
        time.sleep(0.5)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Reuse the HTML we already downloaded instead of fetching the URL a
        # second time via trafilatura.fetch_url (the original implementation
        # hit the server twice per article). trafilatura.extract and
        # extract_metadata both accept a raw HTML string.
        downloaded = response.text
        if downloaded:
            # Extract main content.
            content = trafilatura.extract(downloaded)
            # Extract metadata including the title.
            metadata = trafilatura.extract_metadata(downloaded)
            title = ""
            if metadata and hasattr(metadata, 'title') and metadata.title:
                title = metadata.title
            else:
                # Fallback: pull the <title> tag out of the HTML directly.
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(downloaded, 'html.parser')
                title_tag = soup.find('title')
                if title_tag:
                    title = title_tag.get_text().strip()
            return {
                'url': url,
                'title': title or "No title found",
                'content': content or "No content extracted"
            }
        else:
            # Server answered 2xx but with an empty body.
            return {
                'url': url,
                'title': "Failed to download",
                'content': "Could not retrieve content"
            }
    except requests.RequestException as e:
        # Connection problems, timeouts, HTTP error statuses.
        return {
            'url': url,
            'title': "Network error",
            'content': f"Failed to fetch due to network error: {str(e)}"
        }
    except Exception as e:
        # Anything else (parser failures, unexpected library errors).
        return {
            'url': url,
            'title': "Scraping error",
            'content': f"Failed to scrape content: {str(e)}"
        }