Spaces:

SmartHeal
/

NewsLetter

Sleeping

App Files Files Community

NewsLetter / utils /web_scraper.py

SmartHeal

Upload 19 files

a19173c verified 6 months ago

raw

history blame contribute delete

3.83 kB

	import trafilatura
	import requests
	import logging
	from typing import Optional

	def get_website_text_content(url: str, *, min_length: int = 50) -> Optional[str]:
	    """
	    Extract clean text content from a website URL using trafilatura.

	    The page is downloaded with ``trafilatura.fetch_url`` and its text is
	    pulled out with ``trafilatura.extract``. The result is stripped of
	    leading/trailing whitespace before being validated and returned.

	    Args:
	        url: The website URL to scrape.
	        min_length: Minimum number of characters (after stripping) the
	            extracted text must have to be considered useful. Keyword-only;
	            defaults to 50, the threshold this function has always used.

	    Returns:
	        The stripped text content, or None if the download fails, nothing
	        can be extracted, the text is shorter than ``min_length``, or any
	        exception is raised along the way (the failure is logged).
	    """
	    try:
	        # Download the webpage
	        downloaded = trafilatura.fetch_url(url)
	        if not downloaded:
	            logging.warning(f"Failed to download content from {url}")
	            return None

	        # Extract text content
	        text = trafilatura.extract(downloaded)
	        if not text:
	            logging.warning(f"Failed to extract text from {url}")
	            return None

	        # Strip once and reuse the result for both validation and return.
	        cleaned = text.strip()
	        if len(cleaned) < min_length:  # Too short to be useful
	            logging.warning(f"Extracted content too short from {url}")
	            return None

	        return cleaned

	    except Exception as e:
	        # Best-effort scraper: report the failure and signal it with None
	        # instead of propagating to the caller.
	        logging.error(f"Error extracting content from {url}: {e}")
	        return None

	def extract_structured_data(url: str) -> dict:
	    """
	    Fetch a webpage and return trafilatura's JSON extraction as a dict.

	    Args:
	        url: The website URL to analyze

	    Returns:
	        The parsed JSON document produced by ``trafilatura.extract`` on
	        success. On failure, a dict with a single 'error' key describing
	        what went wrong (download failure, empty extraction, or the text
	        of a raised exception).
	    """
	    # Options forwarded to trafilatura.extract: skip comments, keep tables
	    # and formatting, and ask for a JSON string as output.
	    extraction_options = {
	        'include_comments': False,
	        'include_tables': True,
	        'include_formatting': True,
	        'output_format': 'json',
	    }

	    try:
	        page = trafilatura.fetch_url(url)
	        if not page:
	            return {'error': 'Failed to download content'}

	        payload = trafilatura.extract(page, **extraction_options)
	        if not payload:
	            return {'error': 'Failed to extract structured data'}

	        # The JSON output is a string; decode it into a Python dict.
	        import json
	        return json.loads(payload)

	    except Exception as exc:
	        logging.error(f"Error extracting structured data from {url}: {exc}")
	        return {'error': str(exc)}

	def get_website_metadata(url: str) -> dict:
	    """
	    Extract metadata from a website including title, description, etc.

	    The page is fetched with ``requests.get`` (10 second timeout, browser-like
	    User-Agent) and its HTML is handed to ``trafilatura.extract_metadata``.

	    Args:
	        url: The website URL to analyze

	    Returns:
	        On success, a dict with the keys 'title', 'description', 'author',
	        'date', 'url' and 'sitename'. Any field that could not be determined
	        is replaced by its placeholder (for 'url', the requested URL). This
	        applies both when no metadata object is produced at all and when an
	        individual field on it is empty.
	        On failure, a dict with a single 'error' key: 'HTTP <status>' for a
	        non-200 response, or the text of the raised exception.
	    """
	    # Result key -> placeholder used when that field is missing. Each key is
	    # also the attribute name read from the metadata object.
	    fallbacks = {
	        'title': 'No title found',
	        'description': 'No description found',
	        'author': 'Unknown author',
	        'date': 'No date found',
	        'url': url,
	        'sitename': 'Unknown site',
	    }

	    try:
	        response = requests.get(url, timeout=10, headers={
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
	        })

	        if response.status_code != 200:
	            return {'error': f'HTTP {response.status_code}'}

	        # Use trafilatura to extract metadata
	        metadata = trafilatura.extract_metadata(response.text)

	        # Fall back per field, not only when the whole object is missing:
	        # otherwise a present-but-empty field would leak None to the caller
	        # instead of the placeholder. getattr on None yields the default, so
	        # the "no metadata at all" case is covered by the same expression.
	        return {
	            field: getattr(metadata, field, None) or placeholder
	            for field, placeholder in fallbacks.items()
	        }

	    except Exception as e:
	        # Best-effort: log and report the failure in the returned dict.
	        logging.error(f"Error extracting metadata from {url}: {e}")
	        return {'error': str(e)}

	def validate_url_accessibility(url: str) -> bool:
	    """
	    Check if a URL is accessible for scraping.

	    Sends a HEAD request (5 second timeout, browser-like User-Agent),
	    follows redirects, and checks the final response status.

	    Args:
	        url: The URL to validate

	    Returns:
	        True if the final response status is 200, False otherwise
	        (including when the request raises an exception).
	    """
	    try:
	        response = requests.head(
	            url,
	            timeout=5,
	            # requests does not follow redirects for HEAD by default, so a
	            # URL that merely redirects (e.g. http -> https) would answer
	            # 301/302 and be reported as inaccessible.
	            allow_redirects=True,
	            headers={
	                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
	            },
	        )
	    except Exception as e:
	        # Catch Exception rather than using a bare except so that
	        # KeyboardInterrupt / SystemExit still propagate.
	        logging.debug("URL accessibility check failed for %s: %s", url, e)
	        return False

	    return response.status_code == 200