import json
import logging
from typing import Optional

import requests
import trafilatura

# Module-level logger (preferred over the root logger so callers can
# configure/filter this module's output independently).
logger = logging.getLogger(__name__)

# Single shared User-Agent header — the original duplicated this dict in two
# functions; keeping one constant avoids the copies drifting apart.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Extracted text shorter than this is treated as a failed extraction.
_MIN_CONTENT_LENGTH = 50


def get_website_text_content(url: str) -> Optional[str]:
    """
    Extract clean text content from a website URL using trafilatura.

    Args:
        url: The website URL to scrape.

    Returns:
        Clean, stripped text content, or None if the download fails, the
        extraction fails, or the result is too short to be useful.
    """
    try:
        # Download the webpage
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            logger.warning("Failed to download content from %s", url)
            return None

        # Extract text content
        text = trafilatura.extract(downloaded)
        if not text:
            logger.warning("Failed to extract text from %s", url)
            return None

        # Clean and validate content
        text = text.strip()
        if len(text) < _MIN_CONTENT_LENGTH:  # Too short to be useful
            logger.warning("Extracted content too short from %s", url)
            return None

        return text
    except Exception as e:
        # Broad catch is deliberate: this is a best-effort scraper that must
        # never propagate network/library errors to callers.
        logger.error("Error extracting content from %s: %s", url, e)
        return None


def extract_structured_data(url: str) -> dict:
    """
    Extract structured data from a webpage including metadata.

    Args:
        url: The website URL to analyze.

    Returns:
        Dictionary containing structured data parsed from trafilatura's JSON
        output, or a dict with an 'error' key describing the failure.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return {'error': 'Failed to download content'}

        # Extract with metadata; JSON output lets us return a plain dict.
        result = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=True,
            include_formatting=True,
            output_format='json',
        )
        if not result:
            return {'error': 'Failed to extract structured data'}

        return json.loads(result)
    except Exception as e:
        logger.error("Error extracting structured data from %s: %s", url, e)
        return {'error': str(e)}


def get_website_metadata(url: str) -> dict:
    """
    Extract metadata from a website including title, description, etc.

    Args:
        url: The website URL to analyze.

    Returns:
        Dictionary with 'title', 'description', 'author', 'date', 'url' and
        'sitename' keys (with human-readable fallbacks for missing values),
        or a dict with an 'error' key describing the failure.
    """
    try:
        response = requests.get(url, timeout=10, headers=_HEADERS)
        if response.status_code != 200:
            return {'error': f'HTTP {response.status_code}'}

        # Use trafilatura to extract metadata
        metadata = trafilatura.extract_metadata(response.text)
        if metadata is None:
            return {
                'title': 'No title found',
                'description': 'No description found',
                'author': 'Unknown author',
                'date': 'No date found',
                'url': url,
                'sitename': 'Unknown site',
            }

        # trafilatura leaves individual attributes as None when absent; apply
        # the same fallbacks per field (the original only handled the case
        # where the whole metadata object was missing).
        return {
            'title': metadata.title or 'No title found',
            'description': metadata.description or 'No description found',
            'author': metadata.author or 'Unknown author',
            'date': metadata.date or 'No date found',
            'url': metadata.url or url,
            'sitename': metadata.sitename or 'Unknown site',
        }
    except Exception as e:
        logger.error("Error extracting metadata from %s: %s", url, e)
        return {'error': str(e)}


def validate_url_accessibility(url: str) -> bool:
    """
    Check if a URL is accessible for scraping.

    Args:
        url: The URL to validate.

    Returns:
        True if a HEAD request (following redirects) returns HTTP 200,
        False otherwise.
    """
    try:
        # allow_redirects=True: requests.head() does NOT follow redirects by
        # default, which made every redirecting URL look inaccessible.
        response = requests.head(
            url, timeout=5, headers=_HEADERS, allow_redirects=True
        )
        return response.status_code == 200
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures mean
        # "not accessible"; anything else (e.g. KeyboardInterrupt) propagates.
        return False