Spaces:
Sleeping
Sleeping
import json
import logging
from typing import Optional

import requests
import trafilatura
def get_website_text_content(url: str) -> Optional[str]:
    """
    Extract clean text content from a website URL using trafilatura.

    Args:
        url: The website URL to scrape

    Returns:
        Clean text content, or None when the download fails, extraction
        yields nothing, or the result is too short to be useful.
    """
    try:
        page_html = trafilatura.fetch_url(url)
        if page_html:
            extracted = trafilatura.extract(page_html)
            if extracted:
                cleaned = extracted.strip()
                # Anything under 50 characters is unlikely to be real content.
                if len(cleaned) >= 50:
                    return cleaned
                logging.warning(f"Extracted content too short from {url}")
            else:
                logging.warning(f"Failed to extract text from {url}")
        else:
            logging.warning(f"Failed to download content from {url}")
        return None
    except Exception as e:
        logging.error(f"Error extracting content from {url}: {e}")
        return None
def extract_structured_data(url: str) -> dict:
    """
    Extract structured data from a webpage including metadata.

    Args:
        url: The website URL to analyze

    Returns:
        Dictionary parsed from trafilatura's JSON output, or a dict with an
        'error' key describing the failure.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return {'error': 'Failed to download content'}

        # Ask trafilatura for JSON so we get text plus metadata in one pass.
        result = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=True,
            include_formatting=True,
            output_format='json'
        )
        if result:
            # json is imported at module level (moved out of the function body).
            return json.loads(result)
        return {'error': 'Failed to extract structured data'}
    except Exception as e:
        logging.error(f"Error extracting structured data from {url}: {e}")
        return {'error': str(e)}
def get_website_metadata(url: str) -> dict:
    """
    Extract metadata from a website including title, description, etc.

    Args:
        url: The website URL to analyze

    Returns:
        Dictionary with 'title', 'description', 'author', 'date', 'url' and
        'sitename' keys (placeholder strings when a field is missing), or a
        dict with an 'error' key on failure.
    """
    try:
        response = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        if response.status_code != 200:
            return {'error': f'HTTP {response.status_code}'}

        # Use trafilatura to extract metadata
        metadata = trafilatura.extract_metadata(response.text)

        # BUG FIX: the previous version only applied the placeholder strings
        # when `metadata` itself was None. trafilatura returns a metadata
        # object whose individual attributes (title, author, ...) are often
        # None, so callers could still receive None values. Fall back per
        # field instead.
        def _field(name: str, fallback: str) -> str:
            value = getattr(metadata, name, None) if metadata else None
            return value if value else fallback

        return {
            'title': _field('title', 'No title found'),
            'description': _field('description', 'No description found'),
            'author': _field('author', 'Unknown author'),
            'date': _field('date', 'No date found'),
            'url': _field('url', url),
            'sitename': _field('sitename', 'Unknown site')
        }

    except Exception as e:
        logging.error(f"Error extracting metadata from {url}: {e}")
        return {'error': str(e)}
def validate_url_accessibility(url: str) -> bool:
    """
    Check if a URL is accessible for scraping.

    Args:
        url: The URL to validate

    Returns:
        True if a HEAD request returns HTTP 200, False otherwise.
    """
    try:
        # HEAD avoids downloading the body; 5s timeout keeps the check fast.
        response = requests.head(url, timeout=5, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return response.status_code == 200
    except requests.RequestException:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Only network/HTTP errors should
        # mean "not accessible".
        return False