File size: 3,834 Bytes
a19173c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import json
import logging
from typing import Optional

import requests
import trafilatura

def get_website_text_content(url: str) -> Optional[str]:
    """
    Extract clean text content from a website URL using trafilatura.

    Args:
        url: The website URL to scrape

    Returns:
        Clean (stripped) text content, or None if the download fails,
        extraction fails, or the extracted text is under 50 characters.
    """
    try:
        # Download the webpage; trafilatura returns a falsy value on failure.
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            # Lazy %-style args: the message is only formatted if the
            # record is actually emitted.
            logging.warning("Failed to download content from %s", url)
            return None

        # Extract the main text content, discarding boilerplate.
        text = trafilatura.extract(downloaded)
        if not text:
            logging.warning("Failed to extract text from %s", url)
            return None

        # Strip once and reuse (original stripped twice).
        text = text.strip()
        if len(text) < 50:  # Too short to be useful
            logging.warning("Extracted content too short from %s", url)
            return None

        return text

    except Exception as e:
        # Broad catch is deliberate: a scraping failure must never
        # propagate to the caller; the contract is "None on any failure".
        logging.error("Error extracting content from %s: %s", url, e)
        return None

def extract_structured_data(url: str) -> dict:
    """
    Extract structured data from a webpage including metadata.

    Args:
        url: The website URL to analyze

    Returns:
        Dictionary containing structured data (parsed from trafilatura's
        JSON output), or a dict with an 'error' key on any failure.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return {'error': 'Failed to download content'}

        # Extract with metadata; JSON output so the result can be parsed
        # into a plain dict for the caller.
        result = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=True,
            include_formatting=True,
            output_format='json'
        )
        # Guard clause instead of if/else; json is imported at module top.
        if not result:
            return {'error': 'Failed to extract structured data'}

        return json.loads(result)

    except Exception as e:
        # Failures are reported in-band as an 'error' entry, never raised.
        logging.error("Error extracting structured data from %s: %s", url, e)
        return {'error': str(e)}

def get_website_metadata(url: str) -> dict:
    """
    Extract metadata from a website including title, description, etc.

    Args:
        url: The website URL to analyze

    Returns:
        Dictionary with 'title', 'description', 'author', 'date', 'url'
        and 'sitename' keys (placeholder strings when a field is missing),
        or a dict with an 'error' key on HTTP/network failure.
    """
    try:
        response = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        if response.status_code != 200:
            return {'error': f'HTTP {response.status_code}'}

        # Use trafilatura to extract metadata from the raw HTML.
        metadata = trafilatura.extract_metadata(response.text)

        if metadata is None:
            return {
                'title': 'No title found',
                'description': 'No description found',
                'author': 'Unknown author',
                'date': 'No date found',
                'url': url,
                'sitename': 'Unknown site'
            }

        # BUG FIX: individual attributes can be None even when the metadata
        # object exists — fall back per field, not only on the whole object.
        return {
            'title': metadata.title or 'No title found',
            'description': metadata.description or 'No description found',
            'author': metadata.author or 'Unknown author',
            'date': metadata.date or 'No date found',
            'url': metadata.url or url,
            'sitename': metadata.sitename or 'Unknown site'
        }

    except Exception as e:
        logging.error("Error extracting metadata from %s: %s", url, e)
        return {'error': str(e)}

def validate_url_accessibility(url: str) -> bool:
    """
    Check if a URL is accessible for scraping.

    Args:
        url: The URL to validate

    Returns:
        True if a HEAD request (following redirects) returns HTTP 200,
        False on any other status or network error.
    """
    try:
        # BUG FIX: requests.head() does NOT follow redirects by default,
        # so URLs behind a 301/302 were wrongly reported inaccessible.
        response = requests.head(url, timeout=5, allow_redirects=True, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return response.status_code == 200
    except requests.RequestException:
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; catch only network-level errors.
        return False