File size: 3,834 Bytes
a19173c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import json
import logging
from typing import Optional

import requests
import trafilatura

def get_website_text_content(url: str) -> Optional[str]:
    """
    Extract clean text content from a website URL using trafilatura.

    Args:
        url: The website URL to scrape

    Returns:
        Clean (stripped) text content, or None if the download fails,
        extraction fails, or the extracted text is under 50 characters.
    """
    try:
        # Download the webpage; trafilatura returns a falsy value on failure.
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            # Lazy %-style args: the message is only formatted if the
            # record is actually emitted.
            logging.warning("Failed to download content from %s", url)
            return None

        # Extract the main text content, discarding boilerplate.
        text = trafilatura.extract(downloaded)
        if not text:
            logging.warning("Failed to extract text from %s", url)
            return None

        # Strip once and reuse (original stripped twice).
        text = text.strip()
        if len(text) < 50:  # Too short to be useful
            logging.warning("Extracted content too short from %s", url)
            return None

        return text

    except Exception as e:
        # Broad catch is deliberate: a scraping failure must never
        # propagate to the caller; the contract is "None on any failure".
        logging.error("Error extracting content from %s: %s", url, e)
        return None

def extract_structured_data(url: str) -> dict:
    """
    Extract structured data from a webpage including metadata.

    Args:
        url: The website URL to analyze

    Returns:
        Dictionary containing structured data (parsed from trafilatura's
        JSON output), or a dict with an 'error' key on any failure.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return {'error': 'Failed to download content'}

        # Extract with metadata; JSON output so the result can be parsed
        # into a plain dict for the caller.
        result = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=True,
            include_formatting=True,
            output_format='json'
        )
        # Guard clause instead of if/else; json is imported at module top.
        if not result:
            return {'error': 'Failed to extract structured data'}

        return json.loads(result)

    except Exception as e:
        # Failures are reported in-band as an 'error' entry, never raised.
        logging.error("Error extracting structured data from %s: %s", url, e)
        return {'error': str(e)}

def get_website_metadata(url: str) -> dict:
    """
    Extract metadata from a website including title, description, etc.

    Args:
        url: The website URL to analyze

    Returns:
        Dictionary with 'title', 'description', 'author', 'date', 'url'
        and 'sitename' keys (placeholder strings when a field is missing),
        or a dict with an 'error' key on HTTP/network failure.
    """
    try:
        response = requests.get(url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        if response.status_code != 200:
            return {'error': f'HTTP {response.status_code}'}

        # Use trafilatura to extract metadata from the raw HTML.
        metadata = trafilatura.extract_metadata(response.text)

        if metadata is None:
            return {
                'title': 'No title found',
                'description': 'No description found',
                'author': 'Unknown author',
                'date': 'No date found',
                'url': url,
                'sitename': 'Unknown site'
            }

        # BUG FIX: individual attributes can be None even when the metadata
        # object exists — fall back per field, not only on the whole object.
        return {
            'title': metadata.title or 'No title found',
            'description': metadata.description or 'No description found',
            'author': metadata.author or 'Unknown author',
            'date': metadata.date or 'No date found',
            'url': metadata.url or url,
            'sitename': metadata.sitename or 'Unknown site'
        }

    except Exception as e:
        logging.error("Error extracting metadata from %s: %s", url, e)
        return {'error': str(e)}

def validate_url_accessibility(url: str) -> bool:
    """
    Check if a URL is accessible for scraping.

    Args:
        url: The URL to validate

    Returns:
        True if a HEAD request (following redirects) returns HTTP 200,
        False on any other status or network error.
    """
    try:
        # BUG FIX: requests.head() does NOT follow redirects by default,
        # so URLs behind a 301/302 were wrongly reported inaccessible.
        response = requests.head(url, timeout=5, allow_redirects=True, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return response.status_code == 200
    except requests.RequestException:
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; catch only network-level errors.
        return False