"""

Web Fetcher Tool - Fetch and extract content from web pages

"""
import logging
from typing import Any, Dict, List
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.helpers import validate_url, clean_text, format_timestamp

logger = logging.getLogger(__name__)


def fetch_web_content(url: str, extract_text_only: bool = True, timeout: int = 30) -> Dict[str, Any]:
    """

    Fetch content from a web URL.

    

    Args:

        url: URL to fetch

        extract_text_only: If True, extract only text content; if False, return HTML

        timeout: Request timeout in seconds

        

    Returns:

        Dictionary containing fetched content, status code, and metadata

    """
    # Import dependencies outside the fetch try-block so a missing package
    # surfaces as a clear ImportError rather than a NameError when the
    # requests-specific except clause below is evaluated.
    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError as e:
        logger.error(f"Missing dependency (requests / beautifulsoup4): {e}")
        raise

    try:
        # Validate URL
        if not validate_url(url):
            raise ValueError(f"Invalid URL format: {url}")
        
        # Set headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Fetch content
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        
        content = ""
        content_type = response.headers.get('Content-Type', '')
        
        if extract_text_only and 'text/html' in content_type:
            # Parse HTML and extract text
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract title, guarding against a missing or empty <title> tag
            title = soup.title.string.strip() if soup.title and soup.title.string else "No title"
            
            # Extract links
            links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if href and not href.startswith('#'):
                    links.append(href)
            
            # Remove non-content elements (scripts, styles, page chrome) before extracting text
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()
            
            # Get text
            text = soup.get_text()
            
            # Clean up text
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            content = '\n'.join(chunk for chunk in chunks if chunk)
            
            # Further clean
            content = clean_text(content)
            
        else:
            # Return raw content
            content = response.text
            title = "N/A (non-HTML content)"
            links = []
        
        # Build metadata
        metadata = {
            "url": url,
            "status_code": response.status_code,
            "content_type": content_type,
            "content_length": len(content),
            "encoding": response.encoding,
            "timestamp": format_timestamp(),
            "headers": dict(response.headers)
        }
        
        return {
            "content": content,
            "status_code": response.status_code,
            "title": title,
            "links": links,
            "metadata": metadata
        }
        
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching {url}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error fetching web content: {e}")
        raise

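# A hedged usage sketch (kept as a comment so nothing runs on import): fetch a
# page as text and inspect the pieces of the result dictionary. The URL is
# illustrative only.
#
#   page = fetch_web_content("https://example.com", extract_text_only=True)
#   page["title"]                     # page <title>, or "No title"
#   page["content"]                   # cleaned visible text
#   page["links"]                     # hrefs found in the page (may be relative)
#   page["metadata"]["content_type"]  # e.g. "text/html; charset=UTF-8"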

def fetch_multiple_urls(urls: List[str], extract_text_only: bool = True) -> List[Dict[str, Any]]:
    """
    Fetch content from multiple URLs, continuing past per-URL failures.

    Args:
        urls: List of URLs to fetch.
        extract_text_only: Whether to extract text only.

    Returns:
        List of result dictionaries, one per URL in input order; failed
        fetches have "success" set to False and carry an "error" message.
    """
    results = []
    for idx, url in enumerate(urls):
        try:
            result = fetch_web_content(url, extract_text_only)
            result["index"] = idx
            result["success"] = True
            results.append(result)
        except Exception as e:
            logger.error(f"Error fetching URL at index {idx} ({url}): {e}")
            results.append({
                "index": idx,
                "url": url,
                "success": False,
                "error": str(e),
                "content": "",
                "status_code": 0
            })
    
    return results

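# Illustrative note: fetch_multiple_urls preserves input order via the "index"
# field and never raises for individual failures, so callers can partition the
# results on "success". The URLs below are hypothetical.
#
#   batch = fetch_multiple_urls(["https://example.com", "https://invalid.example"])
#   ok = [r for r in batch if r["success"]]
#   failed = [r for r in batch if not r["success"]]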

def extract_links(url: str) -> Dict[str, Any]:
    """

    Extract all links from a web page.

    

    Args:

        url: URL to extract links from

        

    Returns:

        Dictionary with extracted links

    """
    try:
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin
        
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        links = []
        for link in soup.find_all('a', href=True):
            absolute_url = urljoin(url, link['href'])
            links.append({
                "text": link.get_text(strip=True),
                "href": absolute_url
            })
        
        return {
            "url": url,
            "total_links": len(links),
            "links": links
        }
        
    except Exception as e:
        logger.error(f"Error extracting links: {e}")
        raise
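

# Minimal smoke test: a hedged sketch assuming network access, not part of the
# tool's API. https://example.com is used purely as an illustrative target.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    page = fetch_web_content("https://example.com")
    print(f"Fetched {page['metadata']['content_length']} characters "
          f"from '{page['title']}' (HTTP {page['status_code']})")

    link_report = extract_links("https://example.com")
    print(f"Found {link_report['total_links']} link(s)")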