File size: 3,938 Bytes
52a0fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2aa7c3
 
 
52a0fe9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
Web content extraction from URLs using requests and BeautifulSoup4.
Extracts title and main text content from HTML pages.
"""
import time
import requests
from bs4 import BeautifulSoup
from models.schemas import ExtractionResult, DocumentMetadata


# Tags whose text is collected, in document order, as the page's main content.
_CONTENT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'li']

# Tags removed from the tree before any text is extracted.
_NOISE_TAGS = ["script", "style", "nav", "footer", "header", "aside"]


def _failed_extraction(url: str, message: str, start_time: float) -> ExtractionResult:
    """Build the unsuccessful result shared by both error handlers."""
    return ExtractionResult(
        raw_text="",
        metadata=DocumentMetadata(file_type="URL", extra={"url": url}),
        success=False,
        error_message=message,
        extraction_time_ms=(time.time() - start_time) * 1000,
    )


def _domain_of(url: str) -> str:
    """Return the network-location (host[:port]) component of *url*."""
    from urllib.parse import urlsplit

    try:
        netloc = urlsplit(url).netloc
    except ValueError:
        # urlsplit rejects some malformed authorities (e.g. bad IPv6 brackets).
        netloc = ""
    if netloc:
        return netloc
    # No parseable authority: fall back to the naive split used previously.
    return url.split('/')[2] if '//' in url else url.split('/')[0]


def _collect_text(soup) -> str:
    """Join paragraph, heading and list-item text into a markdown-like string."""
    # Simple heuristic: prefer <article>, then <body>, then the whole document.
    content_area = soup.find('article') or soup.body or soup

    lines = []
    for element in content_area.find_all(_CONTENT_TAGS):
        text = element.get_text().strip()
        if not text:
            continue
        if element.name.startswith('h'):
            # 'h2' -> '##': the digit in the tag name is the heading level.
            prefix = '#' * int(element.name[1])
            lines.append(f"\n{prefix} {text}\n")
        else:
            lines.append(text)
    # NOTE(review): nested matches (e.g. <li><p>..</p></li>) are emitted once
    # per matching tag, so their text can appear twice in the output.

    full_text = "\n\n".join(lines)
    if not full_text.strip():
        # Fallback to general text extraction when no content tags had text.
        full_text = soup.get_text(separator='\n\n', strip=True)
    return full_text


def extract_url(url: str) -> ExtractionResult:
    """Fetch a web page and extract its title and main text content.

    Args:
        url: Address of the page to download (passed to ``requests.get``).

    Returns:
        An ``ExtractionResult``. On success ``raw_text`` holds the extracted
        text and ``metadata`` the title, word/character counts and request
        details. On any failure (network error, HTTP error status, parsing
        error, or no text found) ``success`` is ``False`` and
        ``error_message`` describes the problem. Exceptions deriving from
        ``Exception`` are caught and reported this way rather than raised.
    """
    start_time = time.time()

    try:
        # 1. Fetch content, presenting a desktop-browser User-Agent string.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        import urllib3
        # NOTE(review): TLS certificate verification is disabled (verify=False)
        # and the matching warning is silenced process-wide. This permits
        # man-in-the-middle tampering; kept for backward compatibility, but
        # confirm it is intentional.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        response = requests.get(url, headers=headers, timeout=10, verify=False)
        response.raise_for_status()

        # 2. Parse HTML from the raw bytes. requests falls back to ISO-8859-1
        # for text/* responses whose header carries no charset, which garbles
        # UTF-8 pages; letting BeautifulSoup sniff the encoding (e.g. from
        # <meta charset>) avoids that. A charset explicitly declared in the
        # HTTP header is still honoured.
        content_type = response.headers.get('Content-Type', '')
        declared_encoding = (
            response.encoding if 'charset' in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, 'html.parser', from_encoding=declared_encoding
        )

        # 3. Remove script/style and page-chrome elements
        for noise in soup(_NOISE_TAGS):
            noise.decompose()

        # 4. Get title and text.
        # get_text() is used instead of .string because .string is None when
        # <title> is empty or contains nested markup.
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag is not None else ""
        if not title:
            title = url

        full_text = _collect_text(soup)

        # 5. Build metadata
        metadata = DocumentMetadata(
            title=title,
            author="Web Content",
            creation_date="",
            modification_date="",
            page_count=None,
            word_count=len(full_text.split()),
            character_count=len(full_text),
            file_type="URL",
            extra={
                "url": url,
                "domain": _domain_of(url),
                "status_code": response.status_code,
                "content_type": content_type,
            }
        )

        elapsed = (time.time() - start_time) * 1000

        if not full_text.strip():
            return ExtractionResult(
                raw_text="",
                metadata=metadata,
                success=False,
                error_message="Could not extract any meaningful text from the provided URL.",
                extraction_time_ms=elapsed,
            )

        return ExtractionResult(
            raw_text=full_text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )

    except requests.exceptions.RequestException as e:
        # Connection problems, timeouts and HTTP error statuses.
        return _failed_extraction(url, f"Failed to fetch URL: {str(e)}", start_time)
    except Exception as e:
        # Boundary handler: report any other failure as a result.
        return _failed_extraction(url, f"Web extraction failed: {str(e)}", start_time)