"""
Web content extraction from URLs using requests and BeautifulSoup4.
Extracts title and main text content from HTML pages.
"""
import time
import requests
from bs4 import BeautifulSoup
from models.schemas import ExtractionResult, DocumentMetadata
def extract_url(url: str) -> ExtractionResult:
    """Fetch a web page and extract its title and readable text.

    The page is downloaded with ``requests``, parsed with BeautifulSoup,
    stripped of non-content elements (scripts, styles, navigation, header,
    footer, aside) and reduced to paragraph, list-item and heading text.
    Headings ``h1``-``h4`` are rendered as Markdown-style ``#`` headings.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        An ``ExtractionResult``. On success ``raw_text`` holds the extracted
        text and ``metadata`` describes the page (title, word and character
        counts, URL, domain, HTTP status code, content type). On failure
        ``success`` is ``False`` and ``error_message`` explains why; request
        errors and any other ``Exception`` are reported this way rather than
        propagated to the caller.
    """
    from urllib.parse import urlparse

    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'li']
    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the system clock is adjusted mid-request.
    start_time = time.perf_counter()
    try:
        # 1. Fetch content
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        import urllib3
        # NOTE(review): verify=False turns off TLS certificate validation, and
        # disable_warnings() silences the resulting warning for the whole
        # process. Kept to preserve existing behaviour for sites with broken
        # certificates -- confirm this trade-off is intended.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        response = requests.get(url, headers=headers, timeout=10, verify=False)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')

        # 2. Parse HTML
        # Hand BeautifulSoup the raw bytes: when the header carries no charset,
        # response.text falls back to ISO-8859-1 and garbles UTF-8 pages that
        # only declare their encoding in a <meta> tag. A charset from the
        # header is still honoured via from_encoding.
        declared_encoding = (
            response.encoding if 'charset' in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, 'html.parser', from_encoding=declared_encoding
        )

        # 3. Remove script, style and page-chrome elements
        for unwanted in soup(["script", "style", "nav", "footer", "header", "aside"]):
            unwanted.decompose()

        # 4. Get text
        # <title> may be empty or contain nested markup, in which case
        # .string is None; get_text() handles both. Fall back to the URL.
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag else ""
        if not title:
            title = url

        # Main text - simple heuristic: prefer <article>, then <body>,
        # then the whole document.
        content_area = soup.find('article') or soup.body
        if not content_area:
            content_area = soup

        # Extract text while preserving some paragraph structure
        lines = []
        for element in content_area.find_all(text_tags):
            text = _own_text(element, text_tags)
            if not text:
                continue
            if element.name.startswith('h'):
                prefix = '#' * int(element.name[1])
                lines.append(f"\n{prefix} {text}\n")
            else:
                lines.append(text)
        full_text = "\n\n".join(lines)
        if not full_text.strip():
            # Fallback to general text extraction
            full_text = soup.get_text(separator='\n\n', strip=True)

        # 5. Build metadata
        # The fetch succeeded, so the URL has a scheme and urlparse yields the
        # host; the split fallback only guards against an empty netloc.
        domain = urlparse(url).netloc or url.split('/')[0]
        metadata = DocumentMetadata(
            title=title,
            author="Web Content",
            creation_date="",
            modification_date="",
            page_count=None,
            word_count=len(full_text.split()),
            character_count=len(full_text),
            file_type="URL",
            extra={
                "url": url,
                "domain": domain,
                "status_code": response.status_code,
                "content_type": content_type,
            },
        )
        elapsed = (time.perf_counter() - start_time) * 1000
        if not full_text.strip():
            return ExtractionResult(
                raw_text="",
                metadata=metadata,
                success=False,
                error_message="Could not extract any meaningful text from the provided URL.",
                extraction_time_ms=elapsed,
            )
        return ExtractionResult(
            raw_text=full_text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )
    except requests.exceptions.RequestException as e:
        return _failed_extraction(url, f"Failed to fetch URL: {str(e)}", start_time)
    except Exception as e:
        # Top-level boundary: callers receive a failed result, not an exception.
        return _failed_extraction(url, f"Web extraction failed: {str(e)}", start_time)


def _own_text(element, tags) -> str:
    """Return the text of *element* that is not owned by a nested tag in *tags*.

    Each string is attributed to its nearest ancestor whose name is in *tags*,
    so nested structures such as ``<li><p>..</p></li>`` or nested lists are
    emitted once instead of once per enclosing element.
    """
    return "".join(
        piece for piece in element.strings if piece.find_parent(tags) is element
    ).strip()


def _failed_extraction(url: str, message: str, start_time: float) -> ExtractionResult:
    """Build the failed ``ExtractionResult`` shared by the error paths."""
    return ExtractionResult(
        raw_text="",
        metadata=DocumentMetadata(file_type="URL", extra={"url": url}),
        success=False,
        error_message=message,
        extraction_time_ms=(time.perf_counter() - start_time) * 1000,
    )
|