VidSimplify / manimator /inputs /url_parser.py
Adityahulk
Restoring repo state for deployment
6fc3143
Raw
History Blame Contribute Delete
1.66 kB
import logging
import requests
from bs4 import BeautifulSoup
from readability import Document
logger = logging.getLogger(__name__)
class URLParser:
    """Extract the main readable content from web pages.

    Pages are fetched with ``requests``, reduced to their main article with
    ``readability.Document`` and converted to plain text with BeautifulSoup.
    """

    # Browser-like User-Agent sent with every request (presumably so that
    # sites which reject the default client UA still respond -- confirm).
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )

    # Seconds passed as the ``timeout`` argument of ``requests.get``.
    _TIMEOUT_SECONDS = 10

    @staticmethod
    def parse(url: str) -> str:
        """Fetch *url* and return its title and main text content.

        Args:
            url: Address of the page. Surrounding whitespace is ignored and
                ``https://`` is prepended when no http/https scheme is
                present (the scheme check is case-insensitive).

        Returns:
            A string of the form ``"Title: <title>\\n\\n<text>"`` where
            ``<text>`` is the extracted article with blank lines removed and
            every remaining line stripped.

        Raises:
            RuntimeError: If *url* is empty/blank, or if fetching or parsing
                fails for any reason. For fetch/parse failures the original
                exception is attached as ``__cause__``.
        """
        url = (url or "").strip()
        if not url:
            # Fail fast instead of issuing a request to a bare "https://".
            raise RuntimeError("Failed to parse URL: URL is empty")

        # Compare case-insensitively so "HTTP://host" is not turned into
        # "https://HTTP://host".
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        logger.info("Fetching URL: %s", url)

        try:
            headers = {'User-Agent': URLParser._USER_AGENT}
            response = requests.get(
                url, headers=headers, timeout=URLParser._TIMEOUT_SECONDS
            )
            response.raise_for_status()

            # Without an explicit charset, requests decodes text/* bodies as
            # ISO-8859-1, which garbles UTF-8 pages; use the encoding
            # detected from the body in that case.
            content_type = response.headers.get('Content-Type', '')
            if 'charset' not in content_type.lower():
                response.encoding = response.apparent_encoding

            # Use readability to extract the main article content
            doc = Document(response.text)
            summary_html = doc.summary()
            title = doc.title()

            # Clean up HTML to get plain text
            soup = BeautifulSoup(summary_html, 'html.parser')
            text = soup.get_text(separator='\n\n')

            # Clean up whitespace: strip each line and drop the empty ones
            stripped_lines = (line.strip() for line in text.splitlines())
            clean_text = "\n".join(line for line in stripped_lines if line)

            full_content = f"Title: {title}\n\n{clean_text}"
            logger.info("Extracted %d characters from URL", len(full_content))
            return full_content
        except Exception as e:
            # Boundary handler: every failure is reported to callers as a
            # RuntimeError, chained to the underlying exception.
            logger.error("Error parsing URL: %s", e)
            raise RuntimeError(f"Failed to parse URL: {e}") from e