Text_Summarizer / url_input.py
Sneha-Kaurav's picture
Create url_input.py
d1696ce verified
raw
history blame contribute delete
758 Bytes
import requests
from bs4 import BeautifulSoup
import re
def fetch_text_from_url(url, max_chars=3000, timeout=10):
    """Fetch a web page and return its visible text as one normalized string.

    The page is downloaded with ``requests.get``, parsed with BeautifulSoup's
    ``html.parser``, and stripped of ``script``, ``style``, ``header``,
    ``footer``, ``nav`` and ``aside`` elements. The remaining text has every
    run of whitespace collapsed to a single space.

    Args:
        url: Address of the page to download.
        max_chars: Maximum number of characters to return (non-negative).
            Longer text is cut at exactly this length, which may fall in the
            middle of a word. ``None`` disables truncation. The default of
            3000 is the value the original code used as its "distilBART safe
            limit".
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument, in seconds.

    Returns:
        The cleaned text (an empty string if the page has no remaining text),
        or ``None`` if the request raised a ``requests.RequestException`` --
        which includes the error raised by ``raise_for_status()``.
    """
    # Keep the try body limited to the calls that can raise a request error.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Could not retrieve URL: {e}")
        return None

    # Hand BeautifulSoup the raw bytes so it can work out the encoding itself.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Drop non-content elements before extracting text.
    for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        tag.decompose()

    text = soup.get_text(separator=' ', strip=True)
    # Collapse newlines, tabs and repeated spaces into single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # Cap the length so downstream summarization input stays bounded.
    if max_chars is not None and len(text) > max_chars:
        text = text[:max_chars]
    return text