Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
def fetch_text_from_url(url, max_chars=3000):
    """Fetch a web page and return its visible text as one cleaned string.

    The page is downloaded with a 10-second timeout, parsed with
    BeautifulSoup's ``html.parser``, stripped of non-content elements
    (``script``, ``style``, ``header``, ``footer``, ``nav``, ``aside``),
    whitespace-normalized and finally truncated to ``max_chars``.

    Args:
        url: Address of the page to download.
        max_chars: Maximum number of characters to return. Defaults to
            3000, the value the original code labelled as the
            "distilBART safe limit". Pass ``None`` to disable truncation.

    Returns:
        The cleaned text (an empty string if the page has no visible
        text), or ``None`` if the request failed or the server answered
        with an HTTP error status.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    # Reject a nonsensical limit before doing any network work; a negative
    # slice bound would silently drop characters from the END of the text.
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be non-negative or None")

    try:
        response = requests.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they share the
        # error path with connection problems and timeouts.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Could not retrieve URL: {e}")
        return None

    # Raw bytes are handed over so the parser performs its own encoding
    # detection instead of relying on the HTTP header guess.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove page chrome and non-visible code before extracting text.
    for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        tag.decompose()

    text = soup.get_text(separator=' ', strip=True)
    # Collapse every run of whitespace (newlines, tabs, repeated spaces)
    # into a single space.
    text = re.sub(r'\s+', ' ', text).strip()

    # Plain character cut; it may end in the middle of a word.
    if max_chars is not None and len(text) > max_chars:
        text = text[:max_chars]
    return text