# Article scraper module.
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import json | |
class ArticleScraper:
    """Web scraper for extracting full article content from news URLs."""

    def __init__(self):
        # One shared Session gives connection pooling and persistent headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_article(self, url, timeout=10):
        """
        Scrape full article content from a news URL.

        Args:
            url: Article URL to scrape
            timeout: Request timeout in seconds

        Returns:
            Cleaned article text or None if scraping fails
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')
            # Try extraction methods in order of reliability: structured
            # JSON-LD metadata first, then common article CSS selectors,
            # then raw <p>-tag harvesting as a last resort. The paragraph
            # fallback mutates the soup, so it must come last.
            article_content = (
                self._extract_by_schema(soup) or
                self._extract_by_selector(soup, 'article') or
                self._extract_by_selector(soup, '.article-body') or
                self._extract_by_selector(soup, '.article-content') or
                self._extract_by_selector(soup, '#article-content') or
                self._extract_by_selector(soup, '.story-body') or
                self._extract_by_selector(soup, '.entry-content') or
                self._extract_paragraphs(soup)
            )
            if article_content:
                return self._clean_text(article_content)
            print(f"Could not extract content from {url}")
            return None
        except requests.exceptions.Timeout:
            print(f"Timeout scraping {url}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Request error scraping {url}: {e}")
            return None
        except Exception as e:
            # Boundary catch-all: a single bad page must never crash the caller.
            print(f"Unexpected error scraping {url}: {e}")
            return None

    def _extract_by_selector(self, soup, selector):
        """Return joined <p> text under the first match of *selector*, or None."""
        element = soup.select_one(selector)
        if element:
            paragraphs = element.find_all('p')
            if paragraphs:
                return ' '.join(p.get_text() for p in paragraphs)
        return None

    def _extract_by_schema(self, soup):
        """Extract the article body from JSON-LD schema.org metadata, or None."""
        for script_tag in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script_tag.string)
            # TypeError: empty <script> tags have .string == None, and
            # json.loads(None) raises TypeError, not JSONDecodeError.
            except (json.JSONDecodeError, AttributeError, TypeError):
                continue
            # JSON-LD payloads may be a single object or an array of objects.
            candidates = data if isinstance(data, list) else [data]
            for item in candidates:
                # Call the helper once and reuse the result (the original
                # evaluated it twice per item).
                body = self._extract_article_body(item)
                if body:
                    return body
        return None

    def _extract_article_body(self, data):
        """Return data['articleBody'] if *data* is an article-typed dict, else None."""
        if isinstance(data, dict):
            item_type = data.get('@type')
            # Per JSON-LD, @type may be a single string or a list of strings.
            types = item_type if isinstance(item_type, list) else [item_type]
            if any(t in ('Article', 'NewsArticle', 'BlogPosting') for t in types):
                return data.get('articleBody')
        return None

    def _extract_paragraphs(self, soup):
        """Fallback: harvest all <p> tags from the page body.

        NOTE: mutates *soup* by removing boilerplate elements, so this must
        be the last extraction strategy attempted (scrape_article does so).
        """
        # Strip script, style, and navigational chrome before harvesting.
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()
        paragraphs = soup.find_all('p')
        # Require several paragraphs so nav/teaser fragments are not mistaken
        # for an article.
        if len(paragraphs) >= 3:
            text = ' '.join(p.get_text() for p in paragraphs)
            # Only return substantial content.
            if len(text) > 200:
                return text
        return None

    def _clean_text(self, text):
        """Normalize whitespace and strip common cruft; None if too short."""
        if not text:
            return None
        # Collapse all whitespace runs to single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Remove common cruft injected into article bodies.
        text = re.sub(r'(Advertisement|ADVERTISEMENT)', '', text)
        text = re.sub(r'(Read more:.*?\.)', '', text)
        text = text.strip()
        # Anything this short is a remnant, not an article.
        if len(text) > 100:
            return text
        return None