# MarketLens/be/scraper.py
import json
import re

import requests
from bs4 import BeautifulSoup


class ArticleScraper:
    """Web scraper for extracting full article content from news URLs."""

    def __init__(self):
        # Reuse a single Session so connection pooling and the browser-like
        # User-Agent header apply to every request
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_article(self, url, timeout=10):
        """
        Scrape full article content from a news URL.

        Args:
            url: Article URL to scrape
            timeout: Request timeout in seconds

        Returns:
            Cleaned article text, or None if scraping fails
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')

            # Try multiple extraction methods in order of reliability:
            # structured JSON-LD metadata first, then common CMS content
            # containers, then a bare-paragraph fallback
            article_content = (
                self._extract_by_schema(soup) or
                self._extract_by_selector(soup, 'article') or
                self._extract_by_selector(soup, '.article-body') or
                self._extract_by_selector(soup, '.article-content') or
                self._extract_by_selector(soup, '#article-content') or
                self._extract_by_selector(soup, '.story-body') or
                self._extract_by_selector(soup, '.entry-content') or
                self._extract_paragraphs(soup)
            )

            if article_content:
                return self._clean_text(article_content)
            else:
                print(f"Could not extract content from {url}")
                return None
        except requests.exceptions.Timeout:
            print(f"Timeout scraping {url}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Request error scraping {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error scraping {url}: {e}")
            return None

    def _extract_by_selector(self, soup, selector):
        """Extract paragraph text from the first element matching a CSS selector"""
        element = soup.select_one(selector)
        if element:
            paragraphs = element.find_all('p')
            if paragraphs:
                return ' '.join(p.get_text() for p in paragraphs)
        return None

    def _extract_by_schema(self, soup):
        """Extract article body from JSON-LD schema.org metadata"""
        script_tags = soup.find_all('script', type='application/ld+json')
        for script_tag in script_tags:
            try:
                data = json.loads(script_tag.string)
                # JSON-LD payloads may be a single object or an array of objects
                items = data if isinstance(data, list) else [data]
                for item in items:
                    body = self._extract_article_body(item)
                    if body:
                        return body
            except (json.JSONDecodeError, TypeError, AttributeError):
                # Skip script tags with no string content or malformed JSON
                # (json.loads(None) raises TypeError)
                continue
        return None
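
    # For reference, the markup this method targets typically looks like the
    # following (illustrative snippet, not taken from the original source):
    #
    #   <script type="application/ld+json">
    #     {"@context": "https://schema.org",
    #      "@type": "NewsArticle",
    #      "headline": "...",
    #      "articleBody": "Full article text ..."}
    #   </script>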

    def _extract_article_body(self, data):
        """Extract articleBody from JSON-LD data"""
        if isinstance(data, dict):
            if data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting']:
                return data.get('articleBody')
        return None

    def _extract_paragraphs(self, soup):
        """Fallback: extract all paragraph tags from the page body"""
        # Remove script, style, nav, footer, header, and aside elements.
        # Note: decompose() mutates the soup in place, which is safe here
        # because this is the last extraction method attempted
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Find all remaining paragraphs
        paragraphs = soup.find_all('p')
        if len(paragraphs) >= 3:  # Only use if we found a reasonable number of paragraphs
            text = ' '.join(p.get_text() for p in paragraphs)
            # Only return if we got substantial content
            if len(text) > 200:
                return text
        return None

    def _clean_text(self, text):
        """Clean and normalize extracted text"""
        if not text:
            return None

        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove common cruft
        text = re.sub(r'(Advertisement|ADVERTISEMENT)', '', text)
        text = re.sub(r'Read more:.*?\.', '', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        # Only return if we have substantial content
        if len(text) > 100:
            return text
        return None
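

# Minimal usage sketch (not part of the original module): the URL below is a
# placeholder, and many news sites block or throttle scrapers, so a real run
# may legitimately return None.
if __name__ == '__main__':
    scraper = ArticleScraper()
    article_text = scraper.scrape_article('https://example.com/news/some-story')
    if article_text:
        print(article_text[:500])  # preview the first 500 characters
    else:
        print('No article content extracted')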