import json
import re

import requests
from bs4 import BeautifulSoup


class ArticleScraper:
    """Web scraper for extracting full article content from news URLs"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_article(self, url, timeout=10):
        """
        Scrape full article content from a news URL

        Args:
            url: Article URL to scrape
            timeout: Request timeout in seconds

        Returns:
            Cleaned article text or None if scraping fails
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'lxml')

            # Try multiple extraction methods in order of reliability
            article_content = (
                self._extract_by_schema(soup) or
                self._extract_by_selector(soup, 'article') or
                self._extract_by_selector(soup, '.article-body') or
                self._extract_by_selector(soup, '.article-content') or
                self._extract_by_selector(soup, '#article-content') or
                self._extract_by_selector(soup, '.story-body') or
                self._extract_by_selector(soup, '.entry-content') or
                self._extract_paragraphs(soup)
            )

            if article_content:
                return self._clean_text(article_content)
            else:
                print(f"Could not extract content from {url}")
                return None

        except requests.exceptions.Timeout:
            print(f"Timeout scraping {url}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Request error scraping {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error scraping {url}: {e}")
            return None

    def _extract_by_selector(self, soup, selector):
        """Extract text from a CSS selector"""
        element = soup.select_one(selector)
        if element:
            paragraphs = element.find_all('p')
            if paragraphs:
                return ' '.join(p.get_text() for p in paragraphs)
        return None

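    # For reference, _extract_by_schema targets JSON-LD metadata of roughly
    # this shape (an illustrative sketch only; real-world payloads vary and
    # may nest objects in arrays or under an "@graph" key, which this
    # implementation does not traverse):
    #
    #   <script type="application/ld+json">
    #   {
    #       "@context": "https://schema.org",
    #       "@type": "NewsArticle",
    #       "articleBody": "Full text of the article..."
    #   }
    #   </script>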
    def _extract_by_schema(self, soup):
        """Extract article body from JSON-LD schema.org metadata"""
        script_tags = soup.find_all('script', type='application/ld+json')

        for script_tag in script_tags:
            try:
                data = json.loads(script_tag.string)
            except (json.JSONDecodeError, TypeError):
                # Skip empty (.string is None) or malformed JSON-LD blocks
                continue

            # JSON-LD payloads may be a single object or an array of objects
            items = data if isinstance(data, list) else [data]
            for item in items:
                body = self._extract_article_body(item)
                if body:
                    return body

        return None

    def _extract_article_body(self, data):
        """Extract articleBody from a JSON-LD dict if it describes an article"""
        if isinstance(data, dict):
            # @type may be a single string or a list of type names
            types = data.get('@type')
            if isinstance(types, str):
                types = [types]
            if types and any(t in ('Article', 'NewsArticle', 'BlogPosting') for t in types):
                return data.get('articleBody')
        return None

    def _extract_paragraphs(self, soup):
        """Fallback: Extract all paragraph tags from body"""
        # Remove script, style, nav, footer, and header elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Find all paragraphs
        paragraphs = soup.find_all('p')

        if len(paragraphs) >= 3:  # Only use if we found a reasonable number of paragraphs
            text = ' '.join(p.get_text() for p in paragraphs)
            # Only return if we got substantial content
            if len(text) > 200:
                return text

        return None

    def _clean_text(self, text):
        """Clean and normalize extracted text"""
        if not text:
            return None

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common cruft
        text = re.sub(r'Advertisement|ADVERTISEMENT', '', text)
        text = re.sub(r'Read more:.*?\.', '', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        # Only return if we have substantial content
        if len(text) > 100:
            return text

        return None
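

if __name__ == '__main__':
    # Minimal usage sketch, assuming network access and an article page you
    # are permitted to fetch. The URL below is a placeholder, not a real
    # endpoint; substitute any news article URL.
    scraper = ArticleScraper()
    article_text = scraper.scrape_article('https://example.com/news/some-story')
    if article_text:
        print(f"Extracted {len(article_text)} characters:")
        print(article_text[:300])
    else:
        print("Extraction failed")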