import json
import re

import requests
from bs4 import BeautifulSoup


class ArticleScraper:
    """Web scraper for extracting full article content from news URLs"""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_article(self, url, timeout=10):
        """
        Scrape full article content from a news URL

        Args:
            url: Article URL to scrape
            timeout: Request timeout in seconds

        Returns:
            Cleaned article text or None if scraping fails
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'lxml')

            # Try multiple extraction methods in order of reliability
            article_content = (
                self._extract_by_schema(soup) or
                self._extract_by_selector(soup, 'article') or
                self._extract_by_selector(soup, '.article-body') or
                self._extract_by_selector(soup, '.article-content') or
                self._extract_by_selector(soup, '#article-content') or
                self._extract_by_selector(soup, '.story-body') or
                self._extract_by_selector(soup, '.entry-content') or
                self._extract_paragraphs(soup)
            )

            if article_content:
                return self._clean_text(article_content)
            else:
                print(f"Could not extract content from {url}")
                return None

        except requests.exceptions.Timeout:
            print(f"Timeout scraping {url}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Request error scraping {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error scraping {url}: {e}")
            return None

    def _extract_by_selector(self, soup, selector):
        """Extract text from a CSS selector"""
        element = soup.select_one(selector)
        if element:
            paragraphs = element.find_all('p')
            if paragraphs:
                return ' '.join(p.get_text() for p in paragraphs)
        return None

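    # For reference, _extract_by_schema targets JSON-LD metadata of roughly
    # this shape (an illustrative sketch only; real-world payloads vary and
    # may nest objects in arrays or under an "@graph" key, which this
    # implementation does not traverse):
    #
    #   <script type="application/ld+json">
    #   {
    #       "@context": "https://schema.org",
    #       "@type": "NewsArticle",
    #       "articleBody": "Full text of the article..."
    #   }
    #   </script>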
    def _extract_by_schema(self, soup):
        """Extract article body from JSON-LD schema.org metadata"""
        script_tags = soup.find_all('script', type='application/ld+json')

        for script_tag in script_tags:
            try:
                data = json.loads(script_tag.string)
            except (json.JSONDecodeError, TypeError):
                # Skip empty (.string is None) or malformed JSON-LD blocks
                continue

            # JSON-LD payloads may be a single object or an array of objects
            items = data if isinstance(data, list) else [data]
            for item in items:
                body = self._extract_article_body(item)
                if body:
                    return body

        return None

    def _extract_article_body(self, data):
        """Extract articleBody from a JSON-LD dict if it describes an article"""
        if isinstance(data, dict):
            # @type may be a single string or a list of type names
            types = data.get('@type')
            if isinstance(types, str):
                types = [types]
            if types and any(t in ('Article', 'NewsArticle', 'BlogPosting') for t in types):
                return data.get('articleBody')
        return None

    def _extract_paragraphs(self, soup):
        """Fallback: Extract all paragraph tags from body"""
        # Remove script, style, nav, footer, and header elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Find all paragraphs
        paragraphs = soup.find_all('p')

        if len(paragraphs) >= 3:  # Only use if we found a reasonable number of paragraphs
            text = ' '.join(p.get_text() for p in paragraphs)
            # Only return if we got substantial content
            if len(text) > 200:
                return text

        return None

    def _clean_text(self, text):
        """Clean and normalize extracted text"""
        if not text:
            return None

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common cruft
        text = re.sub(r'Advertisement|ADVERTISEMENT', '', text)
        text = re.sub(r'Read more:.*?\.', '', text)

        # Strip leading/trailing whitespace
        text = text.strip()

        # Only return if we have substantial content
        if len(text) > 100:
            return text

        return None
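

if __name__ == '__main__':
    # Minimal usage sketch, assuming network access and an article page you
    # are permitted to fetch. The URL below is a placeholder, not a real
    # endpoint; substitute any news article URL.
    scraper = ArticleScraper()
    article_text = scraper.scrape_article('https://example.com/news/some-story')
    if article_text:
        print(f"Extracted {len(article_text)} characters:")
        print(article_text[:300])
    else:
        print("Extraction failed")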