import requests
from bs4 import BeautifulSoup
import re
import json
class ArticleScraper:
"""Web scraper for extracting full article content from news URLs"""
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
def scrape_article(self, url, timeout=10):
"""
Scrape full article content from a news URL
Args:
url: Article URL to scrape
timeout: Request timeout in seconds
Returns:
Cleaned article text or None if scraping fails
"""
try:
response = self.session.get(url, timeout=timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')
# Try multiple extraction methods in order of reliability
article_content = (
self._extract_by_schema(soup) or
self._extract_by_selector(soup, 'article') or
self._extract_by_selector(soup, '.article-body') or
self._extract_by_selector(soup, '.article-content') or
self._extract_by_selector(soup, '#article-content') or
self._extract_by_selector(soup, '.story-body') or
self._extract_by_selector(soup, '.entry-content') or
self._extract_paragraphs(soup)
)
if article_content:
return self._clean_text(article_content)
else:
print(f"Could not extract content from {url}")
return None
except requests.exceptions.Timeout:
print(f"Timeout scraping {url}")
return None
except requests.exceptions.RequestException as e:
print(f"Request error scraping {url}: {e}")
return None
except Exception as e:
print(f"Unexpected error scraping {url}: {e}")
return None
def _extract_by_selector(self, soup, selector):
"""Extract text from a CSS selector"""
element = soup.select_one(selector)
if element:
paragraphs = element.find_all('p')
if paragraphs:
return ' '.join(p.get_text() for p in paragraphs)
return None
def _extract_by_schema(self, soup):
"""Extract article body from JSON-LD schema.org metadata"""
script_tags = soup.find_all('script', type='application/ld+json')
for script_tag in script_tags:
try:
data = json.loads(script_tag.string)
# Handle both single objects and arrays
if isinstance(data, list):
for item in data:
if self._extract_article_body(item):
return self._extract_article_body(item)
else:
if self._extract_article_body(data):
return self._extract_article_body(data)
except (json.JSONDecodeError, AttributeError):
continue
return None
def _extract_article_body(self, data):
"""Extract articleBody from JSON-LD data"""
if isinstance(data, dict):
if data.get('@type') in ['Article', 'NewsArticle', 'BlogPosting']:
return data.get('articleBody')
return None
def _extract_paragraphs(self, soup):
"""Fallback: Extract all paragraph tags from body"""
# Remove script, style, nav, footer, and header elements
for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
element.decompose()
# Find all paragraphs
paragraphs = soup.find_all('p')
if len(paragraphs) >= 3: # Only use if we found a reasonable number of paragraphs
text = ' '.join(p.get_text() for p in paragraphs)
# Only return if we got substantial content
if len(text) > 200:
return text
return None
def _clean_text(self, text):
"""Clean and normalize extracted text"""
if not text:
return None
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text)
# Remove common cruft
text = re.sub(r'(Advertisement|ADVERTISEMENT)', '', text)
text = re.sub(r'(Read more:.*?\.)', '', text)
# Strip leading/trailing whitespace
text = text.strip()
# Only return if we have substantial content
if len(text) > 100:
return text
return None