"""
News Scraper Module - Multi-Language
Supports English (ABP Live EN) and Hindi (ABP Live HI)
Exposes `scrape_articles` as a clean, callable Python function.
"""
import requests
from bs4 import BeautifulSoup
import re
import sys
import time
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Set, Dict, Optional
from urllib.parse import quote_plus
import os
# Ensure backend root is in PYTHONPATH so we can import core modules
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
# ---------------------------------------------
# Language Configuration
# ---------------------------------------------
class LanguageConfig:
    """Per-language scraping settings: site root, category map, search template,
    the scraper class to use, and where to store output."""
    def __init__(self, base_url: str, categories: Dict[str, Dict[str, str]], search_url_tpl: str, scraper_class_name: str, output_subfolder: str):
        # Site root, e.g. "https://news.abplive.com" (no trailing slash).
        self.base_url = base_url
        # Mapping of category key -> {"name": display name, "url": listing URL}.
        self.categories = categories
        # Search URL template containing a "{q}" placeholder for the query.
        self.search_url_tpl = search_url_tpl
        # Name of the scraper class (resolved by get_scraper).
        self.scraper_class_name = scraper_class_name
        # Subfolder name used when persisting output for this language.
        self.output_subfolder = output_subfolder
_EN_BASE = "https://news.abplive.com"
# Configuration for the English edition (news.abplive.com).
ENGLISH_CONFIG = LanguageConfig(
    base_url=_EN_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_EN_BASE}/"},
        "business": {"name": "Business", "url": f"{_EN_BASE}/business"},
        "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"},
        "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"},
        "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"},
    },
    search_url_tpl=f"{_EN_BASE}/search?s={{q}}",
    scraper_class_name="EnglishScraper",
    output_subfolder="english",
)
_HI_BASE = "https://www.abplive.com"
# Configuration for the Hindi edition (www.abplive.com).
HINDI_CONFIG = LanguageConfig(
    base_url=_HI_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_HI_BASE}/news"},
        "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"},
        "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"},
        "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"},
        "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"},
        "business": {"name": "Business", "url": f"{_HI_BASE}/business"},
        "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"},
        "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"},
    },
    search_url_tpl=f"{_HI_BASE}/search?s={{q}}",
    scraper_class_name="HindiScraper",
    output_subfolder="hindi",
)
# Registry of supported languages; keys are the values accepted by
# scrape_articles(language=...).
LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = {
    "english": ENGLISH_CONFIG,
    "hindi": HINDI_CONFIG,
}
# ---------------------------------------------
# Shared Utilities
# ---------------------------------------------
# Desktop browser User-Agent string sent with every HTTP request.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# ---------------------------------------------
# Scrapers
# ---------------------------------------------
class BaseScraper:
    """Shared scaffolding for the per-language ABP Live scrapers.

    Subclasses implement ``_extract_links`` (find article URLs on a listing
    or search page) and ``parse_article`` (turn one article URL into a dict).
    """

    def __init__(self, lang_cfg: "LanguageConfig"):
        self.lang_cfg = lang_cfg
        self.headers = {"User-Agent": USER_AGENT}

    def _build_search_page_url(self, encoded_query: str, page: int) -> str:
        """Return the search-results URL for *encoded_query* at *page*.

        Page 1 uses the template unchanged; later pages use the site's
        ``/search/page-N`` scheme. If the template does not contain the
        expected ``/search?`` segment, fall back to a ``paged`` query
        parameter so pagination still produces distinct URLs.
        """
        base_url = self.lang_cfg.search_url_tpl.format(q=encoded_query)
        if page <= 1:
            return base_url
        paged_url = base_url.replace("/search?", f"/search/page-{page}?", 1)
        if paged_url == base_url:
            # Template did not match the expected shape; append a query param.
            separator = "&" if "?" in base_url else "?"
            paged_url = f"{base_url}{separator}paged={page}"
        return paged_url

    def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]:
        """Collect unique article links from a category page or search pages.

        Args:
            url: Category listing URL (ignored when *is_search* is True).
            is_search: Treat *query* as a search term and paginate results.
            query: Raw (unencoded) search query string.
            max_pages: Number of search pages to scan (clamped to >= 1).

        Returns:
            A set of article URLs; empty if every page failed or matched nothing.
        """
        links: Set[str] = set()
        if is_search:
            if not self.lang_cfg.search_url_tpl:
                logger.error("Search is not supported for this language.")
                return links
            encoded_query = quote_plus(query)
            max_pages = max(1, max_pages)
            urls_to_fetch = [self._build_search_page_url(encoded_query, page) for page in range(1, max_pages + 1)]
        else:
            urls_to_fetch = [url]
        # NOTE: log message previously ended in mojibake ("β¦"); fixed to a real ellipsis.
        logger.info(f"Scanning {len(urls_to_fetch)} source page(s)…")
        for idx, src_url in enumerate(urls_to_fetch, 1):
            try:
                res = requests.get(src_url, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
                if res.status_code != 200:
                    logger.warning(f"HTTP {res.status_code} for page {idx}")
                    continue
                soup = BeautifulSoup(res.text, "html.parser")
                new_links = self._extract_links(soup, src_url, is_search=is_search)
                links |= new_links
                logger.success(f"Extracted {len(new_links)} links from page {idx}")
                if is_search and not new_links:
                    # An empty results page means we have run past the last page.
                    logger.info(f"No search results found on page {idx}; stopping pagination early.")
                    break
            except requests.Timeout:
                logger.error(f"Timeout on page {idx}")
            except Exception as e:
                # Best-effort: one bad page must not abort the whole scan.
                logger.warning(f"Error on page {idx}: {str(e)[:80]}")
        return links

    def _extract_links(self, soup: "BeautifulSoup", src_url: str, is_search: bool = False) -> Set[str]:
        """Extract candidate article URLs from a parsed page (subclass hook)."""
        raise NotImplementedError

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch and parse a single article URL (subclass hook)."""
        raise NotImplementedError
class EnglishScraper(BaseScraper):
    """Scraper for the English edition (news.abplive.com)."""

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        Articles are recognised by a trailing numeric id (``-12345``) or a
        ``.html`` suffix; search results are confined to the results wrapper.
        """
        links = set()
        if is_search:
            # Search hits live inside a dedicated wrapper div; links outside
            # it are navigation chrome.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                # Make site-relative links absolute.
                href = base + href
            if "abplive.com" in href and "javascript" not in href:
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch one article page and return its structured fields.

        Returns None on any failure (HTTP error, missing title/content,
        network problem) so callers can simply discard falsy results.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            # Numeric id at the end of the URL doubles as the article id.
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))
            if not content:
                return None
            # Fallbacks when the byline is absent; date is local "today".
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")  # single lookup (was looked up twice)
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt/
            # SystemExit; narrowed to Exception while keeping best-effort None.
            return None
class HindiScraper(BaseScraper):
    """Scraper for the Hindi edition (www.abplive.com)."""

    # Article URLs end in a numeric id of at least 6 digits.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        Accepts only URLs matching ``_ARTICLE_RE``; photo galleries and
        videos are excluded, and query strings are stripped.
        """
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            # Search hits live inside a dedicated wrapper div.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                # Make site-relative links absolute.
                href = base + href
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    links.add(href.split("?")[0])
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch one article page and return its structured fields.

        Returns None on any failure (HTTP error, missing title/content,
        network problem) so callers can simply discard falsy results.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # The Hindi site has gone through several layouts; try each known
            # content container in turn.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Prefer the <time datetime="..."> attribute, then the OpenGraph
            # meta tag, and finally fall back to local "today".
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    # Unparseable timestamp: keep its date prefix as-is.
                    date = raw[:10]
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt/
            # SystemExit; narrowed to Exception while keeping best-effort None.
            return None
def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper:
    """Instantiate the scraper implementation named by *lang_cfg*.

    Raises:
        ValueError: if ``lang_cfg.scraper_class_name`` is not registered.
    """
    registry = {
        "EnglishScraper": EnglishScraper,
        "HindiScraper": HindiScraper,
    }
    scraper_cls = registry.get(lang_cfg.scraper_class_name)
    if scraper_cls is None:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}")
    return scraper_cls(lang_cfg)
# ---------------------------------------------
# Public API
# ---------------------------------------------
def scrape_articles(language: str, target: str, is_search: bool = False, max_pages: int = 1) -> List[Dict]:
    """Scrape news articles for one language and return them as dicts.

    Args:
        language: 'english' or 'hindi' (case-insensitive).
        target: Category key (e.g. 'sports') or, with is_search, a query string.
        is_search: Interpret *target* as a search query instead of a category.
        max_pages: How many result pages to scan (relevant for search).

    Returns:
        A list of article dictionaries; empty on any lookup or scrape failure.
    """
    language = language.lower()
    lang_cfg = LANGUAGE_CONFIGS.get(language)
    if lang_cfg is None:
        logger.error(f"Unsupported language: {language}")
        return []
    scraper = get_scraper(lang_cfg)
    # Resolve the target into a display name and (for categories) a URL.
    if not is_search:
        target_key = target.lower()
        cat_info = lang_cfg.categories.get(target_key)
        if cat_info is None:
            logger.error(f"Unknown category '{target_key}' for {language}.")
            return []
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        logger.info(f"[{language.upper()}] Scraping category: '{category_name}'")
    else:
        category_name = target
        target_url = ""
        logger.info(f"[{language.upper()}] Searching: '{target}' | pages: {max_pages}")
    # Phase 1: discover article links.
    links = scraper.fetch_links(target_url, is_search=is_search, query=target if is_search else "", max_pages=max_pages)
    if not links:
        logger.warning(f"No article links found for {target}.")
        return []
    logger.success(f"Discovered {len(links)} unique article links.")
    # Phase 2: fetch and parse each article concurrently.
    results: List[Dict] = []
    with ThreadPoolExecutor(max_workers=config.SCRAPING_MAX_WORKERS) as pool:
        pending = {pool.submit(scraper.parse_article, article_link, category_name): article_link for article_link in links}
        for done in as_completed(pending):
            article = done.result()
            if article:
                results.append(article)
    if not results:
        logger.warning("Failed to extract content for any articles.")
    else:
        logger.success(f"Successfully extracted {len(results)} articles.")
    return results