# NOTE: GitHub capture residue (author/commit metadata) preserved as a comment
# so the module stays importable:
#   Devang1290 — "feat: deploy News Whisper on-demand search API (FastAPI + Docker)" (2cb327c)
"""
News Scraper Module - Multi-Language
Supports English (ABP Live EN) and Hindi (ABP Live HI)
Exposes `scrape_articles` as a clean, callable Python function.
"""
# Standard library
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import quote_plus

# Third-party
import requests
from bs4 import BeautifulSoup

# Ensure backend root is in PYTHONPATH so we can import core modules
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
# ─────────────────────────────────────────────
# Language Configuration
# ─────────────────────────────────────────────
@dataclass
class LanguageConfig:
    """Per-language scraping settings.

    Field order matches the original ``__init__`` parameter order, so both
    positional and keyword construction used elsewhere keep working. The
    dataclass additionally provides ``__repr__`` and ``__eq__`` for free.
    """

    base_url: str                          # site root, no trailing slash
    categories: Dict[str, Dict[str, str]]  # category key -> {"name": ..., "url": ...}
    search_url_tpl: str                    # search URL template with a {q} placeholder
    scraper_class_name: str                # class name resolved by get_scraper()
    output_subfolder: str                  # subfolder used when persisting results
# Base URL of the English edition (news.abplive.com).
_EN_BASE = "https://news.abplive.com"

# English site map: category key -> display name plus the listing URL to crawl.
ENGLISH_CONFIG = LanguageConfig(
    base_url=_EN_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_EN_BASE}/"},
        "business": {"name": "Business", "url": f"{_EN_BASE}/business"},
        "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"},
        "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"},
        "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"},
    },
    # {{q}} escapes the braces so .format(q=...) substitutes the query later.
    search_url_tpl=f"{_EN_BASE}/search?s={{q}}",
    scraper_class_name="EnglishScraper",
    output_subfolder="english",
)
# Base URL of the Hindi edition (www.abplive.com).
_HI_BASE = "https://www.abplive.com"

# Hindi site map: category key -> display name plus the listing URL to crawl.
HINDI_CONFIG = LanguageConfig(
    base_url=_HI_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_HI_BASE}/news"},
        "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"},
        "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"},
        "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"},
        "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"},
        "business": {"name": "Business", "url": f"{_HI_BASE}/business"},
        "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"},
        "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"},
    },
    # {{q}} escapes the braces so .format(q=...) substitutes the query later.
    search_url_tpl=f"{_HI_BASE}/search?s={{q}}",
    scraper_class_name="HindiScraper",
    output_subfolder="hindi",
)
# Registry of supported languages; the keys are exactly the values accepted
# by scrape_articles(language=...).
LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = {
    "english": ENGLISH_CONFIG,
    "hindi": HINDI_CONFIG,
}
# ─────────────────────────────────────────────
# Shared Utilities
# ─────────────────────────────────────────────
# Browser-like User-Agent sent with every request so the site is less likely
# to reject us as a bot.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# ─────────────────────────────────────────────
# Scrapers
# ─────────────────────────────────────────────
class BaseScraper:
    """Shared scraping machinery for the language-specific scrapers.

    Subclasses implement `_extract_links` (pull article URLs out of a parsed
    page) and `parse_article` (turn one article URL into a result dict).
    """

    def __init__(self, lang_cfg: LanguageConfig):
        self.lang_cfg = lang_cfg
        # Browser-like headers reused for every request this scraper makes.
        self.headers = {"User-Agent": USER_AGENT}

    def _build_search_page_url(self, encoded_query: str, page: int) -> str:
        """Return the search-results URL for the given (already-encoded) query and page."""
        first_page = self.lang_cfg.search_url_tpl.format(q=encoded_query)
        if page <= 1:
            return first_page
        # ABP paginates searches as /search/page-N?...; if the template has no
        # "/search?" segment, fall back to a generic ?paged=N query parameter.
        candidate = first_page.replace("/search?", f"/search/page-{page}?", 1)
        if candidate != first_page:
            return candidate
        joiner = "&" if "?" in first_page else "?"
        return f"{first_page}{joiner}paged={page}"

    def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]:
        """Discover candidate article links from a category page or search results.

        Returns a (possibly empty) set of absolute article URLs. Per-page
        failures are logged and skipped rather than raised.
        """
        collected: Set[str] = set()
        if is_search:
            if not self.lang_cfg.search_url_tpl:
                logger.error("Search is not supported for this language.")
                return collected
            encoded = quote_plus(query)
            page_total = max(1, max_pages)
            pages = [self._build_search_page_url(encoded, n) for n in range(1, page_total + 1)]
        else:
            pages = [url]
        logger.info(f"Scanning {len(pages)} source page(s)…")
        for page_no, page_url in enumerate(pages, start=1):
            try:
                response = requests.get(page_url, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
                if response.status_code != 200:
                    logger.warning(f"HTTP {response.status_code} for page {page_no}")
                    continue
                page_soup = BeautifulSoup(response.text, "html.parser")
                found = self._extract_links(page_soup, page_url, is_search=is_search)
                collected.update(found)
                logger.success(f"Extracted {len(found)} links from page {page_no}")
                # An empty search page means we ran past the last result page.
                if is_search and not found:
                    logger.info(f"No search results found on page {page_no}; stopping pagination early.")
                    break
            except requests.Timeout:
                logger.error(f"Timeout on page {page_no}")
            except Exception as exc:
                logger.warning(f"Error on page {page_no}: {str(exc)[:80]}")
        return collected

    def _extract_links(self, soup: BeautifulSoup, src_url: str, is_search: bool = False) -> Set[str]:
        """Subclass hook: extract article URLs from a parsed page."""
        raise NotImplementedError

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Subclass hook: download and parse one article into a dict."""
        raise NotImplementedError
class EnglishScraper(BaseScraper):
    """Scraper for ABP Live English (news.abplive.com)."""

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        English article URLs end in a numeric id (…-12345) or '.html'.
        """
        links = set()
        if is_search:
            # Search results are confined to a dedicated wrapper div.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                href = base + href  # make site-relative links absolute
            if "abplive.com" in href and "javascript" not in href:
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article and return its fields, or None on any failure.

        Best-effort by design: network errors, unexpected markup, and parse
        failures all yield None so one bad article never aborts a batch.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            # The trailing numeric URL segment doubles as the article id.
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            # Content container class varies across page templates.
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))
            if not content:
                return None
            # Defaults used when the page omits a byline or timestamp.
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")  # single lookup (was called twice)
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception as e:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; still best-effort, but now logged.
            logger.warning(f"Failed to parse English article {link}: {str(e)[:80]}")
            return None
class HindiScraper(BaseScraper):
    """Scraper for ABP Live Hindi (www.abplive.com)."""

    # Hindi article URLs end in a long (6+ digit) numeric id.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect article URLs, skipping photo galleries and video pages."""
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            # Search results are confined to a dedicated wrapper div.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                href = base + href  # make site-relative links absolute
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    links.add(href.split("?")[0])  # drop query string for dedup
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article and return its fields, or None on any failure.

        Best-effort by design: network errors, unexpected markup, and parse
        failures all yield None so one bad article never aborts a batch.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # Content container class/id varies across page templates.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            # Author: prefer the <h3> in the byline box, then its first link.
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Date: prefer <time datetime=…>, then the article:published_time
            # meta tag, else fall back to today's date.
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    date = raw[:10]  # keep the YYYY-MM-DD prefix of odd formats
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception as e:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; still best-effort, but now logged.
            logger.warning(f"Failed to parse Hindi article {link}: {str(e)[:80]}")
            return None
def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper:
    """Instantiate the scraper class named by *lang_cfg*.

    Raises:
        ValueError: if the configured class name is not registered here.
    """
    registry = {
        "EnglishScraper": EnglishScraper,
        "HindiScraper": HindiScraper,
    }
    scraper_cls = registry.get(lang_cfg.scraper_class_name)
    if scraper_cls is None:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}")
    return scraper_cls(lang_cfg)
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def scrape_articles(language: str, target: str, is_search: bool = False, max_pages: int = 1) -> List[Dict]:
    """
    Scrapes news articles from the supported languages and returns them as a list of dictionaries.

    Args:
        language: 'english' or 'hindi' (case-insensitive).
        target: The category key (e.g., 'sports') or search query string.
        is_search: True if target is a query string, False if it's a category.
        max_pages: Number of pages to scrape (useful for search).

    Returns:
        A list of dictionary objects representing the scraped articles.
        Empty on unknown language/category or when nothing could be scraped.
    """
    language = language.lower()
    if language not in LANGUAGE_CONFIGS:
        logger.error(f"Unsupported language: {language}")
        return []
    lang_cfg = LANGUAGE_CONFIGS[language]
    scraper = get_scraper(lang_cfg)
    if is_search:
        # In search mode the query itself labels the resulting articles.
        category_name = target
        target_url = ""
        logger.info(f"[{language.upper()}] Searching: '{target}' | pages: {max_pages}")
    else:
        target_key = target.lower()
        if target_key not in lang_cfg.categories:
            logger.error(f"Unknown category '{target_key}' for {language}.")
            return []
        cat_info = lang_cfg.categories[target_key]
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        logger.info(f"[{language.upper()}] Scraping category: '{category_name}'")
    # Phase 1: Link Discovery
    links = scraper.fetch_links(target_url, is_search=is_search, query=target if is_search else "", max_pages=max_pages)
    if not links:
        logger.warning(f"No article links found for {target}.")
        return []
    logger.success(f"Discovered {len(links)} unique article links.")
    # Phase 2: Content Extraction (parallel, one worker task per link)
    results = []
    with ThreadPoolExecutor(max_workers=config.SCRAPING_MAX_WORKERS) as executor:
        futures = {executor.submit(scraper.parse_article, link, category_name): link for link in links}
        for future in as_completed(futures):
            try:
                data = future.result()
            except Exception as e:
                # Fix: previously an unguarded .result() meant one raising
                # worker aborted the loop and discarded all collected results.
                logger.warning(f"Worker failed for {futures[future]}: {str(e)[:80]}")
                continue
            if data:
                results.append(data)
    if results:
        logger.success(f"Successfully extracted {len(results)} articles.")
    else:
        logger.warning("Failed to extract content for any articles.")
    return results