# NOTE: GitHub capture residue (author/commit metadata) preserved as a comment
# so the module stays importable:
#   Devang1290 — "feat: deploy News Whisper on-demand search API (FastAPI + Docker)" (2cb327c)
"""
News Scraper Module - Multi-Language
Supports English (ABP Live EN) and Hindi (ABP Live HI)
Exposes `scrape_articles` as a clean, callable Python function.
"""
# Standard library
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import quote_plus

# Third-party
import requests
from bs4 import BeautifulSoup

# Ensure backend root is in PYTHONPATH so we can import core modules
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
# ─────────────────────────────────────────────
# Language Configuration
# ─────────────────────────────────────────────
@dataclass
class LanguageConfig:
    """Per-language scraping settings.

    Field order matches the original ``__init__`` parameter order, so both
    positional and keyword construction used elsewhere keep working. The
    dataclass additionally provides ``__repr__`` and ``__eq__`` for free.
    """

    base_url: str                          # site root, no trailing slash
    categories: Dict[str, Dict[str, str]]  # category key -> {"name": ..., "url": ...}
    search_url_tpl: str                    # search URL template with a {q} placeholder
    scraper_class_name: str                # class name resolved by get_scraper()
    output_subfolder: str                  # subfolder used when persisting results
# Base URL of the English edition (news.abplive.com).
_EN_BASE = "https://news.abplive.com"

# English site map: category key -> display name plus the listing URL to crawl.
ENGLISH_CONFIG = LanguageConfig(
    base_url=_EN_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_EN_BASE}/"},
        "business": {"name": "Business", "url": f"{_EN_BASE}/business"},
        "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"},
        "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"},
        "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"},
    },
    # {{q}} escapes the braces so .format(q=...) substitutes the query later.
    search_url_tpl=f"{_EN_BASE}/search?s={{q}}",
    scraper_class_name="EnglishScraper",
    output_subfolder="english",
)
# Base URL of the Hindi edition (www.abplive.com).
_HI_BASE = "https://www.abplive.com"

# Hindi site map: category key -> display name plus the listing URL to crawl.
HINDI_CONFIG = LanguageConfig(
    base_url=_HI_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_HI_BASE}/news"},
        "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"},
        "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"},
        "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"},
        "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"},
        "business": {"name": "Business", "url": f"{_HI_BASE}/business"},
        "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"},
        "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"},
    },
    # {{q}} escapes the braces so .format(q=...) substitutes the query later.
    search_url_tpl=f"{_HI_BASE}/search?s={{q}}",
    scraper_class_name="HindiScraper",
    output_subfolder="hindi",
)
# Registry of supported languages; the keys are exactly the values accepted
# by scrape_articles(language=...).
LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = {
    "english": ENGLISH_CONFIG,
    "hindi": HINDI_CONFIG,
}
# ─────────────────────────────────────────────
# Shared Utilities
# ─────────────────────────────────────────────
# Browser-like User-Agent sent with every request so the site is less likely
# to reject us as a bot.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# ─────────────────────────────────────────────
# Scrapers
# ─────────────────────────────────────────────
class BaseScraper:
    """Shared scraping machinery for the language-specific scrapers.

    Subclasses implement `_extract_links` (pull article URLs out of a parsed
    page) and `parse_article` (turn one article URL into a result dict).
    """

    def __init__(self, lang_cfg: LanguageConfig):
        self.lang_cfg = lang_cfg
        # Browser-like headers reused for every request this scraper makes.
        self.headers = {"User-Agent": USER_AGENT}

    def _build_search_page_url(self, encoded_query: str, page: int) -> str:
        """Return the search-results URL for the given (already-encoded) query and page."""
        first_page = self.lang_cfg.search_url_tpl.format(q=encoded_query)
        if page <= 1:
            return first_page
        # ABP paginates searches as /search/page-N?...; if the template has no
        # "/search?" segment, fall back to a generic ?paged=N query parameter.
        candidate = first_page.replace("/search?", f"/search/page-{page}?", 1)
        if candidate != first_page:
            return candidate
        joiner = "&" if "?" in first_page else "?"
        return f"{first_page}{joiner}paged={page}"

    def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]:
        """Discover candidate article links from a category page or search results.

        Returns a (possibly empty) set of absolute article URLs. Per-page
        failures are logged and skipped rather than raised.
        """
        collected: Set[str] = set()
        if is_search:
            if not self.lang_cfg.search_url_tpl:
                logger.error("Search is not supported for this language.")
                return collected
            encoded = quote_plus(query)
            page_total = max(1, max_pages)
            pages = [self._build_search_page_url(encoded, n) for n in range(1, page_total + 1)]
        else:
            pages = [url]
        logger.info(f"Scanning {len(pages)} source page(s)…")
        for page_no, page_url in enumerate(pages, start=1):
            try:
                response = requests.get(page_url, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
                if response.status_code != 200:
                    logger.warning(f"HTTP {response.status_code} for page {page_no}")
                    continue
                page_soup = BeautifulSoup(response.text, "html.parser")
                found = self._extract_links(page_soup, page_url, is_search=is_search)
                collected.update(found)
                logger.success(f"Extracted {len(found)} links from page {page_no}")
                # An empty search page means we ran past the last result page.
                if is_search and not found:
                    logger.info(f"No search results found on page {page_no}; stopping pagination early.")
                    break
            except requests.Timeout:
                logger.error(f"Timeout on page {page_no}")
            except Exception as exc:
                logger.warning(f"Error on page {page_no}: {str(exc)[:80]}")
        return collected

    def _extract_links(self, soup: BeautifulSoup, src_url: str, is_search: bool = False) -> Set[str]:
        """Subclass hook: extract article URLs from a parsed page."""
        raise NotImplementedError

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Subclass hook: download and parse one article into a dict."""
        raise NotImplementedError
class EnglishScraper(BaseScraper):
    """Scraper for ABP Live English (news.abplive.com)."""

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        English article URLs end in a numeric id (…-12345) or '.html'.
        """
        links = set()
        if is_search:
            # Search results are confined to a dedicated wrapper div.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                href = base + href  # make site-relative links absolute
            if "abplive.com" in href and "javascript" not in href:
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article and return its fields, or None on any failure.

        Best-effort by design: network errors, unexpected markup, and parse
        failures all yield None so one bad article never aborts a batch.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            # The trailing numeric URL segment doubles as the article id.
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            # Content container class varies across page templates.
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))
            if not content:
                return None
            # Defaults used when the page omits a byline or timestamp.
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")  # single lookup (was called twice)
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception as e:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; still best-effort, but now logged.
            logger.warning(f"Failed to parse English article {link}: {str(e)[:80]}")
            return None
class HindiScraper(BaseScraper):
    """Scraper for ABP Live Hindi (www.abplive.com)."""

    # Hindi article URLs end in a long (6+ digit) numeric id.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect article URLs, skipping photo galleries and video pages."""
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            # Search results are confined to a dedicated wrapper div.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                href = base + href  # make site-relative links absolute
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    links.add(href.split("?")[0])  # drop query string for dedup
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article and return its fields, or None on any failure.

        Best-effort by design: network errors, unexpected markup, and parse
        failures all yield None so one bad article never aborts a batch.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # Content container class/id varies across page templates.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            # Author: prefer the <h3> in the byline box, then its first link.
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Date: prefer <time datetime=…>, then the article:published_time
            # meta tag, else fall back to today's date.
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    date = raw[:10]  # keep the YYYY-MM-DD prefix of odd formats
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception as e:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; still best-effort, but now logged.
            logger.warning(f"Failed to parse Hindi article {link}: {str(e)[:80]}")
            return None
def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper:
    """Instantiate the scraper class named by *lang_cfg*.

    Raises:
        ValueError: if the configured class name is not registered here.
    """
    registry = {
        "EnglishScraper": EnglishScraper,
        "HindiScraper": HindiScraper,
    }
    scraper_cls = registry.get(lang_cfg.scraper_class_name)
    if scraper_cls is None:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}")
    return scraper_cls(lang_cfg)
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def scrape_articles(language: str, target: str, is_search: bool = False, max_pages: int = 1) -> List[Dict]:
    """
    Scrapes news articles from the supported languages and returns them as a list of dictionaries.

    Args:
        language: 'english' or 'hindi' (case-insensitive).
        target: The category key (e.g., 'sports') or search query string.
        is_search: True if target is a query string, False if it's a category.
        max_pages: Number of pages to scrape (useful for search).

    Returns:
        A list of dictionary objects representing the scraped articles.
        Empty on unknown language/category or when nothing could be scraped.
    """
    language = language.lower()
    if language not in LANGUAGE_CONFIGS:
        logger.error(f"Unsupported language: {language}")
        return []
    lang_cfg = LANGUAGE_CONFIGS[language]
    scraper = get_scraper(lang_cfg)
    if is_search:
        # In search mode the query itself labels the resulting articles.
        category_name = target
        target_url = ""
        logger.info(f"[{language.upper()}] Searching: '{target}' | pages: {max_pages}")
    else:
        target_key = target.lower()
        if target_key not in lang_cfg.categories:
            logger.error(f"Unknown category '{target_key}' for {language}.")
            return []
        cat_info = lang_cfg.categories[target_key]
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        logger.info(f"[{language.upper()}] Scraping category: '{category_name}'")
    # Phase 1: Link Discovery
    links = scraper.fetch_links(target_url, is_search=is_search, query=target if is_search else "", max_pages=max_pages)
    if not links:
        logger.warning(f"No article links found for {target}.")
        return []
    logger.success(f"Discovered {len(links)} unique article links.")
    # Phase 2: Content Extraction (parallel, one worker task per link)
    results = []
    with ThreadPoolExecutor(max_workers=config.SCRAPING_MAX_WORKERS) as executor:
        futures = {executor.submit(scraper.parse_article, link, category_name): link for link in links}
        for future in as_completed(futures):
            try:
                data = future.result()
            except Exception as e:
                # Fix: previously an unguarded .result() meant one raising
                # worker aborted the loop and discarded all collected results.
                logger.warning(f"Worker failed for {futures[future]}: {str(e)[:80]}")
                continue
            if data:
                results.append(data)
    if results:
        logger.success(f"Successfully extracted {len(results)} articles.")
    else:
        logger.warning("Failed to extract content for any articles.")
    return results