# Devang1290 — feat: deploy News Whisper on-demand search API (FastAPI + Docker) — commit 2cb327c
"""
News Scraper β€” Multi-Language CLI Tool
=======================================
Scrapes news articles from ABP Live for English and Hindi.
Supports category-based scraping and keyword search with pagination.
Sources:
--english β†’ news.abplive.com (ABP Live English)
--hindi β†’ www.abplive.com (ABP Live Hindi)
Architecture:
LanguageConfig β€” holds URLs, categories, and output paths per language
BaseScraper β€” shared logic: link fetching, search pagination
EnglishScraper β€” extracts articles from English ABP Live
HindiScraper β€” extracts articles from Hindi ABP Live
Output:
articles/{language}/categories/{category}/{timestamp}.json
articles/{language}/search_queries/{query}/{timestamp}.json
Usage:
python backend/web_scraping/news_scrape.py --english --list
python backend/web_scraping/news_scrape.py --english --category top
python backend/web_scraping/news_scrape.py --english --search "climate change"
python backend/web_scraping/news_scrape.py --hindi --list
python backend/web_scraping/news_scrape.py --hindi --category sports
python backend/web_scraping/news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯‡"
python backend/web_scraping/news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯‡" --pages 3
Adding a new language:
1. Create a new LanguageConfig entry in LANGUAGE_CONFIGS dict
2. Create a Scraper subclass (override _extract_links and parse_article)
3. Register it in _SCRAPER_CLASSES
That's it β€” CLI, file management, and upload logic are fully reused.
"""
import requests
from bs4 import BeautifulSoup
import json
import re
import sys
# Force UTF-8 stdout so Devanagari (Hindi) output does not raise
# UnicodeEncodeError on consoles with a legacy default encoding.
sys.stdout.reconfigure(encoding='utf-8')
import os
import time
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Set, Dict, Optional
from pathlib import Path
from urllib.parse import quote_plus
from dotenv import load_dotenv
# Import shared utilities from backend.common and backend.utils.
# The repo root (three levels up) is appended to sys.path so `backend.*`
# resolves when this file is run directly as a script.
sys.path.append(str(Path(__file__).parent.parent.parent))
from backend.common.colors import Colors, Log
from backend.common.paths import get_project_root, sanitize_query_folder
from backend.utils.cloudinary_utils import upload_to_cloudinary
# Load .env early so the os.getenv lookups below see SCRAPING_* overrides
# (presumably Cloudinary credentials too — confirm in cloudinary_utils).
load_dotenv()
# ─────────────────────────────────────────────
# Language Configuration
# ─────────────────────────────────────────────
class LanguageConfig:
    """Everything language-specific that the scraping pipeline needs.

    Attributes:
        base_url: Root URL of the news website.
        categories: Mapping {key: {"name": ..., "url": ...}} per category.
        search_url_tpl: Format string for search URLs ({q} placeholder);
            None means search is unsupported for this language.
        scraper_class_name: Name of the Scraper subclass to instantiate.
        output_subfolder: Language id used in output paths ("english"/"hindi").
    """
    def __init__(self, base_url, categories, search_url_tpl, scraper_class_name, output_subfolder):
        # Plain attribute holder; no validation is performed here.
        self.output_subfolder = output_subfolder
        self.scraper_class_name = scraper_class_name
        self.search_url_tpl = search_url_tpl
        self.categories = categories
        self.base_url = base_url
# ── English (news.abplive.com) ────────────────────────────────────────────────
_EN_BASE = "https://news.abplive.com"
# Category keys are what users type after --category; each value carries the
# display name and the listing URL that will be scanned for article links.
ENGLISH_CONFIG = LanguageConfig(
    base_url=_EN_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_EN_BASE}/"},
        "business": {"name": "Business", "url": f"{_EN_BASE}/business"},
        "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"},
        "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"},
        "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"},
    },
    # {q} is substituted with the URL-encoded query by BaseScraper.
    search_url_tpl=f"{_EN_BASE}/search?s={{q}}",
    scraper_class_name="EnglishScraper",
    output_subfolder="english",
)
# ── Hindi (www.abplive.com) ────────────────────────────────────────────────────
_HI_BASE = "https://www.abplive.com"
# Same structure as ENGLISH_CONFIG; Hindi has a wider category set.
HINDI_CONFIG = LanguageConfig(
    base_url=_HI_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_HI_BASE}/news"},
        "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"},
        "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"},
        "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"},
        "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"},
        "business": {"name": "Business", "url": f"{_HI_BASE}/business"},
        "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"},
        "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"},
    },
    # {q} is substituted with the URL-encoded query by BaseScraper.
    search_url_tpl=f"{_HI_BASE}/search?s={{q}}",
    scraper_class_name="HindiScraper",
    output_subfolder="hindi",
)
# ── Registry of all supported languages ────────────────────────────────────────
# Keys double as CLI flags: parse_args() strips leading dashes, so
# "--english" / "--hindi" select entries in this dict.
LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = {
    "english": ENGLISH_CONFIG,
    "hindi": HINDI_CONFIG,
}
# ─────────────────────────────────────────────
# Shared Utilities
# ─────────────────────────────────────────────
# Browser-style User-Agent sent with every HTTP request.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# Thread-pool size for parallel article fetches (env-overridable).
MAX_WORKERS = int(os.getenv('SCRAPING_MAX_WORKERS', '10'))
# Per-request timeout in seconds (env-overridable).
TIMEOUT = int(os.getenv('SCRAPING_TIMEOUT', '30'))
class FileManager:
    """Creates output directories and timestamped filenames for scraped data."""

    @staticmethod
    def get_articles_dir(lang_subfolder: str) -> Path:
        """Return <project_root>/articles/<lang_subfolder>/ (not created here)."""
        return get_project_root() / "articles" / lang_subfolder

    @staticmethod
    def ensure_category_dir(category_name: str, lang_subfolder: str) -> Path:
        """Create (if needed) and return the per-category output directory."""
        folder = category_name.lower().replace(" ", "_")
        target = FileManager.get_articles_dir(lang_subfolder) / "categories" / folder
        target.mkdir(parents=True, exist_ok=True)
        return target

    @staticmethod
    def ensure_search_dir(query: str, lang_subfolder: str) -> Path:
        """Create (if needed) and return the per-query output directory."""
        target = (
            FileManager.get_articles_dir(lang_subfolder)
            / "search_queries"
            / sanitize_query_folder(query)
        )
        target.mkdir(parents=True, exist_ok=True)
        return target

    @staticmethod
    def generate_filename() -> str:
        """Return e.g. '7_mar_3_05_pm.json' built from the current local time."""
        now = datetime.now()
        meridiem = "pm" if now.hour >= 12 else "am"
        # 12-hour clock: 0 -> 12am, 12 -> 12pm, 13 -> 1pm.
        hour_12 = now.hour % 12 or 12
        return f"{now.day}_{now.strftime('%b').lower()}_{hour_12}_{now.strftime('%M')}_{meridiem}.json"
class ProgressBar:
    """ASCII progress bar with ETA, redrawn in place via carriage return.

    Tracks success/failure counts alongside overall progress.
    """

    def __init__(self, total: int, width: int = 40):
        """total: expected number of update() calls; width: bar width in chars."""
        self.total = total
        self.width = width
        self.current = 0
        self.success_count = 0
        self.fail_count = 0
        self.start_time = time.time()

    def update(self, success: bool = True):
        """Record one completed item and redraw the bar."""
        self.current += 1
        if success:
            self.success_count += 1
        else:
            self.fail_count += 1
        self._render()

    def _render(self):
        # Guard the denominator: total == 0 previously raised
        # ZeroDivisionError on the first update().
        denom = self.total or 1
        percent = (self.current / denom) * 100
        filled = int(self.width * self.current / denom)
        bar = '=' * filled + '-' * (self.width - filled)
        elapsed = time.time() - self.start_time
        if self.current > 0:
            # Linear extrapolation from average time per item so far.
            eta = (elapsed / self.current) * (self.total - self.current)
            eta_str = f"{int(eta)}s" if eta < 60 else f"{int(eta/60)}m {int(eta%60)}s"
        else:
            eta_str = "calculating..."
        sys.stdout.write(
            f"\r[{bar}] {self.current}/{self.total} ({percent:.1f}%) | "
            f"V {self.success_count} X {self.fail_count} | ETA: {eta_str}"
        )
        sys.stdout.flush()

    def finish(self):
        """Print total elapsed time on a fresh line."""
        elapsed = time.time() - self.start_time
        print(f"\n{Colors.DIM}Completed in {elapsed:.2f}s{Colors.RESET}")
class Statistics:
    """Aggregate reporting for a completed scraping run."""

    @staticmethod
    def generate_summary(results: List[Dict]):
        """Print article/author/word totals; silently no-ops on empty input."""
        if not results:
            return
        print(f"\n{Colors.BOLD}{Colors.CYAN}Scraping Summary{Colors.RESET}\n")
        print(f"Total Articles : {len(results)}")
        author_set = {article.get('author', 'Unknown') for article in results}
        word_total = sum(len(article.get('content', '').split()) for article in results)
        mean_words = word_total // len(results)
        print(f"Unique Authors : {len(author_set)}")
        print(f"Total Words : {word_total:,}")
        print(f"Avg Words/Art : {mean_words}\n")
# ─────────────────────────────────────────────
# Scraper base + language-specific subclasses
# ─────────────────────────────────────────────
class BaseScraper:
    """Language-agnostic scraping engine.

    Subclasses must implement:
        _extract_links(soup, src_url, is_search) -> Set[str]
        parse_article(link, category) -> Optional[Dict]
    """

    def __init__(self, lang_cfg: LanguageConfig):
        self.lang_cfg = lang_cfg
        self.headers = {"User-Agent": USER_AGENT}

    def _build_search_page_url(self, encoded_query: str, page: int) -> str:
        """Return the search-results URL for *page* (1-based)."""
        first_page = self.lang_cfg.search_url_tpl.format(q=encoded_query)
        if page <= 1:
            return first_page
        # ABP serves deeper results at /search/page-N?; if the template does
        # not contain "/search?", fall back to a generic ?paged=N parameter.
        paged = first_page.replace("/search?", f"/search/page-{page}?", 1)
        if paged != first_page:
            return paged
        sep = "&" if "?" in first_page else "?"
        return f"{first_page}{sep}paged={page}"

    def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]:
        """Collect candidate article URLs from one category page or N search pages.

        Per-page failures are logged and skipped; search pagination stops early
        on the first page that yields no results.
        """
        collected: Set[str] = set()
        if is_search:
            if not self.lang_cfg.search_url_tpl:
                Log.error("Search is not supported for this language.")
                return collected
            encoded = quote_plus(query)
            page_count = max(1, max_pages)
            sources = [self._build_search_page_url(encoded, p) for p in range(1, page_count + 1)]
        else:
            sources = [url]
        Log.info(f"Scanning {len(sources)} source page(s)…")
        for page_no, page_url in enumerate(sources, start=1):
            try:
                response = requests.get(page_url, headers=self.headers, timeout=TIMEOUT)
                if response.status_code != 200:
                    Log.warning(f"HTTP {response.status_code} for page {page_no}")
                    continue
                page_soup = BeautifulSoup(response.text, "html.parser")
                found = self._extract_links(page_soup, page_url, is_search=is_search)
                collected |= found
                Log.success(f"Extracted {len(found)} links from page {page_no}")
                if is_search and not found:
                    Log.info(f"No search results found on page {page_no}; stopping pagination early.")
                    break
            except requests.Timeout:
                Log.error(f"Timeout on page {page_no}")
            except Exception as exc:
                Log.warning(f"Error on page {page_no}: {str(exc)[:80]}")
        return collected

    def _extract_links(self, soup: BeautifulSoup, src_url: str, is_search: bool = False) -> Set[str]:
        """Subclass hook: pull article URLs out of a parsed listing page."""
        raise NotImplementedError

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Subclass hook: fetch and parse one article URL."""
        raise NotImplementedError
class EnglishScraper(BaseScraper):
    """Scrapes news.abplive.com (English edition)."""

    def _extract_links(self, soup, src_url, is_search=False):
        """Return absolute article URLs found on a category or search page.

        Search results are confined to div.search-cat-wrap; category pages
        are scanned whole. Candidate links end in a numeric id or '.html'.
        """
        links = set()
        if is_search:
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                # Make site-relative links absolute.
                href = base + href
            if "abplive.com" in href and "javascript" not in href:
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch one article URL and extract structured fields.

        Returns None on any network/parse failure so the thread pool can
        simply skip this link.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            # The trailing numeric slug doubles as the article id.
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))
            if not content:
                return None
            # Fallbacks used when the byline block is missing.
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")  # look up once instead of twice
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit and made Ctrl+C unreliable mid thread-pool. Narrowed
            # to Exception; failures still yield None (best-effort per link).
            return None
class HindiScraper(BaseScraper):
    """Scrapes www.abplive.com (Hindi edition).

    Article link pattern: ends with a numeric id of 6+ digits, e.g. ...-3094660
    Title: <h1> tag
    Content: <div class="abp-story-detail"> or fallback containers — all <p> tags
    Author: <div class="auth-detail"> -> <h3> or first <a>
    Date: <time> tag (datetime attr) or meta article:published_time
    """

    # Requires 6+ trailing digits so nav/category URLs don't match.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+\-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Return absolute article URLs, excluding photo galleries and videos."""
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                href = base + href
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    # Drop query strings so duplicates collapse in the set.
                    links.add(href.split("?")[0])
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch one article URL and extract structured fields (None on failure)."""
        try:
            res = requests.get(link, headers=self.headers, timeout=TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # Content container varies across page templates; try each in turn.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Publication date: prefer <time datetime=...>, then the meta tag,
            # finally fall back to today's date.
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    # Non-ISO timestamp: keep the leading YYYY-MM-DD slice.
                    date = raw[:10]
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # propagate instead of being swallowed inside the thread pool.
            return None
# Scraper factory
# Maps LanguageConfig.scraper_class_name strings to the concrete classes;
# get_scraper() resolves through this dict.
_SCRAPER_CLASSES = {
    "EnglishScraper": EnglishScraper,
    "HindiScraper": HindiScraper,
}
def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper:
    """Instantiate the scraper class registered for *lang_cfg*.

    Raises:
        ValueError: if the config names a class missing from _SCRAPER_CLASSES.
    """
    scraper_cls = _SCRAPER_CLASSES.get(lang_cfg.scraper_class_name)
    if scraper_cls is None:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}")
    return scraper_cls(lang_cfg)
# ─────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────
def show_usage():
    """Print CLI usage, examples, and notes for every registered language."""
    # Build the "--english | --hindi" choices string from the registry so new
    # languages appear automatically.
    langs = " | ".join(f"--{l}" for l in LANGUAGE_CONFIGS)
    # NOTE(review): the banner below is runtime output and is left verbatim,
    # including the "β€’"/"ΰ€ͺΰ₯ΰ€£ΰ₯‡" sequences that look mojibake-encoded —
    # confirm the intended glyphs ("•" and a Devanagari query) with the author.
    print(f"""
{Colors.BOLD}{Colors.CYAN}News Scraper (Multi-Language){Colors.RESET}
{Colors.BOLD}Usage:{Colors.RESET}
python news_scrape.py <{langs}> --list
python news_scrape.py <{langs}> --category <name>
python news_scrape.py --english --search "query"
python news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯‡"
python news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯‡" --pages 3
{Colors.BOLD}Examples:{Colors.RESET}
python news_scrape.py --english --list
python news_scrape.py --english --category sports
python news_scrape.py --english --search "climate change"
python news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯‡"
python news_scrape.py --english --search "pune" --pages 3
python news_scrape.py --hindi --list
python news_scrape.py --hindi --category sports
python news_scrape.py --hindi --category politics
{Colors.BOLD}Notes:{Colors.RESET}
β€’ --search is available for both --english and --hindi
β€’ --pages / --page applies to search only and defaults to 1
β€’ Hindi output β†’ articles/hindi/categories/…
β€’ English output β†’ articles/english/categories/…
""")
def list_categories(lang_cfg: LanguageConfig, lang_label: str):
    """Print every category key and its display name for one language."""
    print(f"\n{Colors.BOLD}{Colors.CYAN}Available Categories [{lang_label}]:{Colors.RESET}\n")
    for cat_key, meta in lang_cfg.categories.items():
        # Left-pad the key to 15 chars so display names line up.
        print(f" {Colors.GREEN}β€’{Colors.RESET} {Colors.BOLD}{cat_key:15}{Colors.RESET} {meta['name']}")
    print()
def parse_args():
    """Parse sys.argv into (lang_cfg, mode, category_key, query, pages).

    Returns:
        lang_cfg: LanguageConfig, or None on a fatal error (caller exits).
        mode: "list" | "category" | "search" | None (None => caller shows usage).
        category_key: key into lang_cfg.categories when mode == "category".
        query: search string when mode == "search".
        pages: search pagination depth, always >= 1.
    """
    args = sys.argv[1:]
    if not args:
        # Fix: previously returned silently and main() exited with no output;
        # show usage so a bare invocation is self-explanatory.
        show_usage()
        return None, None, None, None, 1
    # ── Language flag (exactly one required) ──
    lang_cfg = None
    lang_label = None
    remaining = []
    for arg in args:
        key = arg.lstrip("-").lower()
        if key in LANGUAGE_CONFIGS:
            if lang_cfg is not None:
                Log.error("Please specify only one language flag.")
                return None, None, None, None, 1
            lang_cfg = LANGUAGE_CONFIGS[key]
            lang_label = key
        else:
            remaining.append(arg)
    if lang_cfg is None:
        Log.error("You must specify a language: " + " | ".join(f"--{l}" for l in LANGUAGE_CONFIGS))
        show_usage()
        return None, None, None, None, 1
    if not remaining:
        return lang_cfg, None, None, None, 1
    # ── Extract --pages N (search-only option) from the remaining args ──
    pages = 1
    filtered = []
    i = 0
    while i < len(remaining):
        cmd = remaining[i].lower()
        if cmd in ("--pages", "--page"):
            if i + 1 >= len(remaining):
                Log.error("Page count required after --pages")
                return lang_cfg, None, None, None, 1
            try:
                pages = int(remaining[i + 1])
            except ValueError:
                Log.error("Page count must be an integer")
                return lang_cfg, None, None, None, 1
            if pages < 1:
                Log.error("Page count must be at least 1")
                return lang_cfg, None, None, None, 1
            i += 2
            continue
        filtered.append(remaining[i])
        i += 1
    remaining = filtered
    if not remaining:
        # Fix: e.g. `--english --pages 3` used to crash with IndexError on
        # remaining[0]; treat it as "no command" so main() shows usage instead.
        Log.error("--pages must be combined with --search")
        return lang_cfg, None, None, None, pages
    cmd = remaining[0].lower()
    if cmd == "--list":
        return lang_cfg, "list", None, None, pages
    if cmd in ("--category", "-c"):
        if len(remaining) < 2:
            Log.error("Category name required after --category")
            return lang_cfg, None, None, None, pages
        cat_key = remaining[1].lower()
        if cat_key not in lang_cfg.categories:
            Log.error(f"Unknown category '{cat_key}' for {lang_label}")
            Log.info(f"Run: python news_scrape.py --{lang_label} --list")
            return lang_cfg, None, None, None, pages
        return lang_cfg, "category", cat_key, None, pages
    if cmd in ("--search", "-s"):
        if lang_cfg.search_url_tpl is None:
            Log.error(f"--search is not supported for --{lang_label}")
            return lang_cfg, None, None, None, pages
        if len(remaining) < 2:
            Log.error("Search query required after --search")
            return lang_cfg, None, None, None, pages
        query = " ".join(remaining[1:])
        return lang_cfg, "search", None, query, pages
    Log.error(f"Unknown flag: {remaining[0]}")
    # Fix: show_usage() was also called here, and main() calls it again when
    # mode is None — the usage text printed twice. main() alone handles it now.
    return lang_cfg, None, None, None, pages
def main():
    """CLI entry point: parse args, scrape, summarize, save, and upload.

    Exit codes: 0 for success/benign early exits, 1 when no links are found.
    """
    lang_cfg, mode, category_key, query, pages = parse_args()
    if lang_cfg is None:
        # Fatal parse error (message already printed by parse_args).
        sys.exit(0)
    if mode is None:
        show_usage()
        sys.exit(0)
    lang_label = lang_cfg.output_subfolder
    if mode == "list":
        list_categories(lang_cfg, lang_label)
        sys.exit(0)
    # ── Build scraper & resolve target ────────────────────────────────────────
    scraper = get_scraper(lang_cfg)
    is_search = (mode == "search")
    if is_search:
        # The query doubles as the "category" label stored in each article
        # and as the output folder key.
        category_name = query
        target_url = ""  # unused for search; pages come from search_url_tpl
        Log.info(f"[{lang_label.upper()}] Searching: {Colors.BOLD}{query}{Colors.RESET} | pages: {Colors.BOLD}{pages}{Colors.RESET}")
    else:
        cat_info = lang_cfg.categories[category_key]
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        Log.info(f"[{lang_label.upper()}] Scraping category: {Colors.BOLD}{category_name}{Colors.RESET}")
    # ── Phase 1: Link Discovery ───────────────────────────────────────────────
    print(f"\n{Colors.BOLD}Phase 1: Link Discovery{Colors.RESET}\n")
    links = scraper.fetch_links(target_url, is_search=is_search, query=query or "", max_pages=pages)
    if not links:
        Log.error("No article links found.")
        sys.exit(1)
    Log.success(f"Discovered {len(links)} unique article links")
    # ── Phase 2: Content Extraction ──────────────────────────────────────────
    print(f"\n{Colors.BOLD}Phase 2: Content Extraction{Colors.RESET}\n")
    results = []
    progress = ProgressBar(len(links))
    # Fetch/parse articles concurrently; each future yields a dict or None.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(scraper.parse_article, link, category_name): link for link in links}
        for future in as_completed(futures):
            data = future.result()
            if data:
                results.append(data)
                progress.update(success=True)
            else:
                progress.update(success=False)
    progress.finish()
    Statistics.generate_summary(results)
    # ── Save Results ───────────────────────────────────────────────────────────
    if not results:
        Log.warning("No valid articles extracted.")
        sys.exit(0)
    if is_search:
        save_dir = FileManager.ensure_search_dir(query, lang_label)
    else:
        save_dir = FileManager.ensure_category_dir(category_name, lang_label)
    filename = FileManager.generate_filename()
    filepath = save_dir / filename
    # ensure_ascii=False keeps Devanagari text readable in the JSON file.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    if results:
        Log.success(f"Saved {len(results)} articles -> {filepath}")
        Log.info(f"File size: {os.path.getsize(filepath) / 1024:.2f} KB\n")
    # ── Cloudinary Upload ──────────────────────────────────────────────────────
    # Best-effort: a failed upload logs an error but does not abort the run.
    try:
        project_root = get_project_root()
        relative_path = save_dir.relative_to(project_root)
        # Mirror the local folder layout in Cloudinary (forward slashes).
        cloud_folder = str(relative_path).replace("\\", "/")
        Log.info(f"Uploading to Cloudinary: {cloud_folder}")
        upload_to_cloudinary(str(filepath), cloud_folder, resource_type="raw")
    except Exception as e:
        Log.error(f"Cloudinary upload failed: {e}")
    print(f"\n{Colors.GREEN}Scraping completed successfully!{Colors.RESET}\n")
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is a user cancel, not an error: exit cleanly with code 0.
        print(f"\n\n{Colors.YELLOW}Cancelled by user.{Colors.RESET}\n")
        sys.exit(0)
    except Exception as e:
        # Log for visibility, then re-raise so the traceback and non-zero
        # exit code still surface to the shell.
        Log.error(f"Critical error: {str(e)}")
        raise