""" News Scraper Module - Multi-Language Supports English (ABP Live EN) and Hindi (ABP Live HI) Exposes `scrape_articles` as a clean, callable Python function. """ import requests from bs4 import BeautifulSoup import re import sys import time from datetime import datetime, timezone from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Set, Dict, Optional from urllib.parse import quote_plus import os # Ensure backend root is in PYTHONPATH so we can import core modules from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent)) from core.logger import logger from core.config import config # ───────────────────────────────────────────── # Language Configuration # ───────────────────────────────────────────── class LanguageConfig: def __init__(self, base_url, categories, search_url_tpl, scraper_class_name, output_subfolder): self.base_url = base_url self.categories = categories self.search_url_tpl = search_url_tpl self.scraper_class_name = scraper_class_name self.output_subfolder = output_subfolder _EN_BASE = "https://news.abplive.com" ENGLISH_CONFIG = LanguageConfig( base_url=_EN_BASE, categories={ "top": {"name": "Top News", "url": f"{_EN_BASE}/"}, "business": {"name": "Business", "url": f"{_EN_BASE}/business"}, "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"}, "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"}, "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"}, "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"}, "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"}, }, search_url_tpl=f"{_EN_BASE}/search?s={{q}}", scraper_class_name="EnglishScraper", output_subfolder="english", ) _HI_BASE = "https://www.abplive.com" HINDI_CONFIG = LanguageConfig( base_url=_HI_BASE, categories={ "top": {"name": "Top News", "url": f"{_HI_BASE}/news"}, "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"}, "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"}, "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"}, "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"}, "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"}, "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"}, "business": {"name": "Business", "url": f"{_HI_BASE}/business"}, "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"}, "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"}, }, search_url_tpl=f"{_HI_BASE}/search?s={{q}}", scraper_class_name="HindiScraper", output_subfolder="hindi", ) LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = { "english": ENGLISH_CONFIG, "hindi": HINDI_CONFIG, } # ───────────────────────────────────────────── # Shared Utilities # ───────────────────────────────────────────── USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" # ───────────────────────────────────────────── # Scrapers # ───────────────────────────────────────────── class BaseScraper: def __init__(self, lang_cfg: LanguageConfig): self.lang_cfg = lang_cfg self.headers = {"User-Agent": USER_AGENT} def _build_search_page_url(self, encoded_query: str, page: int) -> str: base_url = self.lang_cfg.search_url_tpl.format(q=encoded_query) if page <= 1: return base_url paged_url = base_url.replace("/search?", f"/search/page-{page}?", 1) if paged_url == base_url: separator = "&" if "?" in base_url else "?" paged_url = f"{base_url}{separator}paged={page}" return paged_url def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]: links = set() if is_search: if not self.lang_cfg.search_url_tpl: logger.error("Search is not supported for this language.") return links encoded_query = quote_plus(query) max_pages = max(1, max_pages) urls_to_fetch = [self._build_search_page_url(encoded_query, page) for page in range(1, max_pages + 1)] else: urls_to_fetch = [url] logger.info(f"Scanning {len(urls_to_fetch)} source page(s)…") for idx, src_url in enumerate(urls_to_fetch, 1): try: res = requests.get(src_url, headers=self.headers, timeout=config.SCRAPING_TIMEOUT) if res.status_code != 200: logger.warning(f"HTTP {res.status_code} for page {idx}") continue soup = BeautifulSoup(res.text, "html.parser") new_links = self._extract_links(soup, src_url, is_search=is_search) links |= new_links logger.success(f"Extracted {len(new_links)} links from page {idx}") if is_search and not new_links: logger.info(f"No search results found on page {idx}; stopping pagination early.") break except requests.Timeout: logger.error(f"Timeout on page {idx}") except Exception as e: logger.warning(f"Error on page {idx}: {str(e)[:80]}") return links def _extract_links(self, soup: BeautifulSoup, src_url: str, is_search: bool = False) -> Set[str]: raise NotImplementedError def parse_article(self, link: str, category: str) -> Optional[Dict]: raise NotImplementedError class EnglishScraper(BaseScraper): def _extract_links(self, soup, src_url, is_search=False): links = set() if is_search: container = soup.find("div", class_="search-cat-wrap") elements = container.find_all("a", href=True) if container else [] else: elements = soup.find_all("a", href=True) base = self.lang_cfg.base_url for a in elements: href = a['href'] if href.startswith("/"): href = base + href if "abplive.com" in href and "javascript" not in href: if re.search(r'-(\d+)$', href) or href.endswith('.html'): links.add(href) return links def parse_article(self, link: str, category: str) -> Optional[Dict]: try: res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT) if res.status_code != 200: return None soup = BeautifulSoup(res.text, "html.parser") match = re.search(r"-(\d+)$", link) article_id = match.group(1) if match else "N/A" title_tag = soup.find("h1") if not title_tag: return None title = title_tag.get_text(strip=True) content_div = ( soup.find("div", class_="abp-story-article") or soup.find("div", class_="article-content") ) if not content_div: return None content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p")) if not content: return None author = "ABP News" date = datetime.now().strftime("%Y-%m-%d") byline = soup.find("div", class_="abp-article-byline-author") if byline: if byline.find("a"): author = byline.find("a").get_text(strip=True) txt = byline.get_text(strip=True) if "Updated at :" in txt: date = txt.split("Updated at :")[1].strip() return { "id": article_id, "language": "english", "category": category, "title": title, "author": author, "published_date": date, "url": link, "content": content, "scraped_at": datetime.now(timezone.utc).isoformat(), } except: return None class HindiScraper(BaseScraper): _ARTICLE_RE = re.compile(r'abplive\.com/.+-(\d{6,})$') def _extract_links(self, soup, src_url, is_search=False): links = set() base = self.lang_cfg.base_url if is_search: container = soup.find("div", class_="search-cat-wrap") elements = container.find_all("a", href=True) if container else [] else: elements = soup.find_all("a", href=True) for a in elements: href = a['href'].strip() if href.startswith("/"): href = base + href if self._ARTICLE_RE.search(href): if "/photo-gallery/" not in href and "/videos/" not in href: links.add(href.split("?")[0]) return links def parse_article(self, link: str, category: str) -> Optional[Dict]: try: res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT) if res.status_code != 200: return None soup = BeautifulSoup(res.text, "html.parser") match = self._ARTICLE_RE.search(link) article_id = match.group(1) if match else "N/A" title_tag = soup.find("h1") if not title_tag: return None title = title_tag.get_text(strip=True) if not title: return None content_div = ( soup.find("div", class_="abp-story-detail") or soup.find("div", class_="story-detail") or soup.find("div", class_="article-content") or soup.find("div", {"id": "article-content"}) ) if not content_div: return None paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)] content = "\n".join(paragraphs) if not content: return None author = "ABP Live" auth_div = soup.find("div", class_="auth-detail") if auth_div: h3 = auth_div.find("h3") a = auth_div.find("a") if h3: author = h3.get_text(strip=True) elif a: author = a.get_text(strip=True) date = datetime.now().strftime("%Y-%m-%d") time_tag = soup.find("time") if time_tag and time_tag.get("datetime"): raw = time_tag["datetime"] try: date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d") except ValueError: date = raw[:10] else: meta = soup.find("meta", {"property": "article:published_time"}) if meta and meta.get("content"): date = meta["content"][:10] return { "id": article_id, "language": "hindi", "category": category, "title": title, "author": author, "published_date": date, "url": link, "content": content, "scraped_at": datetime.now(timezone.utc).isoformat(), } except: return None def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper: classes = { "EnglishScraper": EnglishScraper, "HindiScraper": HindiScraper, } cls = classes.get(lang_cfg.scraper_class_name) if not cls: raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}") return cls(lang_cfg) # ───────────────────────────────────────────── # Public API # ───────────────────────────────────────────── def scrape_articles(language: str, target: str, is_search: bool = False, max_pages: int = 1) -> List[Dict]: """ Scrapes news articles from the supported languages and returns them as a list of dictionaries. Args: language: 'english' or 'hindi' target: The category key (e.g., 'sports') or search query string is_search: True if target is a query string, False if it's a category max_pages: Number of pages to scrape (useful for search) Returns: A list of dictionary objects representing the scraped articles. """ language = language.lower() if language not in LANGUAGE_CONFIGS: logger.error(f"Unsupported language: {language}") return [] lang_cfg = LANGUAGE_CONFIGS[language] scraper = get_scraper(lang_cfg) if is_search: category_name = target target_url = "" logger.info(f"[{language.upper()}] Searching: '{target}' | pages: {max_pages}") else: target_key = target.lower() if target_key not in lang_cfg.categories: logger.error(f"Unknown category '{target_key}' for {language}.") return [] cat_info = lang_cfg.categories[target_key] category_name = cat_info["name"] target_url = cat_info["url"] logger.info(f"[{language.upper()}] Scraping category: '{category_name}'") # Phase 1: Link Discovery links = scraper.fetch_links(target_url, is_search=is_search, query=target if is_search else "", max_pages=max_pages) if not links: logger.warning(f"No article links found for {target}.") return [] logger.success(f"Discovered {len(links)} unique article links.") # Phase 2: Content Extraction results = [] with ThreadPoolExecutor(max_workers=config.SCRAPING_MAX_WORKERS) as executor: futures = {executor.submit(scraper.parse_article, link, category_name): link for link in links} for future in as_completed(futures): data = future.result() if data: results.append(data) if results: logger.success(f"Successfully extracted {len(results)} articles.") else: logger.warning("Failed to extract content for any articles.") return results