"""
News Scraper Module - Multi-Language
Supports English (ABP Live EN) and Hindi (ABP Live HI)
Exposes `scrape_articles` as a clean, callable Python function.
"""
import requests
from bs4 import BeautifulSoup
import re
import sys
import time
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Set, Dict, Optional
from urllib.parse import quote_plus
import os
# Ensure backend root is in PYTHONPATH so we can import core modules
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
# ---------------------------------------------
# Language Configuration
# ---------------------------------------------
class LanguageConfig:
    """Per-language scraping settings: site root, category map, search template,
    the scraper class to use, and where to store output."""
    def __init__(self, base_url: str, categories: Dict[str, Dict[str, str]], search_url_tpl: str, scraper_class_name: str, output_subfolder: str):
        # Site root, e.g. "https://news.abplive.com" (no trailing slash).
        self.base_url = base_url
        # Mapping of category key -> {"name": display name, "url": listing URL}.
        self.categories = categories
        # Search URL template containing a "{q}" placeholder for the query.
        self.search_url_tpl = search_url_tpl
        # Name of the scraper class (resolved by get_scraper).
        self.scraper_class_name = scraper_class_name
        # Subfolder name used when persisting output for this language.
        self.output_subfolder = output_subfolder
_EN_BASE = "https://news.abplive.com"
# Configuration for the English edition (news.abplive.com).
ENGLISH_CONFIG = LanguageConfig(
    base_url=_EN_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_EN_BASE}/"},
        "business": {"name": "Business", "url": f"{_EN_BASE}/business"},
        "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"},
        "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"},
        "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"},
    },
    search_url_tpl=f"{_EN_BASE}/search?s={{q}}",
    scraper_class_name="EnglishScraper",
    output_subfolder="english",
)
_HI_BASE = "https://www.abplive.com"
# Configuration for the Hindi edition (www.abplive.com).
HINDI_CONFIG = LanguageConfig(
    base_url=_HI_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_HI_BASE}/news"},
        "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"},
        "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"},
        "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"},
        "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"},
        "business": {"name": "Business", "url": f"{_HI_BASE}/business"},
        "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"},
        "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"},
    },
    search_url_tpl=f"{_HI_BASE}/search?s={{q}}",
    scraper_class_name="HindiScraper",
    output_subfolder="hindi",
)
# Registry of supported languages; keys are the values accepted by
# scrape_articles(language=...).
LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = {
    "english": ENGLISH_CONFIG,
    "hindi": HINDI_CONFIG,
}
# ---------------------------------------------
# Shared Utilities
# ---------------------------------------------
# Desktop browser User-Agent string sent with every HTTP request.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# ---------------------------------------------
# Scrapers
# ---------------------------------------------
class BaseScraper:
    """Shared scaffolding for the per-language ABP Live scrapers.

    Subclasses implement ``_extract_links`` (find article URLs on a listing
    or search page) and ``parse_article`` (turn one article URL into a dict).
    """

    def __init__(self, lang_cfg: "LanguageConfig"):
        self.lang_cfg = lang_cfg
        self.headers = {"User-Agent": USER_AGENT}

    def _build_search_page_url(self, encoded_query: str, page: int) -> str:
        """Return the search-results URL for *encoded_query* at *page*.

        Page 1 uses the template unchanged; later pages use the site's
        ``/search/page-N`` scheme. If the template does not contain the
        expected ``/search?`` segment, fall back to a ``paged`` query
        parameter so pagination still produces distinct URLs.
        """
        base_url = self.lang_cfg.search_url_tpl.format(q=encoded_query)
        if page <= 1:
            return base_url
        paged_url = base_url.replace("/search?", f"/search/page-{page}?", 1)
        if paged_url == base_url:
            # Template did not match the expected shape; append a query param.
            separator = "&" if "?" in base_url else "?"
            paged_url = f"{base_url}{separator}paged={page}"
        return paged_url

    def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]:
        """Collect unique article links from a category page or search pages.

        Args:
            url: Category listing URL (ignored when *is_search* is True).
            is_search: Treat *query* as a search term and paginate results.
            query: Raw (unencoded) search query string.
            max_pages: Number of search pages to scan (clamped to >= 1).

        Returns:
            A set of article URLs; empty if every page failed or matched nothing.
        """
        links: Set[str] = set()
        if is_search:
            if not self.lang_cfg.search_url_tpl:
                logger.error("Search is not supported for this language.")
                return links
            encoded_query = quote_plus(query)
            max_pages = max(1, max_pages)
            urls_to_fetch = [self._build_search_page_url(encoded_query, page) for page in range(1, max_pages + 1)]
        else:
            urls_to_fetch = [url]
        # NOTE: log message previously ended in mojibake ("β¦"); fixed to a real ellipsis.
        logger.info(f"Scanning {len(urls_to_fetch)} source page(s)…")
        for idx, src_url in enumerate(urls_to_fetch, 1):
            try:
                res = requests.get(src_url, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
                if res.status_code != 200:
                    logger.warning(f"HTTP {res.status_code} for page {idx}")
                    continue
                soup = BeautifulSoup(res.text, "html.parser")
                new_links = self._extract_links(soup, src_url, is_search=is_search)
                links |= new_links
                logger.success(f"Extracted {len(new_links)} links from page {idx}")
                if is_search and not new_links:
                    # An empty results page means we have run past the last page.
                    logger.info(f"No search results found on page {idx}; stopping pagination early.")
                    break
            except requests.Timeout:
                logger.error(f"Timeout on page {idx}")
            except Exception as e:
                # Best-effort: one bad page must not abort the whole scan.
                logger.warning(f"Error on page {idx}: {str(e)[:80]}")
        return links

    def _extract_links(self, soup: "BeautifulSoup", src_url: str, is_search: bool = False) -> Set[str]:
        """Extract candidate article URLs from a parsed page (subclass hook)."""
        raise NotImplementedError

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch and parse a single article URL (subclass hook)."""
        raise NotImplementedError
class EnglishScraper(BaseScraper):
    """Scraper for the English edition (news.abplive.com)."""

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        Articles are recognised by a trailing numeric id (``-12345``) or a
        ``.html`` suffix; search results are confined to the results wrapper.
        """
        links = set()
        if is_search:
            # Search hits live inside a dedicated wrapper div; links outside
            # it are navigation chrome.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                # Make site-relative links absolute.
                href = base + href
            if "abplive.com" in href and "javascript" not in href:
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch one article page and return its structured fields.

        Returns None on any failure (HTTP error, missing title/content,
        network problem) so callers can simply discard falsy results.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            # Numeric id at the end of the URL doubles as the article id.
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))
            if not content:
                return None
            # Fallbacks when the byline is absent; date is local "today".
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")  # single lookup (was looked up twice)
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt/
            # SystemExit; narrowed to Exception while keeping best-effort None.
            return None
class HindiScraper(BaseScraper):
    """Scraper for the Hindi edition (www.abplive.com)."""

    # Article URLs end in a numeric id of at least 6 digits.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        Accepts only URLs matching ``_ARTICLE_RE``; photo galleries and
        videos are excluded, and query strings are stripped.
        """
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            # Search hits live inside a dedicated wrapper div.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                # Make site-relative links absolute.
                href = base + href
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    links.add(href.split("?")[0])
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Fetch one article page and return its structured fields.

        Returns None on any failure (HTTP error, missing title/content,
        network problem) so callers can simply discard falsy results.
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # The Hindi site has gone through several layouts; try each known
            # content container in turn.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Prefer the <time datetime="..."> attribute, then the OpenGraph
            # meta tag, and finally fall back to local "today".
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    # Unparseable timestamp: keep its date prefix as-is.
                    date = raw[:10]
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt/
            # SystemExit; narrowed to Exception while keeping best-effort None.
            return None
def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper:
    """Instantiate the scraper implementation named by *lang_cfg*.

    Raises:
        ValueError: if ``lang_cfg.scraper_class_name`` is not registered.
    """
    registry = {
        "EnglishScraper": EnglishScraper,
        "HindiScraper": HindiScraper,
    }
    scraper_cls = registry.get(lang_cfg.scraper_class_name)
    if scraper_cls is None:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}")
    return scraper_cls(lang_cfg)
# ---------------------------------------------
# Public API
# ---------------------------------------------
def scrape_articles(language: str, target: str, is_search: bool = False, max_pages: int = 1) -> List[Dict]:
    """Scrape news articles for one language and return them as dicts.

    Args:
        language: 'english' or 'hindi' (case-insensitive).
        target: Category key (e.g. 'sports') or, with is_search, a query string.
        is_search: Interpret *target* as a search query instead of a category.
        max_pages: How many result pages to scan (relevant for search).

    Returns:
        A list of article dictionaries; empty on any lookup or scrape failure.
    """
    language = language.lower()
    lang_cfg = LANGUAGE_CONFIGS.get(language)
    if lang_cfg is None:
        logger.error(f"Unsupported language: {language}")
        return []
    scraper = get_scraper(lang_cfg)
    # Resolve the target into a display name and (for categories) a URL.
    if not is_search:
        target_key = target.lower()
        cat_info = lang_cfg.categories.get(target_key)
        if cat_info is None:
            logger.error(f"Unknown category '{target_key}' for {language}.")
            return []
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        logger.info(f"[{language.upper()}] Scraping category: '{category_name}'")
    else:
        category_name = target
        target_url = ""
        logger.info(f"[{language.upper()}] Searching: '{target}' | pages: {max_pages}")
    # Phase 1: discover article links.
    links = scraper.fetch_links(target_url, is_search=is_search, query=target if is_search else "", max_pages=max_pages)
    if not links:
        logger.warning(f"No article links found for {target}.")
        return []
    logger.success(f"Discovered {len(links)} unique article links.")
    # Phase 2: fetch and parse each article concurrently.
    results: List[Dict] = []
    with ThreadPoolExecutor(max_workers=config.SCRAPING_MAX_WORKERS) as pool:
        pending = {pool.submit(scraper.parse_article, article_link, category_name): article_link for article_link in links}
        for done in as_completed(pending):
            article = done.result()
            if article:
                results.append(article)
    if not results:
        logger.warning("Failed to extract content for any articles.")
    else:
        logger.success(f"Successfully extracted {len(results)} articles.")
    return results