Spaces:
Sleeping
Sleeping
| """ | |
| News Scraper Module - Multi-Language | |
| Supports English (ABP Live EN) and Hindi (ABP Live HI) | |
| Exposes `scrape_articles` as a clean, callable Python function. | |
| """ | |
import os
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

# Ensure backend root is in PYTHONPATH so we can import core modules
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
# ─────────────────────────────────────────────
# Language Configuration
# ─────────────────────────────────────────────
@dataclass
class LanguageConfig:
    """Per-language scraping configuration.

    Bundles the site endpoints and the scraper wiring for one ABP Live
    edition; instances are registered in ``LANGUAGE_CONFIGS``.
    """

    # Root URL of the edition (used to resolve site-relative links).
    base_url: str
    # category key -> {"name": display name, "url": listing page to scan}.
    categories: Dict[str, Dict[str, str]]
    # Search URL template with a ``{q}`` placeholder for the encoded query.
    search_url_tpl: str
    # Name of the BaseScraper subclass to instantiate (see get_scraper).
    scraper_class_name: str
    # Subfolder used when persisting output for this language.
    output_subfolder: str
| _EN_BASE = "https://news.abplive.com" | |
| ENGLISH_CONFIG = LanguageConfig( | |
| base_url=_EN_BASE, | |
| categories={ | |
| "top": {"name": "Top News", "url": f"{_EN_BASE}/"}, | |
| "business": {"name": "Business", "url": f"{_EN_BASE}/business"}, | |
| "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"}, | |
| "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"}, | |
| "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"}, | |
| "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"}, | |
| "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"}, | |
| }, | |
| search_url_tpl=f"{_EN_BASE}/search?s={{q}}", | |
| scraper_class_name="EnglishScraper", | |
| output_subfolder="english", | |
| ) | |
| _HI_BASE = "https://www.abplive.com" | |
| HINDI_CONFIG = LanguageConfig( | |
| base_url=_HI_BASE, | |
| categories={ | |
| "top": {"name": "Top News", "url": f"{_HI_BASE}/news"}, | |
| "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"}, | |
| "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"}, | |
| "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"}, | |
| "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"}, | |
| "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"}, | |
| "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"}, | |
| "business": {"name": "Business", "url": f"{_HI_BASE}/business"}, | |
| "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"}, | |
| "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"}, | |
| }, | |
| search_url_tpl=f"{_HI_BASE}/search?s={{q}}", | |
| scraper_class_name="HindiScraper", | |
| output_subfolder="hindi", | |
| ) | |
| LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = { | |
| "english": ENGLISH_CONFIG, | |
| "hindi": HINDI_CONFIG, | |
| } | |
# ─────────────────────────────────────────────
# Shared Utilities
# ─────────────────────────────────────────────
# Desktop-browser UA sent with every request; presumably the site serves
# different (or blocked) markup to default library user agents — TODO confirm.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# ─────────────────────────────────────────────
# Scrapers
# ─────────────────────────────────────────────
| class BaseScraper: | |
| def __init__(self, lang_cfg: LanguageConfig): | |
| self.lang_cfg = lang_cfg | |
| self.headers = {"User-Agent": USER_AGENT} | |
| def _build_search_page_url(self, encoded_query: str, page: int) -> str: | |
| base_url = self.lang_cfg.search_url_tpl.format(q=encoded_query) | |
| if page <= 1: | |
| return base_url | |
| paged_url = base_url.replace("/search?", f"/search/page-{page}?", 1) | |
| if paged_url == base_url: | |
| separator = "&" if "?" in base_url else "?" | |
| paged_url = f"{base_url}{separator}paged={page}" | |
| return paged_url | |
| def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]: | |
| links = set() | |
| if is_search: | |
| if not self.lang_cfg.search_url_tpl: | |
| logger.error("Search is not supported for this language.") | |
| return links | |
| encoded_query = quote_plus(query) | |
| max_pages = max(1, max_pages) | |
| urls_to_fetch = [self._build_search_page_url(encoded_query, page) for page in range(1, max_pages + 1)] | |
| else: | |
| urls_to_fetch = [url] | |
| logger.info(f"Scanning {len(urls_to_fetch)} source page(s)β¦") | |
| for idx, src_url in enumerate(urls_to_fetch, 1): | |
| try: | |
| res = requests.get(src_url, headers=self.headers, timeout=config.SCRAPING_TIMEOUT) | |
| if res.status_code != 200: | |
| logger.warning(f"HTTP {res.status_code} for page {idx}") | |
| continue | |
| soup = BeautifulSoup(res.text, "html.parser") | |
| new_links = self._extract_links(soup, src_url, is_search=is_search) | |
| links |= new_links | |
| logger.success(f"Extracted {len(new_links)} links from page {idx}") | |
| if is_search and not new_links: | |
| logger.info(f"No search results found on page {idx}; stopping pagination early.") | |
| break | |
| except requests.Timeout: | |
| logger.error(f"Timeout on page {idx}") | |
| except Exception as e: | |
| logger.warning(f"Error on page {idx}: {str(e)[:80]}") | |
| return links | |
| def _extract_links(self, soup: BeautifulSoup, src_url: str, is_search: bool = False) -> Set[str]: | |
| raise NotImplementedError | |
| def parse_article(self, link: str, category: str) -> Optional[Dict]: | |
| raise NotImplementedError | |
class EnglishScraper(BaseScraper):
    """Scraper for ABP Live English (news.abplive.com)."""

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect candidate article URLs from a listing or search page.

        Search results are confined to the "search-cat-wrap" container so
        site-wide navigation links are not picked up.
        """
        if is_search:
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        links = set()
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                # Resolve site-relative links against the edition root.
                href = base + href
            # Article pages end in a numeric id or .html; skip everything else.
            if "abplive.com" in href and "javascript" not in href:
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article page and return its structured fields.

        Returns:
            A dict with id/title/author/date/content metadata, or None when
            the page cannot be fetched or lacks the expected markup; callers
            treat None as "skip this link".
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            # The trailing numeric slug doubles as the article id.
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            # Skip empty <p> tags so blank lines don't pad the content
            # (matches HindiScraper's paragraph handling).
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            # Fall back to generic byline/today's date when the page omits them.
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Best-effort parsing: malformed pages are skipped rather than
            # aborting the batch. Narrowed from a bare `except`, which also
            # swallowed KeyboardInterrupt/SystemExit.
            return None
class HindiScraper(BaseScraper):
    """Scraper for ABP Live Hindi (www.abplive.com)."""

    # Hindi article URLs end in a numeric id of at least six digits.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect article URLs, skipping photo-gallery and video pages.

        Search results are confined to the "search-cat-wrap" container so
        site-wide navigation links are not picked up.
        """
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                # Resolve site-relative links against the edition root.
                href = base + href
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    # Drop query params (tracking etc.) to dedupe links.
                    links.add(href.split("?")[0])
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one Hindi article page and return its structured fields.

        Returns:
            A dict with id/title/author/date/content metadata, or None when
            the page cannot be fetched or parsed; callers treat None as
            "skip this link".
        """
        try:
            res = requests.get(link, headers=self.headers, timeout=config.SCRAPING_TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # Several container classes exist across site redesigns; try the
            # known variants in order.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            # Skip empty <p> tags so blank lines don't pad the content.
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Date preference: <time datetime="...">, then the
            # article:published_time meta tag, then today's date.
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    # Not strict ISO; best effort: ISO-ish dates start YYYY-MM-DD.
                    date = raw[:10]
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Best-effort parsing: narrowed from a bare `except`, which also
            # swallowed KeyboardInterrupt/SystemExit.
            return None
def get_scraper(lang_cfg: "LanguageConfig") -> "BaseScraper":
    """Instantiate the scraper class configured for *lang_cfg*.

    Raises:
        ValueError: if the configured class name is not registered here.
    """
    registry = {
        "EnglishScraper": EnglishScraper,
        "HindiScraper": HindiScraper,
    }
    try:
        scraper_cls = registry[lang_cfg.scraper_class_name]
    except KeyError:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}") from None
    return scraper_cls(lang_cfg)
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def scrape_articles(language: str, target: str, is_search: bool = False, max_pages: int = 1) -> List[Dict]:
    """Scrape news articles for one language and return them as dicts.

    Runs in two phases: discover article links from the category/search
    pages, then fetch and parse each article concurrently.

    Args:
        language: 'english' or 'hindi'.
        target: A category key (e.g. 'sports'), or the search query when
            ``is_search`` is True.
        is_search: Interpret ``target`` as a search query instead of a
            category key.
        max_pages: How many result pages to scan (search only).

    Returns:
        A list of article dicts; empty on any error (errors are logged,
        never raised).
    """
    language = language.lower()
    lang_cfg = LANGUAGE_CONFIGS.get(language)
    if lang_cfg is None:
        logger.error(f"Unsupported language: {language}")
        return []
    scraper = get_scraper(lang_cfg)
    target_url = ""
    if is_search:
        # Search results are tagged with the query itself as their category.
        category_name = target
        logger.info(f"[{language.upper()}] Searching: '{target}' | pages: {max_pages}")
    else:
        target_key = target.lower()
        cat_info = lang_cfg.categories.get(target_key)
        if cat_info is None:
            logger.error(f"Unknown category '{target_key}' for {language}.")
            return []
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        logger.info(f"[{language.upper()}] Scraping category: '{category_name}'")
    # Phase 1: Link Discovery
    links = scraper.fetch_links(
        target_url,
        is_search=is_search,
        query=target if is_search else "",
        max_pages=max_pages,
    )
    if not links:
        logger.warning(f"No article links found for {target}.")
        return []
    logger.success(f"Discovered {len(links)} unique article links.")
    # Phase 2: Content Extraction (thread pool — work is network-bound)
    articles: List[Dict] = []
    with ThreadPoolExecutor(max_workers=config.SCRAPING_MAX_WORKERS) as pool:
        pending = {pool.submit(scraper.parse_article, link, category_name): link for link in links}
        for done in as_completed(pending):
            parsed = done.result()
            if parsed:
                articles.append(parsed)
    if articles:
        logger.success(f"Successfully extracted {len(articles)} articles.")
    else:
        logger.warning("Failed to extract content for any articles.")
    return articles