Spaces:
Running
Running
"""
News Scraper — Multi-Language CLI Tool
=======================================
Scrapes news articles from ABP Live for English and Hindi.
Supports category-based scraping and keyword search with pagination.
Sources:
    --english → news.abplive.com (ABP Live English)
    --hindi   → www.abplive.com (ABP Live Hindi)
Architecture:
    LanguageConfig — holds URLs, categories, and output paths per language
    BaseScraper    — shared logic: link fetching, search pagination
    EnglishScraper — extracts articles from English ABP Live
    HindiScraper   — extracts articles from Hindi ABP Live
Output:
    articles/{language}/categories/{category}/{timestamp}.json
    articles/{language}/search_queries/{query}/{timestamp}.json
Usage:
    python backend/web_scraping/news_scrape.py --english --list
    python backend/web_scraping/news_scrape.py --english --category top
    python backend/web_scraping/news_scrape.py --english --search "climate change"
    python backend/web_scraping/news_scrape.py --hindi --list
    python backend/web_scraping/news_scrape.py --hindi --category sports
    python backend/web_scraping/news_scrape.py --hindi --search "पुणे"
    python backend/web_scraping/news_scrape.py --hindi --search "पुणे" --pages 3
Adding a new language:
    1. Create a new LanguageConfig entry in LANGUAGE_CONFIGS dict
    2. Create a Scraper subclass (override _extract_links and parse_article)
    3. Register it in _SCRAPER_CLASSES
That's it — CLI, file management, and upload logic are fully reused.
"""
| import requests | |
| from bs4 import BeautifulSoup | |
| import json | |
| import re | |
| import sys | |
| sys.stdout.reconfigure(encoding='utf-8') | |
| import os | |
| import time | |
| from datetime import datetime, timezone | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from typing import List, Set, Dict, Optional | |
| from pathlib import Path | |
| from urllib.parse import quote_plus | |
| from dotenv import load_dotenv | |
| # Import shared utilities from backend.common and backend.utils | |
| sys.path.append(str(Path(__file__).parent.parent.parent)) | |
| from backend.common.colors import Colors, Log | |
| from backend.common.paths import get_project_root, sanitize_query_folder | |
| from backend.utils.cloudinary_utils import upload_to_cloudinary | |
| load_dotenv() | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # Language Configuration | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
class LanguageConfig:
    """Bundle of per-language scraping settings.

    Attributes:
        base_url: Root URL of the news website.
        categories: Mapping of {key: {"name": ..., "url": ...}} per category.
        search_url_tpl: Format string for building search URLs (None means
            search is not supported for this language).
        scraper_class_name: Name of the Scraper subclass to instantiate.
        output_subfolder: Language identifier used in output paths
            ("english" / "hindi").
    """

    def __init__(self, base_url, categories, search_url_tpl, scraper_class_name, output_subfolder):
        # Straight attribute capture — no validation or transformation here.
        (self.base_url,
         self.categories,
         self.search_url_tpl,
         self.scraper_class_name,
         self.output_subfolder) = (base_url, categories, search_url_tpl,
                                   scraper_class_name, output_subfolder)
# -- English (news.abplive.com) ------------------------------------------------
_EN_BASE = "https://news.abplive.com"
# NOTE: "{{q}}" in the search template f-string yields a literal "{q}"
# placeholder, filled later via .format(q=<url-encoded query>) in BaseScraper.
ENGLISH_CONFIG = LanguageConfig(
    base_url=_EN_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_EN_BASE}/"},
        "business": {"name": "Business", "url": f"{_EN_BASE}/business"},
        "entertainment": {"name": "Entertainment", "url": f"{_EN_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_EN_BASE}/sports"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_EN_BASE}/lifestyle"},
        "technology": {"name": "Technology", "url": f"{_EN_BASE}/technology"},
        "elections": {"name": "Elections", "url": f"{_EN_BASE}/elections"},
    },
    search_url_tpl=f"{_EN_BASE}/search?s={{q}}",
    scraper_class_name="EnglishScraper",
    output_subfolder="english",
)
# -- Hindi (www.abplive.com) ---------------------------------------------------
_HI_BASE = "https://www.abplive.com"
HINDI_CONFIG = LanguageConfig(
    base_url=_HI_BASE,
    categories={
        "top": {"name": "Top News", "url": f"{_HI_BASE}/news"},
        "entertainment": {"name": "Entertainment", "url": f"{_HI_BASE}/entertainment"},
        "sports": {"name": "Sports", "url": f"{_HI_BASE}/sports"},
        "politics": {"name": "Politics", "url": f"{_HI_BASE}/news/india"},
        "latest": {"name": "Latest News", "url": f"{_HI_BASE}/news/latest-news"},
        "technology": {"name": "Technology", "url": f"{_HI_BASE}/technology"},
        "lifestyle": {"name": "Lifestyle", "url": f"{_HI_BASE}/lifestyle"},
        "business": {"name": "Business", "url": f"{_HI_BASE}/business"},
        "world": {"name": "World News", "url": f"{_HI_BASE}/news/world"},
        "crime": {"name": "Crime", "url": f"{_HI_BASE}/news/crime"},
    },
    search_url_tpl=f"{_HI_BASE}/search?s={{q}}",
    scraper_class_name="HindiScraper",
    output_subfolder="hindi",
)
# -- Registry of all supported languages ---------------------------------------
# Keys double as the CLI language flags (--english / --hindi) in parse_args.
LANGUAGE_CONFIGS: Dict[str, LanguageConfig] = {
    "english": ENGLISH_CONFIG,
    "hindi": HINDI_CONFIG,
}
# =============================================
# Shared Utilities
# =============================================
# Desktop browser-style UA; presumably some pages reject UA-less requests — TODO confirm.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# Thread-pool size and per-request timeout (seconds), overridable via env vars.
MAX_WORKERS = int(os.getenv('SCRAPING_MAX_WORKERS', '10'))
TIMEOUT = int(os.getenv('SCRAPING_TIMEOUT', '30'))
class FileManager:
    """Namespace for output-directory creation and timestamped filenames.

    The class is never instantiated; callers invoke the methods on the class
    itself (e.g. FileManager.generate_filename()).  The original methods were
    defined without `self` and without @staticmethod — that happens to work
    when accessed via the class object, but raises TypeError if ever called
    on an instance.  @staticmethod makes the intent explicit and safe.
    """

    @staticmethod
    def get_articles_dir(lang_subfolder: str) -> Path:
        """Return <project_root>/articles/<lang>/ (path only; not created)."""
        return get_project_root() / "articles" / lang_subfolder

    @staticmethod
    def ensure_category_dir(category_name: str, lang_subfolder: str) -> Path:
        """Create (if needed) and return the per-category output directory.

        The category display name is normalized to lowercase_with_underscores.
        """
        d = FileManager.get_articles_dir(lang_subfolder) / "categories" / \
            category_name.lower().replace(" ", "_")
        d.mkdir(parents=True, exist_ok=True)
        return d

    @staticmethod
    def ensure_search_dir(query: str, lang_subfolder: str) -> Path:
        """Create (if needed) and return the per-search-query output directory."""
        safe = sanitize_query_folder(query)
        d = FileManager.get_articles_dir(lang_subfolder) / "search_queries" / safe
        d.mkdir(parents=True, exist_ok=True)
        return d

    @staticmethod
    def generate_filename() -> str:
        """Return a local-time filename like '7_mar_9_05_pm.json'."""
        now = datetime.now()
        hour = now.hour
        am_pm = "am" if hour < 12 else "pm"
        hour_12 = hour if hour <= 12 else hour - 12
        hour_12 = 12 if hour_12 == 0 else hour_12  # midnight -> "12 am"
        return f"{now.day}_{now.strftime('%b').lower()}_{hour_12}_{now.strftime('%M')}_{am_pm}.json"
class ProgressBar:
    """In-place ASCII progress bar with success/fail tallies and an ETA."""

    def __init__(self, total: int, width: int = 40):
        self.total = total
        self.width = width
        self.current = 0
        self.success_count = 0
        self.fail_count = 0
        self.start_time = time.time()

    def update(self, success: bool = True):
        """Record one completed item (success or failure) and redraw."""
        self.current += 1
        if success:
            self.success_count += 1
        else:
            self.fail_count += 1
        self._render()

    def _render(self):
        # Redraw the bar on the same terminal line via carriage return;
        # finish() later emits the final newline.
        done = self.current
        percent = (done / self.total) * 100
        n_filled = int(self.width * done / self.total)
        bar = '=' * n_filled + '-' * (self.width - n_filled)
        elapsed = time.time() - self.start_time
        if done > 0:
            remaining = (elapsed / done) * (self.total - done)
            if remaining < 60:
                eta_str = f"{int(remaining)}s"
            else:
                eta_str = f"{int(remaining/60)}m {int(remaining%60)}s"
        else:
            eta_str = "calculating..."
        sys.stdout.write(
            f"\r[{bar}] {done}/{self.total} ({percent:.1f}%) | "
            f"V {self.success_count} X {self.fail_count} | ETA: {eta_str}"
        )
        sys.stdout.flush()

    def finish(self):
        """Emit a newline and the total elapsed wall-clock time."""
        elapsed = time.time() - self.start_time
        print(f"\n{Colors.DIM}Completed in {elapsed:.2f}s{Colors.RESET}")
class Statistics:
    """Post-run reporting helpers (pure console output).

    As with FileManager, the method was defined without `self` or
    @staticmethod; @staticmethod makes the namespace-style usage explicit.
    """

    @staticmethod
    def generate_summary(results: List[Dict]):
        """Print article count, unique authors, and word-count statistics.

        Returns immediately (printing nothing) for an empty list, which also
        guards the average-words division below against ZeroDivisionError.
        """
        if not results:
            return
        print(f"\n{Colors.BOLD}{Colors.CYAN}Scraping Summary{Colors.RESET}\n")
        print(f"Total Articles : {len(results)}")
        # Authors/words are best-effort: missing keys fall back to defaults.
        authors = {a.get('author', 'Unknown') for a in results}
        total_words = sum(len(a.get('content', '').split()) for a in results)
        avg_words = total_words // len(results)
        print(f"Unique Authors : {len(authors)}")
        print(f"Total Words : {total_words:,}")
        print(f"Avg Words/Art : {avg_words}\n")
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # Scraper base + language-specific subclasses | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
class BaseScraper:
    """Language-agnostic scraping engine.

    Subclasses must implement:
        _extract_links(soup, src_url, is_search) -> Set[str]
        parse_article(link, category) -> Optional[Dict]
    """

    def __init__(self, lang_cfg: LanguageConfig):
        self.lang_cfg = lang_cfg
        self.headers = {"User-Agent": USER_AGENT}

    def _build_search_page_url(self, encoded_query: str, page: int) -> str:
        """Return the search-results URL for the given page number.

        Page 1 is the template as-is.  Later pages rewrite '/search?' to
        '/search/page-N?'; if the template has no '/search?' segment, a
        'paged=N' query parameter is appended instead.
        """
        url = self.lang_cfg.search_url_tpl.format(q=encoded_query)
        if page <= 1:
            return url
        rewritten = url.replace("/search?", f"/search/page-{page}?", 1)
        if rewritten != url:
            return rewritten
        sep = "&" if "?" in url else "?"
        return f"{url}{sep}paged={page}"

    def fetch_links(self, url: str, is_search: bool = False, query: str = "", max_pages: int = 1) -> Set[str]:
        """Collect candidate article URLs from one or more listing pages.

        For searches, up to max_pages result pages are scanned, stopping
        early at the first page that yields no links.  Per-page errors are
        logged and skipped; the links gathered so far are still returned.
        """
        found: Set[str] = set()
        if is_search:
            if not self.lang_cfg.search_url_tpl:
                Log.error("Search is not supported for this language.")
                return found
            encoded = quote_plus(query)
            page_urls = [self._build_search_page_url(encoded, p)
                         for p in range(1, max(1, max_pages) + 1)]
        else:
            page_urls = [url]
        Log.info(f"Scanning {len(page_urls)} source page(s)β¦")
        for idx, page_url in enumerate(page_urls, 1):
            try:
                resp = requests.get(page_url, headers=self.headers, timeout=TIMEOUT)
                if resp.status_code != 200:
                    Log.warning(f"HTTP {resp.status_code} for page {idx}")
                    continue
                page_links = self._extract_links(
                    BeautifulSoup(resp.text, "html.parser"), page_url, is_search=is_search
                )
                found |= page_links
                Log.success(f"Extracted {len(page_links)} links from page {idx}")
                if is_search and not page_links:
                    Log.info(f"No search results found on page {idx}; stopping pagination early.")
                    break
            except requests.Timeout:
                Log.error(f"Timeout on page {idx}")
            except Exception as e:
                Log.warning(f"Error on page {idx}: {str(e)[:80]}")
        return found

    def _extract_links(self, soup: BeautifulSoup, src_url: str, is_search: bool = False) -> Set[str]:
        """Subclass hook: pull article URLs out of a parsed listing page."""
        raise NotImplementedError

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Subclass hook: download and parse one article; None on failure."""
        raise NotImplementedError
class EnglishScraper(BaseScraper):
    """Scrapes news.abplive.com (English edition).

    Article link pattern: trailing numeric ID (…-12345) or '.html' suffix.
    Content container: <div class="abp-story-article"> (fallback
    "article-content"); byline: <div class="abp-article-byline-author">.
    """

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect article URLs from a listing or search-results page."""
        links = set()
        if is_search:
            # Search hits are confined to this wrapper; anchors elsewhere
            # (nav, footer) are noise.
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        base = self.lang_cfg.base_url
        for a in elements:
            href = a['href']
            if href.startswith("/"):
                href = base + href  # resolve site-relative links
            if "abplive.com" in href and "javascript" not in href:
                # Keep only article-shaped URLs: numeric-ID tail or .html.
                if re.search(r'-(\d+)$', href) or href.endswith('.html'):
                    links.add(href)
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article and return its fields, or None on any failure."""
        try:
            res = requests.get(link, headers=self.headers, timeout=TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = re.search(r"-(\d+)$", link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            content_div = (
                soup.find("div", class_="abp-story-article") or
                soup.find("div", class_="article-content")
            )
            if not content_div:
                return None
            content = "\n".join(p.get_text(strip=True) for p in content_div.find_all("p"))
            if not content:
                return None
            # Defaults used when the byline block is missing or incomplete.
            author = "ABP News"
            date = datetime.now().strftime("%Y-%m-%d")
            byline = soup.find("div", class_="abp-article-byline-author")
            if byline:
                author_link = byline.find("a")
                if author_link:
                    author = author_link.get_text(strip=True)
                txt = byline.get_text(strip=True)
                if "Updated at :" in txt:
                    date = txt.split("Updated at :")[1].strip()
            return {
                "id": article_id,
                "language": "english",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare `except:` — that also caught KeyboardInterrupt and
            # SystemExit, making Ctrl-C unreliable while worker threads run
            # (the __main__ handler expects to see KeyboardInterrupt).
            # Any network/parse error still yields None (best-effort).
            return None
class HindiScraper(BaseScraper):
    """Scrapes www.abplive.com (Hindi edition).

    Article link pattern: ends with a numeric ID of 6+ digits, e.g. …-3094660
    Title:   <h1> tag
    Content: <div class="abp-story-detail"> or fallback containers — all <p> tags
    Author:  <div class="auth-detail"> — <h3> or first <a>
    Date:    <time> tag (datetime attr) or meta article:published_time
    """

    # 6+ digits avoids matching short numeric suffixes in section URLs.
    _ARTICLE_RE = re.compile(r'abplive\.com/.+\-(\d{6,})$')

    def _extract_links(self, soup, src_url, is_search=False):
        """Collect article URLs, skipping photo galleries and videos."""
        links = set()
        base = self.lang_cfg.base_url
        if is_search:
            container = soup.find("div", class_="search-cat-wrap")
            elements = container.find_all("a", href=True) if container else []
        else:
            elements = soup.find_all("a", href=True)
        for a in elements:
            href = a['href'].strip()
            if href.startswith("/"):
                href = base + href  # resolve site-relative links
            if self._ARTICLE_RE.search(href):
                if "/photo-gallery/" not in href and "/videos/" not in href:
                    links.add(href.split("?")[0])  # drop tracking params
        return links

    def parse_article(self, link: str, category: str) -> Optional[Dict]:
        """Download one article and return its fields, or None on any failure."""
        try:
            res = requests.get(link, headers=self.headers, timeout=TIMEOUT)
            if res.status_code != 200:
                return None
            soup = BeautifulSoup(res.text, "html.parser")
            match = self._ARTICLE_RE.search(link)
            article_id = match.group(1) if match else "N/A"
            title_tag = soup.find("h1")
            if not title_tag:
                return None
            title = title_tag.get_text(strip=True)
            if not title:
                return None
            # Several container layouts exist across article templates.
            content_div = (
                soup.find("div", class_="abp-story-detail") or
                soup.find("div", class_="story-detail") or
                soup.find("div", class_="article-content") or
                soup.find("div", {"id": "article-content"})
            )
            if not content_div:
                return None
            paragraphs = [p.get_text(strip=True) for p in content_div.find_all("p") if p.get_text(strip=True)]
            content = "\n".join(paragraphs)
            if not content:
                return None
            author = "ABP Live"
            auth_div = soup.find("div", class_="auth-detail")
            if auth_div:
                h3 = auth_div.find("h3")
                a = auth_div.find("a")
                if h3:
                    author = h3.get_text(strip=True)
                elif a:
                    author = a.get_text(strip=True)
            # Prefer the <time datetime="..."> attribute; fall back to the
            # published_time meta tag; else today's date.
            date = datetime.now().strftime("%Y-%m-%d")
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                raw = time_tag["datetime"]
                try:
                    date = datetime.fromisoformat(raw.replace("Z", "+00:00")).strftime("%Y-%m-%d")
                except ValueError:
                    date = raw[:10]  # best-effort YYYY-MM-DD prefix
            else:
                meta = soup.find("meta", {"property": "article:published_time"})
                if meta and meta.get("content"):
                    date = meta["content"][:10]
            return {
                "id": article_id,
                "language": "hindi",
                "category": category,
                "title": title,
                "author": author,
                "published_date": date,
                "url": link,
                "content": content,
                "scraped_at": datetime.now(timezone.utc).isoformat(),
            }
        except Exception:
            # Was a bare `except:` — that also caught KeyboardInterrupt and
            # SystemExit, making Ctrl-C unreliable while worker threads run.
            # Any network/parse error still yields None (best-effort).
            return None
# Scraper factory: maps the config's scraper_class_name to a concrete class.
_SCRAPER_CLASSES = {
    "EnglishScraper": EnglishScraper,
    "HindiScraper": HindiScraper,
}


def get_scraper(lang_cfg: LanguageConfig) -> BaseScraper:
    """Instantiate the scraper class named by the language config.

    Raises ValueError if the config names a class that is not registered.
    """
    try:
        cls = _SCRAPER_CLASSES[lang_cfg.scraper_class_name]
    except KeyError:
        raise ValueError(f"Unknown scraper class: {lang_cfg.scraper_class_name}") from None
    return cls(lang_cfg)
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
| # CLI | |
| # βββββββββββββββββββββββββββββββββββββββββββββ | |
def show_usage():
    """Print CLI usage, examples, and notes for every registered language.

    The language-flag list is built from LANGUAGE_CONFIGS, so newly
    registered languages appear here automatically.
    """
    langs = " | ".join(f"--{l}" for l in LANGUAGE_CONFIGS)
    print(f"""
{Colors.BOLD}{Colors.CYAN}News Scraper (Multi-Language){Colors.RESET}
{Colors.BOLD}Usage:{Colors.RESET}
    python news_scrape.py <{langs}> --list
    python news_scrape.py <{langs}> --category <name>
    python news_scrape.py --english --search "query"
    python news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯"
    python news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯" --pages 3
{Colors.BOLD}Examples:{Colors.RESET}
    python news_scrape.py --english --list
    python news_scrape.py --english --category sports
    python news_scrape.py --english --search "climate change"
    python news_scrape.py --hindi --search "ΰ€ͺΰ₯ΰ€£ΰ₯"
    python news_scrape.py --english --search "pune" --pages 3
    python news_scrape.py --hindi --list
    python news_scrape.py --hindi --category sports
    python news_scrape.py --hindi --category politics
{Colors.BOLD}Notes:{Colors.RESET}
    β’ --search is available for both --english and --hindi
    β’ --pages / --page applies to search only and defaults to 1
    β’ Hindi output β articles/hindi/categories/β¦
    β’ English output β articles/english/categories/β¦
""")
def list_categories(lang_cfg: LanguageConfig, lang_label: str):
    """Print every category key and its display name for one language."""
    header = f"\n{Colors.BOLD}{Colors.CYAN}Available Categories [{lang_label}]:{Colors.RESET}\n"
    print(header)
    for key, info in lang_cfg.categories.items():
        # Key column is padded to 15 chars so names line up.
        print(f" {Colors.GREEN}β’{Colors.RESET} {Colors.BOLD}{key:15}{Colors.RESET} {info['name']}")
    print()
def parse_args():
    """Parse sys.argv into (lang_cfg, mode, category_key, query, pages).

    mode: "list" | "category" | "search" | None.  A None lang_cfg or None
    mode tells the caller to show usage / exit; the error has already been
    logged here.  pages applies to search only and defaults to 1.
    """
    args = sys.argv[1:]
    if not args:
        return None, None, None, None, 1
    # -- Language flag: exactly one required; everything else is deferred. --
    lang_cfg = None
    lang_label = None
    remaining = []
    for arg in args:
        key = arg.lstrip("-").lower()
        if key in LANGUAGE_CONFIGS:
            if lang_cfg is not None:
                Log.error("Please specify only one language flag.")
                return None, None, None, None, 1
            lang_cfg = LANGUAGE_CONFIGS[key]
            lang_label = key
        else:
            remaining.append(arg)
    if lang_cfg is None:
        Log.error("You must specify a language: " + " | ".join(f"--{l}" for l in LANGUAGE_CONFIGS))
        show_usage()
        return None, None, None, None, 1
    if not remaining:
        return lang_cfg, None, None, None, 1
    # -- Extract --pages/--page N before command dispatch. --
    pages = 1
    filtered = []
    i = 0
    while i < len(remaining):
        cmd = remaining[i].lower()
        if cmd in ("--pages", "--page"):
            if i + 1 >= len(remaining):
                Log.error("Page count required after --pages")
                return lang_cfg, None, None, None, 1
            try:
                pages = int(remaining[i + 1])
            except ValueError:
                Log.error("Page count must be an integer")
                return lang_cfg, None, None, None, 1
            if pages < 1:
                Log.error("Page count must be at least 1")
                return lang_cfg, None, None, None, 1
            i += 2  # skip the flag and its value
            continue
        filtered.append(remaining[i])
        i += 1
    remaining = filtered
    if not remaining:
        # BUG FIX: previously fell through to `remaining[0]` and raised
        # IndexError when the only extra flag was --pages (e.g.
        # `--english --pages 3`).  Treat it as "no command" so the caller
        # shows usage instead of crashing.
        return lang_cfg, None, None, None, pages
    # -- Command dispatch --
    cmd = remaining[0].lower()
    if cmd == "--list":
        return lang_cfg, "list", None, None, pages
    if cmd in ("--category", "-c"):
        if len(remaining) < 2:
            Log.error("Category name required after --category")
            return lang_cfg, None, None, None, pages
        cat_key = remaining[1].lower()
        if cat_key not in lang_cfg.categories:
            Log.error(f"Unknown category '{cat_key}' for {lang_label}")
            Log.info(f"Run: python news_scrape.py --{lang_label} --list")
            return lang_cfg, None, None, None, pages
        return lang_cfg, "category", cat_key, None, pages
    if cmd in ("--search", "-s"):
        if lang_cfg.search_url_tpl is None:
            Log.error(f"--search is not supported for --{lang_label}")
            return lang_cfg, None, None, None, pages
        if len(remaining) < 2:
            Log.error("Search query required after --search")
            return lang_cfg, None, None, None, pages
        # Everything after --search is the query (supports unquoted spaces).
        query = " ".join(remaining[1:])
        return lang_cfg, "search", None, query, pages
    Log.error(f"Unknown flag: {remaining[0]}")
    show_usage()
    return lang_cfg, None, None, None, pages
def main():
    """CLI entry point: parse args, discover links, scrape, save, upload.

    Exit codes: 0 for usage/list/no-results paths, 1 when no article links
    are found at all.
    """
    lang_cfg, mode, category_key, query, pages = parse_args()
    if lang_cfg is None:
        # parse_args already logged the error / printed usage.
        sys.exit(0)
    if mode is None:
        show_usage()
        sys.exit(0)
    lang_label = lang_cfg.output_subfolder
    if mode == "list":
        list_categories(lang_cfg, lang_label)
        sys.exit(0)
    # -- Build scraper & resolve target ----------------------------------
    scraper = get_scraper(lang_cfg)
    is_search = (mode == "search")
    if is_search:
        # For searches the query doubles as the "category" label stored in
        # each article dict; the target URL is built inside fetch_links.
        category_name = query
        target_url = ""
        Log.info(f"[{lang_label.upper()}] Searching: {Colors.BOLD}{query}{Colors.RESET} | pages: {Colors.BOLD}{pages}{Colors.RESET}")
    else:
        cat_info = lang_cfg.categories[category_key]
        category_name = cat_info["name"]
        target_url = cat_info["url"]
        Log.info(f"[{lang_label.upper()}] Scraping category: {Colors.BOLD}{category_name}{Colors.RESET}")
    # -- Phase 1: Link Discovery ------------------------------------------
    print(f"\n{Colors.BOLD}Phase 1: Link Discovery{Colors.RESET}\n")
    links = scraper.fetch_links(target_url, is_search=is_search, query=query or "", max_pages=pages)
    if not links:
        Log.error("No article links found.")
        sys.exit(1)
    Log.success(f"Discovered {len(links)} unique article links")
    # -- Phase 2: Content Extraction --------------------------------------
    print(f"\n{Colors.BOLD}Phase 2: Content Extraction{Colors.RESET}\n")
    results = []
    progress = ProgressBar(len(links))
    # Articles are fetched concurrently; each future yields a dict or None.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(scraper.parse_article, link, category_name): link for link in links}
        for future in as_completed(futures):
            data = future.result()
            if data:
                results.append(data)
                progress.update(success=True)
            else:
                progress.update(success=False)
    progress.finish()
    Statistics.generate_summary(results)
    # -- Save Results ------------------------------------------------------
    if not results:
        Log.warning("No valid articles extracted.")
        sys.exit(0)
    if is_search:
        save_dir = FileManager.ensure_search_dir(query, lang_label)
    else:
        save_dir = FileManager.ensure_category_dir(category_name, lang_label)
    filename = FileManager.generate_filename()
    filepath = save_dir / filename
    # ensure_ascii=False keeps Hindi text readable in the JSON file.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    if results:
        Log.success(f"Saved {len(results)} articles -> {filepath}")
        Log.info(f"File size: {os.path.getsize(filepath) / 1024:.2f} KB\n")
    # -- Cloudinary Upload -------------------------------------------------
    # Best-effort: failure is logged but the local JSON is already saved.
    try:
        project_root = get_project_root()
        # Mirror the local folder layout in Cloudinary, with forward slashes.
        relative_path = save_dir.relative_to(project_root)
        cloud_folder = str(relative_path).replace("\\", "/")
        Log.info(f"Uploading to Cloudinary: {cloud_folder}")
        upload_to_cloudinary(str(filepath), cloud_folder, resource_type="raw")
    except Exception as e:
        Log.error(f"Cloudinary upload failed: {e}")
    print(f"\n{Colors.GREEN}Scraping completed successfully!{Colors.RESET}\n")
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is an expected, clean exit (status 0).
        print(f"\n\n{Colors.YELLOW}Cancelled by user.{Colors.RESET}\n")
        sys.exit(0)
    except Exception as e:
        # Log for visibility, then re-raise so the traceback and non-zero
        # exit code still surface to the caller.
        Log.error(f"Critical error: {str(e)}")
        raise