#!/usr/bin/env python3
"""
News-Whisper — Unified CLI
============================
Orchestrates the complete pipeline: Scraping → Summarization → Text-to-Speech

This is the main entry point. It dispatches to the correct language-specific
modules based on the --english or --hindi flag.

Usage:
    python main.py --english --category top       # Process top news (English)
    python main.py --english --search "climate"   # Search and process (English)
    python main.py --hindi --category sports      # Process sports (Hindi)
    python main.py --hindi --search "पुणे"          # Search and process (Hindi)
    python main.py --english --list               # List English categories
    python main.py --hindi --list                 # List Hindi categories

Pipeline Flow:
    English: news_scrape.py → english_summary.py → english_tts.py
    Hindi:   news_scrape.py → hindi_summary.py → hindi_tts.py
"""

import sys
import os

# Force UTF-8 output before anything prints (Hindi text, box-drawing chars).
# Guarded because stdout may be replaced by an object without reconfigure()
# (or be None), in which case the script should still start.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')

import argparse
import subprocess
import time
from pathlib import Path
from typing import Dict, Tuple

from dotenv import load_dotenv

load_dotenv()

# Import shared utilities
from backend.common.colors import Colors, Log
from backend.common.paths import (
    get_project_root,
    sanitize_query_folder,
    find_latest_json,
    find_latest_audio_dir,
)

# ─────────────────────────────────────────────
# Language / category definitions
# ─────────────────────────────────────────────

LANGUAGE_CATEGORIES = {
    "english": {
        "top": "Top News",
        "business": "Business",
        "entertainment": "Entertainment",
        "sports": "Sports",
        "lifestyle": "Lifestyle",
        "technology": "Technology",
        "elections": "Elections",
    },
    "hindi": {
        "top": "Top News",
        "entertainment": "Entertainment",
        "sports": "Sports",
        "politics": "Politics",
        "latest": "Latest News",
        "technology": "Technology",
        "lifestyle": "Lifestyle",
        "business": "Business",
        "world": "World News",
        "crime": "Crime",
    },
}

SEARCH_SUPPORTED = {"english", "hindi"}

# Per-language script names and log strings for pipeline steps 2 and 3.
# Both the category and the search pipeline read from this table so the two
# flows cannot drift apart.
_LANGUAGE_STAGES = {
    "hindi": {
        "summary_script": "hindi_summary.py",
        "summary_title": "Hindi Summarization (mT5 + Groq)",
        "summary_name": "Hindi Summarizer",
        "summary_error": "Hindi summarization failed.",
        "tts_script": "hindi_tts.py",
        "tts_title": "Hindi TTS (gTTS + FFmpeg)",
        "tts_name": "Hindi TTS",
        "tts_error": "Hindi TTS generation failed.",
    },
    "english": {
        "summary_script": "english_summary.py",
        "summary_title": "Article Summarization",
        "summary_name": "English Summarizer",
        "summary_error": "Summarization failed. Aborting.",
        "tts_script": "english_tts.py",
        "tts_title": "Text-to-Speech Generation",
        "tts_name": "English TTS",
        "tts_error": "TTS generation failed.",
    },
}


# ─────────────────────────────────────────────
# Subprocess runner
# ─────────────────────────────────────────────

def run_module(script_path: Path, args: list, module_name: str) -> Tuple[bool, float]:
    """Run a backend script as a subprocess.

    Args:
        script_path: Path of the Python script to execute.
        args: Command-line arguments passed to the script.
        module_name: Human-readable name used in log messages.

    Returns:
        ``(success, elapsed_seconds)``. ``success`` is False when the script
        exits with a non-zero code or cannot be started; ``elapsed_seconds``
        is measured with a monotonic clock in every case.
    """
    t0 = time.monotonic()
    try:
        Log.info(f"Running {module_name}…")
        subprocess.run(
            # Run with the current interpreter so the child shares this
            # environment; unpacking accepts any sequence of arguments.
            [sys.executable, str(script_path), *args],
            check=True,
        )
        elapsed = time.monotonic() - t0
        Log.info(f"{module_name} finished in {Colors.CYAN}{elapsed:.1f}s{Colors.RESET}")
        return True, elapsed
    except subprocess.CalledProcessError as e:
        elapsed = time.monotonic() - t0
        Log.error(f"{module_name} failed after {elapsed:.1f}s (exit code {e.returncode})")
        return False, elapsed
    except Exception as e:
        # Top-level boundary for launch failures (e.g. missing interpreter).
        # Report the real elapsed time, consistent with the branch above.
        elapsed = time.monotonic() - t0
        Log.error(f"Error running {module_name}: {e}")
        return False, elapsed


# ─────────────────────────────────────────────
# Shared pipeline helpers
# ─────────────────────────────────────────────

def _run_summary_and_tts(
    project_root: Path,
    language: str,
    scraped_json: Path,
    summary_out_dir: Path,
    no_dedup: bool,
    step_times: Dict[str, float],
    step_total: str = "",
) -> bool:
    """Run step 2 (summarization) and step 3 (text-to-speech) for one language.

    Args:
        project_root: Repository root containing the ``backend`` package.
        language: ``"hindi"`` selects the Hindi scripts; any other value
            selects the English scripts.
        scraped_json: Scraper output passed to the summarizer via ``--file``.
        summary_out_dir: Directory searched for the newest summarized JSON.
        no_dedup: When True, ``--no-dedup`` is forwarded to the summarizer.
        step_times: Mutated in place with "Summarization" and "TTS" timings.
        step_total: Suffix for section headers ("" or "/3").

    Returns:
        True when both steps succeed, False as soon as one fails.
    """
    stage = _LANGUAGE_STAGES["hindi" if language == "hindi" else "english"]
    backend_dir = project_root / "backend"

    # ── Step 2: Summarization ─────────────────────────────────────────────
    Log.section(f"STEP 2{step_total}: {stage['summary_title']}")
    cmd_args = ["--file", str(scraped_json)]
    if no_dedup:
        cmd_args.append("--no-dedup")
    ok, elapsed = run_module(
        backend_dir / "summarization" / stage["summary_script"],
        cmd_args,
        stage["summary_name"],
    )
    step_times["Summarization"] = elapsed
    if not ok:
        Log.error(stage["summary_error"])
        return False

    latest_summarized_json = find_latest_json(summary_out_dir)
    if not latest_summarized_json:
        Log.error(f"No summarized articles found in {summary_out_dir}")
        return False
    Log.success(f"Summarized articles: {latest_summarized_json}")

    # ── Step 3: Text-to-speech ────────────────────────────────────────────
    Log.section(f"STEP 3{step_total}: {stage['tts_title']}")
    ok, elapsed = run_module(
        backend_dir / "text_to_speech" / stage["tts_script"],
        ["--file", str(latest_summarized_json)],
        stage["tts_name"],
    )
    step_times["TTS"] = elapsed
    if not ok:
        Log.error(stage["tts_error"])
        return False
    return True


def _print_timing_summary(step_times: Dict[str, float], total: float) -> None:
    """Print the success banner followed by per-step and total timings."""
    Log.header("PIPELINE COMPLETED SUCCESSFULLY!")
    print(f"{Colors.BOLD} Timing breakdown:{Colors.RESET}")
    for step, t in step_times.items():
        print(f" {Colors.GREEN}•{Colors.RESET} {step:<25} {Colors.CYAN}{t:.1f}s{Colors.RESET}")
    print(f" {'─' * 35}")
    print(f" {Colors.BOLD}Total{Colors.RESET}{'':21} {Colors.BOLD}{Colors.CYAN}{total:.1f}s{Colors.RESET}\n")


# ─────────────────────────────────────────────
# Pipeline: category
# ─────────────────────────────────────────────

def process_category(language: str, category: str, no_dedup: bool = False) -> bool:
    """Scrape, summarize and voice one news category.

    Args:
        language: Key of ``LANGUAGE_CATEGORIES`` ("english" or "hindi").
        category: Category key; unknown keys are passed through unchanged.
        no_dedup: When True, ``--no-dedup`` is forwarded to the summarizer.

    Returns:
        True when all three steps succeed, otherwise False.

    Raises:
        KeyError: If ``language`` is not a key of ``LANGUAGE_CATEGORIES``.
    """
    project_root = get_project_root()
    scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
    articles_dir = project_root / "articles"
    summarized_dir = project_root / "summarized-articles"
    audios_dir = project_root / "audios"

    categories = LANGUAGE_CATEGORIES[language]
    category_name = categories.get(category, category)
    # Output folders are looked up under the display name in snake_case
    # (e.g. "Top News" -> "top_news").
    category_folder = category_name.lower().replace(" ", "_")

    Log.header(f"NEWS-WHISPER PIPELINE [{language.upper()}]: {category_name}")
    pipeline_start = time.monotonic()
    step_times: Dict[str, float] = {}

    # ── Step 1: Scrape ────────────────────────────────────────────────────
    Log.section("STEP 1: Web Scraping")
    ok, elapsed = run_module(
        scraper_script,
        [f"--{language}", "--category", category],
        "Web Scraper",
    )
    step_times["Scraping"] = elapsed
    if not ok:
        Log.error("Web scraping failed. Aborting.")
        return False

    scraped_dir = articles_dir / language / "categories" / category_folder
    latest_scraped_json = find_latest_json(scraped_dir)
    if not latest_scraped_json:
        Log.error(f"No scraped articles found in {scraped_dir}")
        return False
    Log.success(f"Scraped articles: {latest_scraped_json}")

    # ── Steps 2 and 3: language-specific summarization and TTS ────────────
    summarized_dir_cat = summarized_dir / language / "categories" / category_folder
    if not _run_summary_and_tts(
        project_root,
        language,
        latest_scraped_json,
        summarized_dir_cat,
        no_dedup,
        step_times,
    ):
        return False

    # ── Timing summary ────────────────────────────────────────────────────
    total = time.monotonic() - pipeline_start
    _print_timing_summary(step_times, total)

    latest_audio_dir = find_latest_audio_dir(audios_dir / language, category_name)
    if latest_audio_dir:
        audio_files = list(latest_audio_dir.glob("*.mp3")) + list(latest_audio_dir.glob("*.wav"))
        Log.success(f"Generated {len(audio_files)} audio files")
        Log.info(f"Output directory: {Colors.CYAN}{latest_audio_dir}{Colors.RESET}")
    return True


# ─────────────────────────────────────────────
# Pipeline: search
# ─────────────────────────────────────────────

def process_search(language: str, query: str, no_dedup: bool = False, pages: int = 1) -> bool:
    """Scrape, summarize and voice the results of a search query.

    Args:
        language: Must be a member of ``SEARCH_SUPPORTED``.
        query: Free-text search query passed to the scraper.
        no_dedup: When True, ``--no-dedup`` is forwarded to the summarizer.
        pages: Number of result pages to scan; values below 1 are raised to 1.

    Returns:
        True when all three steps succeed, otherwise False (including when
        search is not supported for ``language``).
    """
    if language not in SEARCH_SUPPORTED:
        Log.error(f"--search is not supported for --{language}")
        return False

    project_root = get_project_root()
    scraper_script = project_root / "backend" / "web_scraping" / "news_scrape.py"
    articles_dir = project_root / "articles"
    summarized_dir = project_root / "summarized-articles"
    audios_dir = project_root / "audios"

    safe_query = sanitize_query_folder(query)

    Log.header(f"NEWS-WHISPER PIPELINE [{language.upper()}]: Search '{query}'")
    pipeline_start = time.monotonic()
    step_times: Dict[str, float] = {}

    # ── Step 1: Scrape ────────────────────────────────────────────────────
    Log.section("STEP 1/3: Web Scraping")
    ok, elapsed = run_module(
        scraper_script,
        [f"--{language}", "--search", query, "--pages", str(max(1, pages))],
        "Web Scraper",
    )
    step_times["Scraping"] = elapsed
    if not ok:
        Log.error("Web scraping failed. Aborting.")
        return False

    scraped_dir = articles_dir / language / "search_queries" / safe_query
    latest_scraped_json = find_latest_json(scraped_dir)
    if not latest_scraped_json:
        Log.error(f"No scraped articles found in {scraped_dir}")
        return False
    Log.success(f"Scraped articles: {latest_scraped_json}")

    # ── Steps 2 and 3: language-specific summarization and TTS ────────────
    summarized_dir_q = summarized_dir / language / "search_queries" / safe_query
    if not _run_summary_and_tts(
        project_root,
        language,
        latest_scraped_json,
        summarized_dir_q,
        no_dedup,
        step_times,
        step_total="/3",
    ):
        return False

    # ── Timing summary ────────────────────────────────────────────────────
    total = time.monotonic() - pipeline_start
    _print_timing_summary(step_times, total)

    search_audio_dir = audios_dir / language / "search_queries" / safe_query
    if search_audio_dir.exists():
        # Recursive glob: audio files are counted at any depth under the
        # query folder.
        audio_files = list(search_audio_dir.glob("**/*.mp3")) + list(search_audio_dir.glob("**/*.wav"))
        Log.success(f"Generated {len(audio_files)} audio files")
        Log.info(f"Output directory: {Colors.CYAN}{search_audio_dir}{Colors.RESET}")
    return True


# ─────────────────────────────────────────────
# Display helpers
# ─────────────────────────────────────────────

def list_categories(language: str) -> None:
    """Print the category keys and display names available for ``language``.

    Raises:
        KeyError: If ``language`` is not a key of ``LANGUAGE_CATEGORIES``.
    """
    categories = LANGUAGE_CATEGORIES[language]
    Log.header(f"Available Categories [{language.upper()}]")
    for key, name in categories.items():
        print(f" {Colors.GREEN}•{Colors.RESET} {Colors.BOLD}{key:15}{Colors.RESET} {name}")
    print(f"\n{Colors.DIM}Usage: python main.py --{language} --category <name>{Colors.RESET}\n")


def show_usage() -> None:
    """Print the full CLI help text, including the current environment settings."""
    print(f"""
{Colors.BOLD}{Colors.CYAN}News-Whisper — Unified CLI{Colors.RESET}

{Colors.BOLD}Description:{Colors.RESET}
  Complete pipeline: Scraping → Summarization → Text-to-Speech
  A language flag (--english or --hindi) is always required.

{Colors.BOLD}Usage:{Colors.RESET}
  python main.py --english --list
  python main.py --english --category <name>
  python main.py --english --search "query"
  python main.py --hindi --search "पुणे"
  python main.py --hindi --search "पुणे" --pages 3
  python main.py --hindi --list
  python main.py --hindi --category <name>

{Colors.BOLD}Examples:{Colors.RESET}
  python main.py --english --category top
  python main.py --english --category sports
  python main.py --english --search "climate change"
  python main.py --hindi --search "पुणे"
  python main.py --english --search "pune" --pages 3
  python main.py --hindi --category sports
  python main.py --hindi --category politics

{Colors.BOLD}Notes:{Colors.RESET}
  • --search is available for both --english and --hindi
  • --pages / --page applies to search only and defaults to 1
  • Hindi supports extra categories: politics, latest, world, crime
  • Add --no-dedup to skip checking the Supabase registry for previously processed articles

{Colors.BOLD}Environment:{Colors.RESET}
  Device: {Colors.CYAN}{os.getenv('DEVICE', 'cpu').upper()}{Colors.RESET}
  Max Workers: {Colors.CYAN}{os.getenv('MAX_WORKERS', '4')}{Colors.RESET}
""")


# ─────────────────────────────────────────────
# Entry point
# ─────────────────────────────────────────────

def main():
    """Parse the command line and dispatch to the requested pipeline.

    The language flag (--english / --hindi) is extracted by hand before
    argparse runs, so it may appear anywhere on the command line. The process
    always terminates through ``sys.exit``: 0 on success or when only help or
    a listing was shown, 1 on a usage error or pipeline failure.
    """
    language = None
    clean_args = []
    for arg in sys.argv[1:]:
        key = arg.lstrip("-").lower()
        # Only dash-prefixed tokens count as language flags. A bare word such
        # as the "english" in `--search english premier league` is part of
        # the query and must be left for argparse.
        if arg.startswith("-") and key in LANGUAGE_CATEGORIES:
            language = key
        else:
            clean_args.append(arg)

    if not sys.argv[1:]:
        show_usage()
        sys.exit(0)

    if language is None:
        Log.error("A language flag is required: --english or --hindi")
        show_usage()
        sys.exit(1)

    # add_help=False: --help is handled manually so the custom usage text is
    # shown instead of argparse's generated help.
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--list', action='store_true')
    group.add_argument('--category', '-c', type=str)
    group.add_argument('--search', '-s', type=str, nargs='+')
    parser.add_argument('--pages', '--page', type=int, default=1,
                        help="Number of search result pages to scan")
    parser.add_argument('--no-dedup', action='store_true',
                        help="Skip Supabase deduplication")
    parser.add_argument('--help', '-h', action='store_true')

    args = parser.parse_args(clean_args)

    if args.help or not clean_args:
        show_usage()
        sys.exit(0)

    if args.pages < 1:
        Log.error("--pages must be at least 1")
        sys.exit(1)

    Log.info(f"Language : {Colors.CYAN}{language.upper()}{Colors.RESET}")
    Log.info(f"Device : {Colors.CYAN}{os.getenv('DEVICE', 'cpu').upper()}{Colors.RESET}")
    Log.info(f"Max Workers: {Colors.CYAN}{os.getenv('MAX_WORKERS', '4')}{Colors.RESET}")

    if args.list:
        list_categories(language)
        sys.exit(0)

    if args.category:
        category = args.category.lower()
        if category not in LANGUAGE_CATEGORIES[language]:
            Log.warning(f"'{category}' is not a predefined category for --{language}. Proceeding anyway.")
        success = process_category(language, category, no_dedup=args.no_dedup)
        sys.exit(0 if success else 1)

    if args.search:
        # nargs='+' lets an unquoted multi-word query through; rejoin it.
        query = " ".join(args.search)
        success = process_search(language, query, no_dedup=args.no_dedup, pages=args.pages)
        sys.exit(0 if success else 1)

    # Only option flags (e.g. --no-dedup) were given: nothing to run.
    show_usage()
    sys.exit(0)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print(f"\n\n{Colors.YELLOW}Pipeline cancelled by user.{Colors.RESET}\n")
        sys.exit(0)
    except Exception as e:
        # Log a one-line summary, then re-raise so the traceback is shown.
        Log.error(f"Critical error: {str(e)}")
        raise