Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| News-Whisper — Unified CLI | |
| ============================ | |
| Orchestrates the complete pipeline: Scraping → Summarization → Text-to-Speech | |
| This is the main entry point. It dispatches to the correct language-specific | |
| modules based on the --english or --hindi flag. | |
| Usage: | |
| python main.py --english --category top # Process top news (English) | |
| python main.py --english --search "climate" # Search and process (English) | |
| python main.py --hindi --category sports # Process sports (Hindi) | |
| python main.py --hindi --search "पुणे" # Search and process (Hindi) | |
| python main.py --english --list # List English categories | |
| python main.py --hindi --list # List Hindi categories | |
| Pipeline Flow: | |
| English: news_scrape.py → english_summary.py → english_tts.py | |
| Hindi: news_scrape.py → hindi_summary.py → hindi_tts.py | |
| """ | |
| import sys | |
| import os | |
| sys.stdout.reconfigure(encoding='utf-8') | |
| import time | |
| import argparse | |
| from pathlib import Path | |
| from typing import Tuple | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| # Import shared utilities | |
| from backend.common.colors import Colors, Log | |
| from backend.common.paths import ( | |
| get_project_root, sanitize_query_folder, | |
| find_latest_json, find_latest_audio_dir | |
| ) | |
| import subprocess | |
| # ───────────────────────────────────────────── | |
| # Language / category definitions | |
| # ───────────────────────────────────────────── | |
# Category slug -> display name for each language. Insertion order is kept
# because the tables are iterated as-is when categories are listed.
_ENGLISH_CATEGORIES = {
    "top": "Top News",
    "business": "Business",
    "entertainment": "Entertainment",
    "sports": "Sports",
    "lifestyle": "Lifestyle",
    "technology": "Technology",
    "elections": "Elections",
}

_HINDI_CATEGORIES = {
    "top": "Top News",
    "entertainment": "Entertainment",
    "sports": "Sports",
    "politics": "Politics",
    "latest": "Latest News",
    "technology": "Technology",
    "lifestyle": "Lifestyle",
    "business": "Business",
    "world": "World News",
    "crime": "Crime",
}

# Keyed by the language name that follows the leading dashes of the CLI flag.
LANGUAGE_CATEGORIES = {
    "english": _ENGLISH_CATEGORIES,
    "hindi": _HINDI_CATEGORIES,
}

# Languages for which the --search pipeline may be run.
SEARCH_SUPPORTED = {"english", "hindi"}
| # ───────────────────────────────────────────── | |
| # Subprocess runner | |
| # ───────────────────────────────────────────── | |
def run_module(script_path: Path, args: list, module_name: str) -> Tuple[bool, float]:
    """Run a backend script as a subprocess.

    The script is launched with the current interpreter (``sys.executable``);
    no output capture is configured on the child process.

    Args:
        script_path: Path of the Python script to execute.
        args: Command-line arguments appended after the script path.
        module_name: Human-readable name used in log messages.

    Returns:
        ``(success, elapsed_seconds)``. ``success`` is False when the script
        exits with a non-zero status or any other error occurs while running
        it. ``elapsed_seconds`` is the time measured since the call started,
        on every path.
    """
    t0 = time.monotonic()
    try:
        Log.info(f"Running {module_name}…")
        subprocess.run(
            [sys.executable, str(script_path), *args],
            check=True,  # a non-zero exit status raises CalledProcessError
        )
        elapsed = time.monotonic() - t0
        Log.info(f"{module_name} finished in {Colors.CYAN}{elapsed:.1f}s{Colors.RESET}")
        return True, elapsed
    except subprocess.CalledProcessError as e:
        elapsed = time.monotonic() - t0
        Log.error(f"{module_name} failed after {elapsed:.1f}s (exit code {e.returncode})")
        return False, elapsed
    except Exception as e:
        # Report the real elapsed time here too, so timing stays accurate
        # when the failure is something other than a non-zero exit status.
        elapsed = time.monotonic() - t0
        Log.error(f"Error running {module_name}: {e}")
        return False, elapsed
| # ───────────────────────────────────────────── | |
| # Pipeline: category | |
| # ───────────────────────────────────────────── | |
def process_category(language: str, category: str, no_dedup: bool = False) -> bool:
    """Run the scrape -> summarize -> text-to-speech pipeline for one category.

    Each stage is executed through ``run_module``; after the scraping and
    summarization stages the newest JSON in the expected output folder is
    looked up and handed to the next stage.

    Args:
        language: Language key; must be a key of ``LANGUAGE_CATEGORIES``.
        category: Category slug passed to the scraper. A slug that is not in
            the language's table is used verbatim as the display name.
        no_dedup: When true, ``--no-dedup`` is forwarded to the summarizer.

    Returns:
        True when all three stages succeed and their output JSON files are
        found; False as soon as a stage fails, an expected output is missing,
        or the language is unknown.
    """
    if language not in LANGUAGE_CATEGORIES:
        # Log and return like the other failure paths instead of raising KeyError.
        Log.error(f"--category is not supported for --{language}")
        return False

    project_root = get_project_root()
    backend_dir = project_root / "backend"
    scraper_script = backend_dir / "web_scraping" / "news_scrape.py"
    articles_dir = project_root / "articles"
    summarized_dir = project_root / "summarized-articles"
    audios_dir = project_root / "audios"

    # Both language pipelines run the same steps; only the scripts, section
    # titles and messages differ, so pick them once up front.
    if language == "hindi":
        summary_script = backend_dir / "summarization" / "hindi_summary.py"
        summary_title = "STEP 2: Hindi Summarization (mT5 + Groq)"
        summary_label = "Hindi Summarizer"
        summary_failure = "Hindi summarization failed."
        tts_script = backend_dir / "text_to_speech" / "hindi_tts.py"
        tts_title = "STEP 3: Hindi TTS (gTTS + FFmpeg)"
        tts_label = "Hindi TTS"
        tts_failure = "Hindi TTS generation failed."
    else:
        summary_script = backend_dir / "summarization" / "english_summary.py"
        summary_title = "STEP 2: Article Summarization"
        summary_label = "English Summarizer"
        summary_failure = "Summarization failed. Aborting."
        tts_script = backend_dir / "text_to_speech" / "english_tts.py"
        tts_title = "STEP 3: Text-to-Speech Generation"
        tts_label = "English TTS"
        tts_failure = "TTS generation failed."

    category_name = LANGUAGE_CATEGORIES[language].get(category, category)
    # Output folders are named after the display name, lower-cased with underscores.
    category_folder = category_name.lower().replace(" ", "_")

    Log.header(f"NEWS-WHISPER PIPELINE [{language.upper()}]: {category_name}")
    pipeline_start = time.monotonic()
    step_times: dict = {}

    # -- Step 1: scrape ------------------------------------------------------
    Log.section("STEP 1: Web Scraping")
    ok, elapsed = run_module(
        scraper_script,
        [f"--{language}", "--category", category],
        "Web Scraper",
    )
    step_times["Scraping"] = elapsed
    if not ok:
        Log.error("Web scraping failed. Aborting.")
        return False

    scraped_dir = articles_dir / language / "categories" / category_folder
    latest_scraped_json = find_latest_json(scraped_dir)
    if not latest_scraped_json:
        Log.error(f"No scraped articles found in {scraped_dir}")
        return False
    Log.success(f"Scraped articles: {latest_scraped_json}")

    # -- Step 2: summarize ---------------------------------------------------
    Log.section(summary_title)
    summary_args = ["--file", str(latest_scraped_json)]
    if no_dedup:
        summary_args.append("--no-dedup")
    ok, elapsed = run_module(summary_script, summary_args, summary_label)
    step_times["Summarization"] = elapsed
    if not ok:
        Log.error(summary_failure)
        return False

    summarized_dir_cat = summarized_dir / language / "categories" / category_folder
    latest_summarized_json = find_latest_json(summarized_dir_cat)
    if not latest_summarized_json:
        Log.error(f"No summarized articles found in {summarized_dir_cat}")
        return False
    Log.success(f"Summarized articles: {latest_summarized_json}")

    # -- Step 3: text-to-speech ----------------------------------------------
    Log.section(tts_title)
    ok, elapsed = run_module(
        tts_script,
        ["--file", str(latest_summarized_json)],
        tts_label,
    )
    step_times["TTS"] = elapsed
    if not ok:
        Log.error(tts_failure)
        return False

    # -- Timing summary ------------------------------------------------------
    total = time.monotonic() - pipeline_start
    Log.header("PIPELINE COMPLETED SUCCESSFULLY!")
    print(f"{Colors.BOLD} Timing breakdown:{Colors.RESET}")
    for step, t in step_times.items():
        print(f" {Colors.GREEN}•{Colors.RESET} {step:<25} {Colors.CYAN}{t:.1f}s{Colors.RESET}")
    print(f" {'─' * 35}")
    print(f" {Colors.BOLD}Total{Colors.RESET}{'':21} {Colors.BOLD}{Colors.CYAN}{total:.1f}s{Colors.RESET}\n")

    # The audio report is informational only; a missing directory does not
    # turn a completed run into a failure.
    latest_audio_dir = find_latest_audio_dir(audios_dir / language, category_name)
    if latest_audio_dir:
        audio_files = [*latest_audio_dir.glob("*.mp3"), *latest_audio_dir.glob("*.wav")]
        Log.success(f"Generated {len(audio_files)} audio files")
        Log.info(f"Output directory: {Colors.CYAN}{latest_audio_dir}{Colors.RESET}")
    return True
| # ───────────────────────────────────────────── | |
| # Pipeline: search | |
| # ───────────────────────────────────────────── | |
def process_search(language: str, query: str, no_dedup: bool = False, pages: int = 1) -> bool:
    """Run the scrape -> summarize -> text-to-speech pipeline for a search query.

    Each stage is executed through ``run_module``; after the scraping and
    summarization stages the newest JSON in the query's output folder is
    looked up and handed to the next stage.

    Args:
        language: Language key; must be in ``SEARCH_SUPPORTED``.
        query: Search text passed to the scraper. The output folder name is
            derived from it with ``sanitize_query_folder``.
        no_dedup: When true, ``--no-dedup`` is forwarded to the summarizer.
        pages: Number of result pages to scan; values below 1 are raised to 1
            before being forwarded to the scraper.

    Returns:
        True when all three stages succeed and their output JSON files are
        found; False as soon as a stage fails, an expected output is missing,
        or search is not supported for the language.
    """
    if language not in SEARCH_SUPPORTED:
        Log.error(f"--search is not supported for --{language}")
        return False

    project_root = get_project_root()
    backend_dir = project_root / "backend"
    scraper_script = backend_dir / "web_scraping" / "news_scrape.py"
    articles_dir = project_root / "articles"
    summarized_dir = project_root / "summarized-articles"
    audios_dir = project_root / "audios"

    # Both language pipelines run the same steps; only the scripts, section
    # titles and messages differ, so pick them once up front.
    if language == "hindi":
        summary_script = backend_dir / "summarization" / "hindi_summary.py"
        summary_title = "STEP 2/3: Hindi Summarization (mT5 + Groq)"
        summary_label = "Hindi Summarizer"
        summary_failure = "Hindi summarization failed."
        tts_script = backend_dir / "text_to_speech" / "hindi_tts.py"
        tts_title = "STEP 3/3: Hindi TTS (gTTS + FFmpeg)"
        tts_label = "Hindi TTS"
        tts_failure = "Hindi TTS generation failed."
    else:
        summary_script = backend_dir / "summarization" / "english_summary.py"
        summary_title = "STEP 2/3: Article Summarization"
        summary_label = "English Summarizer"
        summary_failure = "Summarization failed. Aborting."
        tts_script = backend_dir / "text_to_speech" / "english_tts.py"
        tts_title = "STEP 3/3: Text-to-Speech Generation"
        tts_label = "English TTS"
        tts_failure = "TTS generation failed."

    safe_query = sanitize_query_folder(query)

    Log.header(f"NEWS-WHISPER PIPELINE [{language.upper()}]: Search '{query}'")
    pipeline_start = time.monotonic()
    step_times: dict = {}

    # -- Step 1: scrape ------------------------------------------------------
    Log.section("STEP 1/3: Web Scraping")
    ok, elapsed = run_module(
        scraper_script,
        [f"--{language}", "--search", query, "--pages", str(max(1, pages))],
        "Web Scraper",
    )
    step_times["Scraping"] = elapsed
    if not ok:
        Log.error("Web scraping failed. Aborting.")
        return False

    scraped_dir = articles_dir / language / "search_queries" / safe_query
    latest_scraped_json = find_latest_json(scraped_dir)
    if not latest_scraped_json:
        Log.error(f"No scraped articles found in {scraped_dir}")
        return False
    Log.success(f"Scraped articles: {latest_scraped_json}")

    # -- Step 2: summarize ---------------------------------------------------
    Log.section(summary_title)
    summary_args = ["--file", str(latest_scraped_json)]
    if no_dedup:
        summary_args.append("--no-dedup")
    ok, elapsed = run_module(summary_script, summary_args, summary_label)
    step_times["Summarization"] = elapsed
    if not ok:
        Log.error(summary_failure)
        return False

    summarized_dir_q = summarized_dir / language / "search_queries" / safe_query
    latest_summarized_json = find_latest_json(summarized_dir_q)
    if not latest_summarized_json:
        Log.error(f"No summarized articles found in {summarized_dir_q}")
        return False
    Log.success(f"Summarized articles: {latest_summarized_json}")

    # -- Step 3: text-to-speech ----------------------------------------------
    Log.section(tts_title)
    ok, elapsed = run_module(
        tts_script,
        ["--file", str(latest_summarized_json)],
        tts_label,
    )
    step_times["TTS"] = elapsed
    if not ok:
        Log.error(tts_failure)
        return False

    # -- Timing summary ------------------------------------------------------
    total = time.monotonic() - pipeline_start
    Log.header("PIPELINE COMPLETED SUCCESSFULLY!")
    print(f"{Colors.BOLD} Timing breakdown:{Colors.RESET}")
    for step, t in step_times.items():
        print(f" {Colors.GREEN}•{Colors.RESET} {step:<25} {Colors.CYAN}{t:.1f}s{Colors.RESET}")
    print(f" {'─' * 35}")
    print(f" {Colors.BOLD}Total{Colors.RESET}{'':21} {Colors.BOLD}{Colors.CYAN}{total:.1f}s{Colors.RESET}\n")

    # The audio report is informational only; a missing directory does not
    # turn a completed run into a failure. Files are counted recursively.
    search_audio_dir = audios_dir / language / "search_queries" / safe_query
    if search_audio_dir.exists():
        audio_files = [*search_audio_dir.glob("**/*.mp3"), *search_audio_dir.glob("**/*.wav")]
        Log.success(f"Generated {len(audio_files)} audio files")
        Log.info(f"Output directory: {Colors.CYAN}{search_audio_dir}{Colors.RESET}")
    return True
| # ───────────────────────────────────────────── | |
| # Display helpers | |
| # ───────────────────────────────────────────── | |
def list_categories(language: str):
    """Print the category slugs and display names available for a language.

    Args:
        language: Key into ``LANGUAGE_CATEGORIES``; an unknown key raises
            ``KeyError``.
    """
    categories = LANGUAGE_CATEGORIES[language]
    Log.header(f"Available Categories [{language.upper()}]")
    for key, name in categories.items():
        print(f" {Colors.GREEN}•{Colors.RESET} {Colors.BOLD}{key:15}{Colors.RESET} {name}")
    print(f"\n{Colors.DIM}Usage: python main.py --{language} --category <name>{Colors.RESET}\n")
def show_usage():
    """Print the CLI help text.

    The final line reports the ``DEVICE`` and ``MAX_WORKERS`` environment
    variables, falling back to ``cpu`` and ``4`` when they are unset.
    """
    print(f"""
{Colors.BOLD}{Colors.CYAN}News-Whisper — Unified CLI{Colors.RESET}
{Colors.BOLD}Description:{Colors.RESET}
Complete pipeline: Scraping → Summarization → Text-to-Speech
A language flag (--english or --hindi) is always required.
{Colors.BOLD}Usage:{Colors.RESET}
python main.py --english --list
python main.py --english --category <name>
python main.py --english --search "query"
python main.py --hindi --search "पुणे"
python main.py --hindi --search "पुणे" --pages 3
python main.py --hindi --list
python main.py --hindi --category <name>
{Colors.BOLD}Examples:{Colors.RESET}
python main.py --english --category top
python main.py --english --category sports
python main.py --english --search "climate change"
python main.py --hindi --search "पुणे"
python main.py --english --search "pune" --pages 3
python main.py --hindi --category sports
python main.py --hindi --category politics
{Colors.BOLD}Notes:{Colors.RESET}
• --search is available for both --english and --hindi
• --pages / --page applies to search only and defaults to 1
• Hindi supports extra categories: politics, latest, world, crime
• Add --no-dedup to skip checking the Supabase registry for previously processed articles
{Colors.BOLD}Environment:{Colors.RESET}
Device: {Colors.CYAN}{os.getenv('DEVICE', 'cpu').upper()}{Colors.RESET} Max Workers: {Colors.CYAN}{os.getenv('MAX_WORKERS', '4')}{Colors.RESET}
""")
| # ───────────────────────────────────────────── | |
| # Entry point | |
| # ───────────────────────────────────────────── | |
def main():
    """Parse the command line and dispatch to the list, category or search action.

    The language flag is pulled out of ``sys.argv`` by hand, matched
    case-insensitively against the keys of ``LANGUAGE_CATEGORIES``; the
    remaining arguments go through ``argparse``. The function always ends by
    calling ``sys.exit``: 0 for help, listing, or a successful pipeline, 1 for
    a missing language flag, an invalid ``--pages`` value, or a failed
    pipeline.
    """
    raw_args = sys.argv[1:]
    if not raw_args:
        show_usage()
        sys.exit(0)

    language = None
    clean_args = []
    for arg in raw_args:
        key = arg.lstrip("-").lower()
        # Only dash-prefixed tokens count as language flags. A bare word such
        # as "hindi" in `--search hindi cinema` must stay part of the query
        # instead of silently switching the language.
        if arg.startswith("-") and key in LANGUAGE_CATEGORIES:
            language = key
        else:
            clean_args.append(arg)

    if language is None:
        # Asking for help is not an error, even without a language flag.
        if "--help" in clean_args or "-h" in clean_args:
            show_usage()
            sys.exit(0)
        Log.error("A language flag is required: --english or --hindi")
        show_usage()
        sys.exit(1)

    # add_help=False: help is rendered by show_usage() rather than argparse.
    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--list', action='store_true')
    group.add_argument('--category', '-c', type=str)
    group.add_argument('--search', '-s', type=str, nargs='+')
    parser.add_argument('--pages', '--page', type=int, default=1, help="Number of search result pages to scan")
    parser.add_argument('--no-dedup', action='store_true', help="Skip Supabase deduplication")
    parser.add_argument('--help', '-h', action='store_true')
    args = parser.parse_args(clean_args)

    if args.help or not clean_args:
        show_usage()
        sys.exit(0)
    if args.pages < 1:
        Log.error("--pages must be at least 1")
        sys.exit(1)

    Log.info(f"Language : {Colors.CYAN}{language.upper()}{Colors.RESET}")
    Log.info(f"Device : {Colors.CYAN}{os.getenv('DEVICE', 'cpu').upper()}{Colors.RESET}")
    Log.info(f"Max Workers: {Colors.CYAN}{os.getenv('MAX_WORKERS', '4')}{Colors.RESET}")

    if args.list:
        list_categories(language)
        sys.exit(0)

    if args.category:
        category = args.category.lower()
        if category not in LANGUAGE_CATEGORIES[language]:
            Log.warning(f"'{category}' is not a predefined category for --{language}. Proceeding anyway.")
        success = process_category(language, category, no_dedup=args.no_dedup)
        sys.exit(0 if success else 1)

    if args.search:
        # nargs='+' yields a list of words; rejoin them into one query string.
        query = " ".join(args.search)
        success = process_search(language, query, no_dedup=args.no_dedup, pages=args.pages)
        sys.exit(0 if success else 1)

    # No action flag was given (e.g. only --pages or --no-dedup).
    show_usage()
    sys.exit(0)
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print(f"\n\n{Colors.YELLOW}Pipeline cancelled by user.{Colors.RESET}\n")
        # Exit non-zero so wrappers and schedulers can tell a cancelled run
        # from a completed one; 130 is the usual 128 + SIGINT status.
        sys.exit(130)
    except Exception as e:
        # Log a one-line summary, then re-raise so the traceback is still shown.
        Log.error(f"Critical error: {e}")
        raise