news-whisper-api / main.py
Devang1290
feat: deploy News Whisper on-demand search API (FastAPI + Docker)
2cb327c
#!/usr/bin/env python3
"""
News-Whisper — Unified CLI
============================
Orchestrates the complete pipeline: Scraping → Summarization → Text-to-Speech
This is the main entry point. It dispatches to the correct language-specific
modules based on the --english or --hindi flag.
Usage:
python main.py --english --category top # Process top news (English)
python main.py --english --search "climate" # Search and process (English)
python main.py --hindi --category sports # Process sports (Hindi)
python main.py --hindi --search "पुणे" # Search and process (Hindi)
python main.py --english --list # List English categories
python main.py --hindi --list # List Hindi categories
Pipeline Flow:
English: news_scrape.py → english_summary.py → english_tts.py
Hindi: news_scrape.py → hindi_summary.py → hindi_tts.py
"""
import sys
import os
sys.stdout.reconfigure(encoding='utf-8')
import time
import argparse
from pathlib import Path
from typing import Tuple
from dotenv import load_dotenv
load_dotenv()
# Import shared utilities
from backend.common.colors import Colors, Log
from backend.common.paths import (
get_project_root, sanitize_query_folder,
find_latest_json, find_latest_audio_dir
)
import subprocess
# ─────────────────────────────────────────────
# Language / category definitions
# ─────────────────────────────────────────────
# Category slug -> display name for the English pipeline.
# Insertion order is the order in which `--list` prints the entries.
_ENGLISH_CATEGORIES = {
    "top": "Top News",
    "business": "Business",
    "entertainment": "Entertainment",
    "sports": "Sports",
    "lifestyle": "Lifestyle",
    "technology": "Technology",
    "elections": "Elections",
}

# Category slug -> display name for the Hindi pipeline.
_HINDI_CATEGORIES = {
    "top": "Top News",
    "entertainment": "Entertainment",
    "sports": "Sports",
    "politics": "Politics",
    "latest": "Latest News",
    "technology": "Technology",
    "lifestyle": "Lifestyle",
    "business": "Business",
    "world": "World News",
    "crime": "Crime",
}

# Language name -> its category table. The keys double as the accepted
# language flags (`--english`, `--hindi`) on the command line.
LANGUAGE_CATEGORIES = {
    "english": _ENGLISH_CATEGORIES,
    "hindi": _HINDI_CATEGORIES,
}

# Languages for which `--search` is available.
SEARCH_SUPPORTED = {"hindi", "english"}
# ─────────────────────────────────────────────
# Subprocess runner
# ─────────────────────────────────────────────
def run_module(script_path: Path, args: list, module_name: str) -> Tuple[bool, float]:
    """Run a backend script as a subprocess with the current interpreter.

    Args:
        script_path: Path of the Python script to execute.
        args: Command-line arguments appended after the script path. Each
            item is converted with ``str`` so ``Path`` values are accepted.
        module_name: Human-readable name used in the log messages.

    Returns:
        ``(success, elapsed_seconds)``. ``success`` is True only when the
        child process exits with status 0. ``elapsed_seconds`` is the wall
        time spent in this call, on the failure paths as well.
    """
    t0 = time.monotonic()
    Log.info(f"Running {module_name}…")
    try:
        # List form (no shell): arguments such as search queries are passed
        # to the child verbatim.
        subprocess.run(
            [sys.executable, str(script_path), *map(str, args)],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        elapsed = time.monotonic() - t0
        Log.error(f"{module_name} failed after {elapsed:.1f}s (exit code {e.returncode})")
        return False, elapsed
    except Exception as e:
        # Launch failures (e.g. OSError): still report the real elapsed time
        # so the caller's timing table stays consistent.
        elapsed = time.monotonic() - t0
        Log.error(f"Error running {module_name}: {e}")
        return False, elapsed

    elapsed = time.monotonic() - t0
    Log.info(f"{module_name} finished in {Colors.CYAN}{elapsed:.1f}s{Colors.RESET}")
    return True, elapsed
# ─────────────────────────────────────────────
# Pipeline: category
# ─────────────────────────────────────────────
# Per-language settings for the summarization and TTS stages. Both the
# category pipeline and the search pipeline read from this table, so the
# script names, section titles and messages are defined exactly once.
_STAGE_CONFIG = {
    "hindi": {
        "summary_script": "hindi_summary.py",
        "summary_title": "Hindi Summarization (mT5 + Groq)",
        "summary_name": "Hindi Summarizer",
        "summary_error": "Hindi summarization failed.",
        "tts_script": "hindi_tts.py",
        "tts_title": "Hindi TTS (gTTS + FFmpeg)",
        "tts_name": "Hindi TTS",
        "tts_error": "Hindi TTS generation failed.",
    },
    "english": {
        "summary_script": "english_summary.py",
        "summary_title": "Article Summarization",
        "summary_name": "English Summarizer",
        "summary_error": "Summarization failed. Aborting.",
        "tts_script": "english_tts.py",
        "tts_title": "Text-to-Speech Generation",
        "tts_name": "English TTS",
        "tts_error": "TTS generation failed.",
    },
}


def _run_stages(
    project_root: Path,
    language: str,
    scrape_args: list,
    output_kind: str,
    output_folder: str,
    *,
    no_dedup: bool = False,
    step_suffix: str = "",
) -> Tuple[bool, dict]:
    """Run scrape -> summarize -> TTS and collect per-step timings.

    Args:
        project_root: Repository root containing ``backend/``, ``articles/``
            and ``summarized-articles/``.
        language: Language name; passed to the scraper as ``--<language>``
            and used as a directory component.
        scrape_args: Extra scraper arguments following the language flag.
        output_kind: Directory level below the language folder
            (``"categories"`` or ``"search_queries"``).
        output_folder: Folder below ``output_kind`` where the stage outputs
            are looked up.
        no_dedup: When True, ``--no-dedup`` is forwarded to the summarizer.
        step_suffix: Text inserted after the step number in section titles
            (``""`` gives ``STEP 1``, ``"/3"`` gives ``STEP 1/3``).

    Returns:
        ``(success, step_times)`` where ``step_times`` maps a step label to
        its elapsed seconds for every step that was started.
    """
    backend_dir = project_root / "backend"
    # Any language other than Hindi falls back to the English stages.
    stage = _STAGE_CONFIG.get(language, _STAGE_CONFIG["english"])
    step_times: dict = {}

    # ── Step 1: Scrape ────────────────────────────────────────────────────
    Log.section(f"STEP 1{step_suffix}: Web Scraping")
    ok, elapsed = run_module(
        backend_dir / "web_scraping" / "news_scrape.py",
        [f"--{language}", *scrape_args],
        "Web Scraper",
    )
    step_times["Scraping"] = elapsed
    if not ok:
        Log.error("Web scraping failed. Aborting.")
        return False, step_times

    scraped_dir = project_root / "articles" / language / output_kind / output_folder
    latest_scraped_json = find_latest_json(scraped_dir)
    if not latest_scraped_json:
        Log.error(f"No scraped articles found in {scraped_dir}")
        return False, step_times
    Log.success(f"Scraped articles: {latest_scraped_json}")

    # ── Step 2: Summarization ─────────────────────────────────────────────
    Log.section(f"STEP 2{step_suffix}: {stage['summary_title']}")
    summary_args = ["--file", str(latest_scraped_json)]
    if no_dedup:
        summary_args.append("--no-dedup")
    ok, elapsed = run_module(
        backend_dir / "summarization" / stage["summary_script"],
        summary_args,
        stage["summary_name"],
    )
    step_times["Summarization"] = elapsed
    if not ok:
        Log.error(stage["summary_error"])
        return False, step_times

    summarized_dir = project_root / "summarized-articles" / language / output_kind / output_folder
    latest_summarized_json = find_latest_json(summarized_dir)
    if not latest_summarized_json:
        Log.error(f"No summarized articles found in {summarized_dir}")
        return False, step_times
    Log.success(f"Summarized articles: {latest_summarized_json}")

    # ── Step 3: Text-to-speech ────────────────────────────────────────────
    Log.section(f"STEP 3{step_suffix}: {stage['tts_title']}")
    ok, elapsed = run_module(
        backend_dir / "text_to_speech" / stage["tts_script"],
        ["--file", str(latest_summarized_json)],
        stage["tts_name"],
    )
    step_times["TTS"] = elapsed
    if not ok:
        Log.error(stage["tts_error"])
        return False, step_times

    return True, step_times


def _print_timing_summary(step_times: dict, total: float) -> None:
    """Print the success banner followed by the per-step and total timings."""
    Log.header("PIPELINE COMPLETED SUCCESSFULLY!")
    print(f"{Colors.BOLD} Timing breakdown:{Colors.RESET}")
    for step, t in step_times.items():
        print(f" {Colors.GREEN}•{Colors.RESET} {step:<25} {Colors.CYAN}{t:.1f}s{Colors.RESET}")
    print(f" {'─' * 35}")
    print(f" {Colors.BOLD}Total{Colors.RESET}{'':21} {Colors.BOLD}{Colors.CYAN}{total:.1f}s{Colors.RESET}\n")


def process_category(language: str, category: str, no_dedup: bool = False) -> bool:
    """Run the full pipeline for one news category.

    Args:
        language: Key of ``LANGUAGE_CATEGORIES`` (``"english"`` or ``"hindi"``).
        category: Category slug passed to the scraper. Slugs missing from the
            language's table are used as-is for the display and folder name.
        no_dedup: Forward ``--no-dedup`` to the summarizer.

    Returns:
        True when all three stages succeed, False otherwise (including an
        unknown ``language``, which is reported instead of raising KeyError).
    """
    if language not in LANGUAGE_CATEGORIES:
        Log.error(f"Unsupported language '{language}'")
        return False

    project_root = get_project_root()
    category_name = LANGUAGE_CATEGORIES[language].get(category, category)
    # Output folders are derived from the display name ("Top News" -> "top_news").
    category_folder = category_name.lower().replace(" ", "_")

    Log.header(f"NEWS-WHISPER PIPELINE [{language.upper()}]: {category_name}")
    pipeline_start = time.monotonic()
    ok, step_times = _run_stages(
        project_root,
        language,
        ["--category", category],
        "categories",
        category_folder,
        no_dedup=no_dedup,
    )
    if not ok:
        return False

    # ── Timing summary ────────────────────────────────────────────────────
    _print_timing_summary(step_times, time.monotonic() - pipeline_start)

    latest_audio_dir = find_latest_audio_dir(project_root / "audios" / language, category_name)
    if latest_audio_dir:
        audio_files = [*latest_audio_dir.glob("*.mp3"), *latest_audio_dir.glob("*.wav")]
        Log.success(f"Generated {len(audio_files)} audio files")
        Log.info(f"Output directory: {Colors.CYAN}{latest_audio_dir}{Colors.RESET}")
    return True


# ─────────────────────────────────────────────
# Pipeline: search
# ─────────────────────────────────────────────
def process_search(language: str, query: str, no_dedup: bool = False, pages: int = 1) -> bool:
    """Run the full pipeline for a free-text search query.

    Args:
        language: Must be a member of ``SEARCH_SUPPORTED``.
        query: Search text passed to the scraper. A blank query is rejected.
        no_dedup: Forward ``--no-dedup`` to the summarizer.
        pages: Number of result pages requested from the scraper; values
            below 1 are raised to 1.

    Returns:
        True when all three stages succeed, False otherwise.
    """
    if language not in SEARCH_SUPPORTED:
        Log.error(f"--search is not supported for --{language}")
        return False
    if not query or not query.strip():
        # A blank query has no meaningful results folder to look in.
        Log.error("Search query must not be empty")
        return False

    project_root = get_project_root()
    safe_query = sanitize_query_folder(query)

    Log.header(f"NEWS-WHISPER PIPELINE [{language.upper()}]: Search '{query}'")
    pipeline_start = time.monotonic()
    ok, step_times = _run_stages(
        project_root,
        language,
        ["--search", query, "--pages", str(max(1, pages))],
        "search_queries",
        safe_query,
        no_dedup=no_dedup,
        step_suffix="/3",
    )
    if not ok:
        return False

    # ── Timing summary ────────────────────────────────────────────────────
    _print_timing_summary(step_times, time.monotonic() - pipeline_start)

    search_audio_dir = project_root / "audios" / language / "search_queries" / safe_query
    if search_audio_dir.exists():
        # Recursive glob: audio files in nested folders are counted too.
        audio_files = [*search_audio_dir.glob("**/*.mp3"), *search_audio_dir.glob("**/*.wav")]
        Log.success(f"Generated {len(audio_files)} audio files")
        Log.info(f"Output directory: {Colors.CYAN}{search_audio_dir}{Colors.RESET}")
    return True
# ─────────────────────────────────────────────
# Display helpers
# ─────────────────────────────────────────────
def list_categories(language: str) -> None:
    """Print the category slugs and display names available for ``language``.

    Args:
        language: Key of ``LANGUAGE_CATEGORIES``.
    """
    categories = LANGUAGE_CATEGORIES[language]
    Log.header(f"Available Categories [{language.upper()}]")
    for key, name in categories.items():
        # Slug padded to 15 columns so the display names line up.
        print(f" {Colors.GREEN}•{Colors.RESET} {Colors.BOLD}{key:15}{Colors.RESET} {name}")
    print(f"\n{Colors.DIM}Usage: python main.py --{language} --category <name>{Colors.RESET}\n")
def show_usage() -> None:
    """Print the CLI help text.

    The text ends with the current ``DEVICE`` and ``MAX_WORKERS`` environment
    values, shown as ``cpu`` and ``4`` when they are unset.
    """
    print(f"""
{Colors.BOLD}{Colors.CYAN}News-Whisper — Unified CLI{Colors.RESET}
{Colors.BOLD}Description:{Colors.RESET}
Complete pipeline: Scraping → Summarization → Text-to-Speech
A language flag (--english or --hindi) is always required.
{Colors.BOLD}Usage:{Colors.RESET}
python main.py --english --list
python main.py --english --category <name>
python main.py --english --search "query"
python main.py --hindi --search "पुणे"
python main.py --hindi --search "पुणे" --pages 3
python main.py --hindi --list
python main.py --hindi --category <name>
{Colors.BOLD}Examples:{Colors.RESET}
python main.py --english --category top
python main.py --english --category sports
python main.py --english --search "climate change"
python main.py --hindi --search "पुणे"
python main.py --english --search "pune" --pages 3
python main.py --hindi --category sports
python main.py --hindi --category politics
{Colors.BOLD}Notes:{Colors.RESET}
• --search is available for both --english and --hindi
• --pages / --page applies to search only and defaults to 1
• Hindi supports extra categories: politics, latest, world, crime
• Add --no-dedup to skip checking the Supabase registry for previously processed articles
{Colors.BOLD}Environment:{Colors.RESET}
Device: {Colors.CYAN}{os.getenv('DEVICE', 'cpu').upper()}{Colors.RESET} Max Workers: {Colors.CYAN}{os.getenv('MAX_WORKERS', '4')}{Colors.RESET}
""")
# ─────────────────────────────────────────────
# Entry point
# ─────────────────────────────────────────────
def main():
    """Parse the command line and dispatch to the requested pipeline.

    The language flag (``--english`` / ``--hindi``) is removed from the
    argument list first; the remaining arguments go to ``argparse``. The
    process always ends via ``sys.exit``: 0 for help, listing and successful
    runs, 1 for a missing language flag, an invalid ``--pages`` value or a
    failed pipeline.
    """
    raw_args = sys.argv[1:]
    if not raw_args:
        show_usage()
        sys.exit(0)

    language = None
    clean_args = []
    for arg in raw_args:
        key = arg.lstrip("-").lower()
        # Only dashed arguments count as language flags. A bare word such as
        # the "english" in `--search english football` is part of the query
        # and must not be dropped or switch the language.
        if arg.startswith("-") and key in LANGUAGE_CATEGORIES:
            language = key
        else:
            clean_args.append(arg)

    # Help must work without a language flag and must not report an error.
    if any(arg in ("-h", "--help") for arg in clean_args):
        show_usage()
        sys.exit(0)

    if language is None:
        Log.error("A language flag is required: --english or --hindi")
        show_usage()
        sys.exit(1)

    parser = argparse.ArgumentParser(add_help=False)
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--list', action='store_true')
    group.add_argument('--category', '-c', type=str)
    group.add_argument('--search', '-s', type=str, nargs='+')
    parser.add_argument('--pages', '--page', type=int, default=1, help="Number of search result pages to scan")
    parser.add_argument('--no-dedup', action='store_true', help="Skip Supabase deduplication")
    parser.add_argument('--help', '-h', action='store_true')
    args = parser.parse_args(clean_args)

    # Also covers abbreviated help flags that argparse resolves itself.
    if args.help or not clean_args:
        show_usage()
        sys.exit(0)
    if args.pages < 1:
        Log.error("--pages must be at least 1")
        sys.exit(1)

    Log.info(f"Language : {Colors.CYAN}{language.upper()}{Colors.RESET}")
    Log.info(f"Device : {Colors.CYAN}{os.getenv('DEVICE', 'cpu').upper()}{Colors.RESET}")
    Log.info(f"Max Workers: {Colors.CYAN}{os.getenv('MAX_WORKERS', '4')}{Colors.RESET}")

    if args.list:
        list_categories(language)
        sys.exit(0)

    if args.category:
        category = args.category.lower()
        if category not in LANGUAGE_CATEGORIES[language]:
            Log.warning(f"'{category}' is not a predefined category for --{language}. Proceeding anyway.")
        success = process_category(language, category, no_dedup=args.no_dedup)
        sys.exit(0 if success else 1)

    if args.search:
        # nargs='+' yields a list of words; rejoin them into one query string.
        query = " ".join(args.search)
        success = process_search(language, query, no_dedup=args.no_dedup, pages=args.pages)
        sys.exit(0 if success else 1)

    # Only modifier options (e.g. --pages) were given: nothing to run.
    show_usage()
    sys.exit(0)
if __name__ == "__main__":
    # Script entry point: run the CLI and turn Ctrl-C into a clean exit.
    try:
        main()
    except KeyboardInterrupt:
        cancel_notice = f"\n\n{Colors.YELLOW}Pipeline cancelled by user.{Colors.RESET}\n"
        print(cancel_notice)
        sys.exit(0)
    except Exception as exc:
        # Log a one-line summary, then re-raise so the traceback is shown.
        Log.error(f"Critical error: {exc!s}")
        raise