# Source: news-whisper-api / backend / summarization / hindi_summary.py
# Author: Devang1290
# Commit: feat: deploy News Whisper on-demand search API (FastAPI + Docker) (2cb327c)
#!/usr/bin/env python3
"""
Hindi Article Summarizer
=========================
Summarizes Hindi news articles using a two-stage approach:
Stage 1: Offline summarization with mT5_multilingual_XLSum (ONNX)
Stage 2: Online polishing with Groq Llama 3.3 70B API
This script handles ONLY the summarization part. TTS is handled separately
by backend/text_to_speech/hindi_tts.py. Both are orchestrated by main.py.
Prerequisites:
- mT5 ONNX model at models/mt5_onnx/ (run: python backend/models/export_mt5.py)
- GROQ_API_KEY in .env (free at https://console.groq.com)
Usage:
python backend/summarization/hindi_summary.py --file "articles/hindi/categories/sports/6_mar_7_00_pm.json"
python backend/summarization/hindi_summary.py --file "..." --no-dedup
"""
import sys
# Force UTF-8 stdout up front so Devanagari text prints correctly
# (notably on Windows consoles with a legacy code page).
sys.stdout.reconfigure(encoding='utf-8')
import os
import json
import time
import argparse
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
# Load .env from project root
load_dotenv(Path(__file__).parent.parent.parent / '.env')
# Suppress unnecessary warnings
import logging
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
logging.getLogger("httpx").setLevel(logging.WARNING)
# Heavy ML dependencies: fail with an actionable message instead of a raw traceback.
try:
    from optimum.onnxruntime import ORTModelForSeq2SeqLM
    from transformers import AutoTokenizer
except ImportError:
    print("\nError: Required packages not installed.")
    print("Please install optimum[onnxruntime] and transformers")
    sys.exit(1)
try:
    from groq import Groq
except ImportError:
    print("\nError: 'groq' package is not installed.")
    print("Run: pip install groq")
    sys.exit(1)
# Add project root to path
sys.path.append(str(Path(__file__).parent.parent.parent))
from backend.common.colors import Colors, Log
from backend.common.paths import get_project_root, get_timestamp_folder
# The Supabase-backed dedup registry is optional; degrade gracefully without it.
try:
    from backend.utils.db_utils import DatabaseManager
    HAS_DB = True
except ImportError:
    HAS_DB = False
# ─────────────────────────────────────────────
# Output Layout Helpers
# ─────────────────────────────────────────────
def resolve_output_layout(input_file: Path, category_name: str):
    """Derive the output folder pair from an input file's path.

    Mirrors the incoming folder shape:
      - categories/<category>
      - search_queries/<query>

    Args:
        input_file: Path to the scraped JSON file.
        category_name: Fallback folder name when the path carries no hint.

    Returns:
        Tuple of (parent_folder, folder_name) strings.
    """
    segments = input_file.parts
    # search_queries takes precedence over categories when both appear.
    for marker in ("search_queries", "categories"):
        if marker not in segments:
            continue
        pos = segments.index(marker)
        # The component right after the marker names the folder, if present.
        name = segments[pos + 1] if pos + 1 < len(segments) else category_name
        return marker, name
    return "categories", category_name
def build_output_dir(parent_folder: str, folder_name: str):
    """Create (if needed) and return the summarized-output directory.

    Layout: <root>/summarized-articles/hindi/{categories|search_queries}/<folder_name>/
    """
    target = (
        get_project_root()
        / "summarized-articles"
        / "hindi"
        / parent_folder
        / folder_name
    )
    target.mkdir(parents=True, exist_ok=True)
    return target
# ─────────────────────────────────────────────
# AI Functions
# ─────────────────────────────────────────────
def summarize_with_mt5(text: str, model, tokenizer) -> str:
    """Stage 1: produce a rough Hindi summary with the offline mT5 ONNX model.

    The output is deliberately raw — stage 2 (Groq) rewrites it into a
    broadcast-ready script.
    """
    # NOTE(review): the "summarize: " prefix is a T5 convention; confirm the
    # XLSum checkpoint was exported expecting it.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    # Beam search plus a length penalty nudges the model toward fuller summaries.
    generated = model.generate(
        **encoded,
        max_length=350,
        min_length=120,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def polish_with_groq(api_key: str, raw_summary: str) -> str:
    """Stage 2: rewrite the raw mT5 summary into smooth broadcast Hindi.

    Uses Groq's Llama 3.3 70B chat endpoint. On any API failure the raw
    summary is returned unchanged so the pipeline never stalls.
    """
    anchor_prompt = (
        "You are a Hindi news anchor. Rewrite the given summary into a natural, "
        "smooth 2-3 sentence broadcast script in Hindi. Use simple words. "
        "Write all numbers in Hindi words (e.g. ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€). "
        "Output ONLY the polished Hindi text, nothing else, no quotes."
    )
    try:
        completion = Groq(api_key=api_key).chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": anchor_prompt},
                {"role": "user", "content": raw_summary},
            ],
            temperature=0.3,
            max_tokens=500,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: log and fall back to the unpolished summary.
        print(f" {Colors.RED}X Groq Polish failed: {e}{Colors.RESET}")
        return raw_summary
# ─────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────
def main():
    """CLI entry point: load scraped articles, summarize each, save JSON.

    Pipeline per article:
      1. mT5 ONNX offline summarization (rough draft)
      2. Groq Llama 3.3 polishing (broadcast script)

    Exits non-zero on missing file/model/API key; exits 0 early when there
    is nothing new to process.
    """
    parser = argparse.ArgumentParser(description="Hindi News Summarizer (mT5 ONNX + Groq)")
    parser.add_argument("--file", "-f", required=True, help="Path to the scraped JSON file")
    parser.add_argument("--no-dedup", action="store_true", help="Skip Supabase deduplication registry check")
    args = parser.parse_args()

    input_file = Path(args.file)
    if not input_file.exists():
        Log.error(f"File not found: {args.file}")
        sys.exit(1)

    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)
    if not articles:
        Log.warning(f"No articles found in {input_file.name}")
        sys.exit(0)

    # Output layout mirrors the input path (categories/ vs search_queries/).
    category = articles[0].get("category", "unknown").replace(" ", "_").lower()
    parent_folder, folder_name = resolve_output_layout(input_file, category)

    # Groq is required for stage-2 polishing; fail fast if the key is missing.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        Log.error("GROQ_API_KEY environment variable not found.")
        print("Please set it in .env or as an environment variable.")
        sys.exit(1)

    # Load the offline mT5 ONNX model (stage 1).
    model_dir = get_project_root() / "models" / "mt5_onnx"
    if not model_dir.exists():
        Log.error(f"mT5 model directory not found at {model_dir}")
        # Fixed: placeholder-free string does not need an f-prefix (lint F541).
        print("Run: python backend/models/export_mt5.py")
        sys.exit(1)
    print(f"\n{Colors.CYAN}Loading mT5 ONNX model and tokenizer...{Colors.RESET}", end=" ", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_dir)
    print(f"{Colors.GREEN}V Loaded{Colors.RESET}\n")

    # Set up directories
    timestamp_folder = get_timestamp_folder()
    summ_dir = build_output_dir(parent_folder, folder_name)

    # Drop articles already recorded in the dedup registry (unless disabled).
    total_scraped = len(articles)
    already_processed_count = 0
    # argparse guarantees args.no_dedup exists for a store_true flag,
    # so getattr() indirection is unnecessary.
    if HAS_DB and not args.no_dedup:
        db = DatabaseManager()
        article_ids = [a.get("id") for a in articles if a.get("id")]
        existing_ids = db.check_registry(article_ids)
        if existing_ids:
            already_processed_count = len(existing_ids)
            articles = [a for a in articles if a.get("id") not in existing_ids]

    total_articles = len(articles)
    print(f"\n{Colors.BOLD}{Colors.CYAN}--- Processing Funnel ---{Colors.RESET}")
    print(f"Total scraped : {total_scraped}")
    if args.no_dedup:
        print(f"Deduplication : {Colors.YELLOW}DISABLED (--no-dedup){Colors.RESET}")
    else:
        print(f"Already processed : {already_processed_count}")
    print(f"New articles to do: {Colors.GREEN}{total_articles}{Colors.RESET}")
    print(f"{Colors.BOLD}{Colors.CYAN}-------------------------{Colors.RESET}\n")
    if total_articles == 0:
        Log.info("All articles in this batch have already been processed. Exiting.")
        sys.exit(0)

    processed_articles = []
    for idx, article in enumerate(articles, 1):
        article_id = article.get("id", f"unknown_{idx}")
        title = article.get("title", "No Title")
        content = article.get("content", "")
        # Titles may contain characters the console encoding can't render.
        try:
            safe_title = title[:50].encode(sys.stdout.encoding, errors='replace').decode(sys.stdout.encoding)
        except Exception:
            safe_title = "Hindi Article"
        print(f"{Colors.BOLD}[{idx}/{total_articles}] Article: {Colors.CYAN}{safe_title}...{Colors.RESET}")
        if not content:
            print(f" {Colors.YELLOW}⚠ Skipped (No content){Colors.RESET}\n")
            continue

        # Stage 1: mT5 Summarization
        print(f" β†’ Summarizing with mT5 offline...", end=" ", flush=True)
        raw_summary = summarize_with_mt5(content, model, tokenizer)
        print(f"{Colors.GREEN}V{Colors.RESET}")

        # Stage 2: Groq Polishing
        print(f" β†’ Polishing with Groq (Llama 3.3)...", end=" ", flush=True)
        polished_summary = polish_with_groq(api_key, raw_summary)
        print(f"{Colors.GREEN}V{Colors.RESET}")

        # Respect API rate limits — but don't waste 3 s after the last article.
        if idx < total_articles:
            time.sleep(3)

        # Update article
        article['raw_mt5_summary'] = raw_summary
        article['summary'] = polished_summary
        article['summarized'] = True
        article['summary_generated_at'] = datetime.now().isoformat()
        processed_articles.append(article)
        print("-" * 60)

    # Save Summarized JSON
    output_json_path = summ_dir / f"{timestamp_folder}.json"
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(processed_articles, f, indent=2, ensure_ascii=False)
    print(f"\n{Colors.GREEN}{Colors.BOLD}Hindi Summarization Complete!{Colors.RESET}")
    print(f"Summarized JSON: {Colors.CYAN}{output_json_path.relative_to(get_project_root())}{Colors.RESET}\n")
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop the batch — exit cleanly, not as an error.
        print(f"\n\n{Colors.YELLOW}Summarization cancelled by user{Colors.RESET}\n")
        sys.exit(0)
    except Exception as e:
        # Log for visibility, then re-raise so the caller sees the full traceback.
        Log.error(f"Critical error: {str(e)}")
        raise