# Source: news-whisper-api / backend / summarization / hindi_summary.py
# Author: Devang1290
# Commit: feat: deploy News Whisper on-demand search API (FastAPI + Docker) (2cb327c)
#!/usr/bin/env python3
"""
Hindi Article Summarizer
=========================
Summarizes Hindi news articles using a two-stage approach:
Stage 1: Offline summarization with mT5_multilingual_XLSum (ONNX)
Stage 2: Online polishing with Groq Llama 3.3 70B API
This script handles ONLY the summarization part. TTS is handled separately
by backend/text_to_speech/hindi_tts.py. Both are orchestrated by main.py.
Prerequisites:
- mT5 ONNX model at models/mt5_onnx/ (run: python backend/models/export_mt5.py)
- GROQ_API_KEY in .env (free at https://console.groq.com)
Usage:
python backend/summarization/hindi_summary.py --file "articles/hindi/categories/sports/6_mar_7_00_pm.json"
python backend/summarization/hindi_summary.py --file "..." --no-dedup
"""
import sys
# Force UTF-8 stdout up front so Devanagari text prints correctly
# (notably on Windows consoles with a legacy code page).
sys.stdout.reconfigure(encoding='utf-8')
import os
import json
import time
import argparse
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
# Load .env from project root
load_dotenv(Path(__file__).parent.parent.parent / '.env')
# Suppress unnecessary warnings
import logging
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
logging.getLogger("httpx").setLevel(logging.WARNING)
# Heavy ML dependencies: fail with an actionable message instead of a raw traceback.
try:
    from optimum.onnxruntime import ORTModelForSeq2SeqLM
    from transformers import AutoTokenizer
except ImportError:
    print("\nError: Required packages not installed.")
    print("Please install optimum[onnxruntime] and transformers")
    sys.exit(1)
try:
    from groq import Groq
except ImportError:
    print("\nError: 'groq' package is not installed.")
    print("Run: pip install groq")
    sys.exit(1)
# Add project root to path
sys.path.append(str(Path(__file__).parent.parent.parent))
from backend.common.colors import Colors, Log
from backend.common.paths import get_project_root, get_timestamp_folder
# The Supabase-backed dedup registry is optional; degrade gracefully without it.
try:
    from backend.utils.db_utils import DatabaseManager
    HAS_DB = True
except ImportError:
    HAS_DB = False
# ─────────────────────────────────────────────
# Output Layout Helpers
# ─────────────────────────────────────────────
def resolve_output_layout(input_file: Path, category_name: str):
    """Derive the output folder pair from an input file's path.

    Mirrors the incoming folder shape:
      - categories/<category>
      - search_queries/<query>

    Args:
        input_file: Path to the scraped JSON file.
        category_name: Fallback folder name when the path carries no hint.

    Returns:
        Tuple of (parent_folder, folder_name) strings.
    """
    segments = input_file.parts
    # search_queries takes precedence over categories when both appear.
    for marker in ("search_queries", "categories"):
        if marker not in segments:
            continue
        pos = segments.index(marker)
        # The component right after the marker names the folder, if present.
        name = segments[pos + 1] if pos + 1 < len(segments) else category_name
        return marker, name
    return "categories", category_name
def build_output_dir(parent_folder: str, folder_name: str):
    """Create (if needed) and return the summarized-output directory.

    Layout: <root>/summarized-articles/hindi/{categories|search_queries}/<folder_name>/
    """
    target = (
        get_project_root()
        / "summarized-articles"
        / "hindi"
        / parent_folder
        / folder_name
    )
    target.mkdir(parents=True, exist_ok=True)
    return target
# ─────────────────────────────────────────────
# AI Functions
# ─────────────────────────────────────────────
def summarize_with_mt5(text: str, model, tokenizer) -> str:
    """Stage 1: produce a rough Hindi summary with the offline mT5 ONNX model.

    The output is deliberately raw — stage 2 (Groq) rewrites it into a
    broadcast-ready script.
    """
    # NOTE(review): the "summarize: " prefix is a T5 convention; confirm the
    # XLSum checkpoint was exported expecting it.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    # Beam search plus a length penalty nudges the model toward fuller summaries.
    generated = model.generate(
        **encoded,
        max_length=350,
        min_length=120,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def polish_with_groq(api_key: str, raw_summary: str) -> str:
    """Stage 2: rewrite the raw mT5 summary into smooth broadcast Hindi.

    Uses Groq's Llama 3.3 70B chat endpoint. On any API failure the raw
    summary is returned unchanged so the pipeline never stalls.
    """
    anchor_prompt = (
        "You are a Hindi news anchor. Rewrite the given summary into a natural, "
        "smooth 2-3 sentence broadcast script in Hindi. Use simple words. "
        "Write all numbers in Hindi words (e.g. ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€). "
        "Output ONLY the polished Hindi text, nothing else, no quotes."
    )
    try:
        completion = Groq(api_key=api_key).chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": anchor_prompt},
                {"role": "user", "content": raw_summary},
            ],
            temperature=0.3,
            max_tokens=500,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: log and fall back to the unpolished summary.
        print(f" {Colors.RED}X Groq Polish failed: {e}{Colors.RESET}")
        return raw_summary
# ─────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────
def main():
    """CLI entry point: load scraped articles, summarize each, save JSON.

    Pipeline per article:
      1. mT5 ONNX offline summarization (rough draft)
      2. Groq Llama 3.3 polishing (broadcast script)

    Exits non-zero on missing file/model/API key; exits 0 early when there
    is nothing new to process.
    """
    parser = argparse.ArgumentParser(description="Hindi News Summarizer (mT5 ONNX + Groq)")
    parser.add_argument("--file", "-f", required=True, help="Path to the scraped JSON file")
    parser.add_argument("--no-dedup", action="store_true", help="Skip Supabase deduplication registry check")
    args = parser.parse_args()

    input_file = Path(args.file)
    if not input_file.exists():
        Log.error(f"File not found: {args.file}")
        sys.exit(1)

    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)
    if not articles:
        Log.warning(f"No articles found in {input_file.name}")
        sys.exit(0)

    # Output layout mirrors the input path (categories/ vs search_queries/).
    category = articles[0].get("category", "unknown").replace(" ", "_").lower()
    parent_folder, folder_name = resolve_output_layout(input_file, category)

    # Groq is required for stage-2 polishing; fail fast if the key is missing.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        Log.error("GROQ_API_KEY environment variable not found.")
        print("Please set it in .env or as an environment variable.")
        sys.exit(1)

    # Load the offline mT5 ONNX model (stage 1).
    model_dir = get_project_root() / "models" / "mt5_onnx"
    if not model_dir.exists():
        Log.error(f"mT5 model directory not found at {model_dir}")
        # Fixed: placeholder-free string does not need an f-prefix (lint F541).
        print("Run: python backend/models/export_mt5.py")
        sys.exit(1)
    print(f"\n{Colors.CYAN}Loading mT5 ONNX model and tokenizer...{Colors.RESET}", end=" ", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_dir)
    print(f"{Colors.GREEN}V Loaded{Colors.RESET}\n")

    # Set up directories
    timestamp_folder = get_timestamp_folder()
    summ_dir = build_output_dir(parent_folder, folder_name)

    # Drop articles already recorded in the dedup registry (unless disabled).
    total_scraped = len(articles)
    already_processed_count = 0
    # argparse guarantees args.no_dedup exists for a store_true flag,
    # so getattr() indirection is unnecessary.
    if HAS_DB and not args.no_dedup:
        db = DatabaseManager()
        article_ids = [a.get("id") for a in articles if a.get("id")]
        existing_ids = db.check_registry(article_ids)
        if existing_ids:
            already_processed_count = len(existing_ids)
            articles = [a for a in articles if a.get("id") not in existing_ids]

    total_articles = len(articles)
    print(f"\n{Colors.BOLD}{Colors.CYAN}--- Processing Funnel ---{Colors.RESET}")
    print(f"Total scraped : {total_scraped}")
    if args.no_dedup:
        print(f"Deduplication : {Colors.YELLOW}DISABLED (--no-dedup){Colors.RESET}")
    else:
        print(f"Already processed : {already_processed_count}")
    print(f"New articles to do: {Colors.GREEN}{total_articles}{Colors.RESET}")
    print(f"{Colors.BOLD}{Colors.CYAN}-------------------------{Colors.RESET}\n")
    if total_articles == 0:
        Log.info("All articles in this batch have already been processed. Exiting.")
        sys.exit(0)

    processed_articles = []
    for idx, article in enumerate(articles, 1):
        article_id = article.get("id", f"unknown_{idx}")
        title = article.get("title", "No Title")
        content = article.get("content", "")
        # Titles may contain characters the console encoding can't render.
        try:
            safe_title = title[:50].encode(sys.stdout.encoding, errors='replace').decode(sys.stdout.encoding)
        except Exception:
            safe_title = "Hindi Article"
        print(f"{Colors.BOLD}[{idx}/{total_articles}] Article: {Colors.CYAN}{safe_title}...{Colors.RESET}")
        if not content:
            print(f" {Colors.YELLOW}⚠ Skipped (No content){Colors.RESET}\n")
            continue

        # Stage 1: mT5 Summarization
        print(f" β†’ Summarizing with mT5 offline...", end=" ", flush=True)
        raw_summary = summarize_with_mt5(content, model, tokenizer)
        print(f"{Colors.GREEN}V{Colors.RESET}")

        # Stage 2: Groq Polishing
        print(f" β†’ Polishing with Groq (Llama 3.3)...", end=" ", flush=True)
        polished_summary = polish_with_groq(api_key, raw_summary)
        print(f"{Colors.GREEN}V{Colors.RESET}")

        # Respect API rate limits — but don't waste 3 s after the last article.
        if idx < total_articles:
            time.sleep(3)

        # Update article
        article['raw_mt5_summary'] = raw_summary
        article['summary'] = polished_summary
        article['summarized'] = True
        article['summary_generated_at'] = datetime.now().isoformat()
        processed_articles.append(article)
        print("-" * 60)

    # Save Summarized JSON
    output_json_path = summ_dir / f"{timestamp_folder}.json"
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(processed_articles, f, indent=2, ensure_ascii=False)
    print(f"\n{Colors.GREEN}{Colors.BOLD}Hindi Summarization Complete!{Colors.RESET}")
    print(f"Summarized JSON: {Colors.CYAN}{output_json_path.relative_to(get_project_root())}{Colors.RESET}\n")
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop the batch — exit cleanly, not as an error.
        print(f"\n\n{Colors.YELLOW}Summarization cancelled by user{Colors.RESET}\n")
        sys.exit(0)
    except Exception as e:
        # Log for visibility, then re-raise so the caller sees the full traceback.
        Log.error(f"Critical error: {str(e)}")
        raise