# Author: Devang1290
# feat: deploy News Whisper on-demand search API (FastAPI + Docker)
# Commit: 2cb327c
"""
News Summarization Module
Provides a clean, symmetric pipeline function `summarize_articles(articles, language)`
for both English (Transformers) and Hindi (mT5 + Groq).
"""
from typing import List, Dict, Optional
import time
from datetime import datetime, timezone
import sys
from pathlib import Path
# Add backend to path to resolve core modules
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
from summarize_articles.utils import clean_text, should_summarize
from summarize_articles.model import get_summarizer
# For Hindi Summarization
try:
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from transformers import AutoTokenizer
from groq import Groq
except ImportError:
pass
# ─────────────────────────────────────────────
# Lazy Model Loading
# ─────────────────────────────────────────────
# Module-level singletons, populated on first use by load_english_model() /
# load_hindi_models() so that importing this module stays cheap and heavy
# model weights are only loaded when a summary is actually requested.
_english_summarizer = None  # English summarizer instance from get_summarizer()
_hindi_model = None  # mT5 ONNX seq2seq model used to draft Hindi summaries
_hindi_tokenizer = None  # tokenizer paired with the mT5 ONNX model
_groq_client = None  # Groq API client used to polish Hindi summaries
def load_english_model():
    """Initialize the shared English summarizer exactly once (lazy singleton)."""
    global _english_summarizer
    if _english_summarizer is not None:
        return  # already initialized; nothing to do
    logger.info("Loading English Summarization Model...")
    _english_summarizer = get_summarizer()
def load_hindi_models():
    """Lazily initialize the Hindi pipeline: mT5 ONNX model/tokenizer and Groq client.

    Raises:
        FileNotFoundError: if the exported mT5 ONNX artifacts are absent on disk.
        ValueError: if no Groq API key is configured.
    """
    global _hindi_model, _hindi_tokenizer, _groq_client

    # Local mT5 ONNX draft model — loaded once from the exported directory.
    if _hindi_model is None:
        onnx_dir = config.MODELS_DIR / "mt5_onnx"
        if not onnx_dir.exists():
            raise FileNotFoundError(f"mT5 ONNX model missing at {onnx_dir}. Run: python backend/export_mt5.py")
        logger.info("Loading Hindi mT5 ONNX model and tokenizer...")
        _hindi_tokenizer = AutoTokenizer.from_pretrained(onnx_dir)
        _hindi_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_dir)

    # Remote polishing client — created once, requires an API key.
    if _groq_client is None:
        if not config.GROQ_API_KEY:
            raise ValueError("GROQ_API_KEY is not set in environment or .env.")
        logger.info("Initializing Groq Client...")
        _groq_client = Groq(api_key=config.GROQ_API_KEY)
# ─────────────────────────────────────────────
# Language-Specific Summarization Logic
# ─────────────────────────────────────────────
def _summarize_english(text: str) -> str:
    """Summarize English text with the lazily-loaded Transformers summarizer."""
    load_english_model()
    summarizer = _english_summarizer  # populated by load_english_model()
    return summarizer.summarize(text, max_words=config.MAX_SUMMARY_WORDS, language="english")
def _summarize_hindi(text: str) -> str:
    """Two-stage Hindi summarization: local mT5 ONNX draft, then Groq LLM polish.

    Falls back to the raw mT5 draft when the Groq polishing call fails, so this
    function degrades gracefully rather than raising on API errors.
    """
    load_hindi_models()

    # Stage 1: draft summary from the local mT5 ONNX model.
    encoded = _hindi_tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = _hindi_model.generate(
        **encoded,
        max_length=350,
        min_length=120,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    raw_summary = _hindi_tokenizer.decode(generated[0], skip_special_tokens=True)

    # Stage 2: best-effort polish into broadcast-style Hindi via Groq.
    system_prompt = (
        "You are a Hindi news anchor. Rewrite the given summary into a natural, "
        "smooth 2-3 sentence broadcast script in Hindi. Use simple words. "
        "Write all numbers in Hindi words (e.g. ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€). "
        "Output ONLY the polished Hindi text, nothing else, no quotes."
    )
    try:
        reply = _groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": raw_summary},
            ],
            temperature=0.3,
            max_tokens=500,
        )
        time.sleep(3)  # Rate Limit buffer for Groq free tier
        return reply.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"Groq Polish failed: {e}")
        return raw_summary
def process_article(article: Dict, language: str) -> Optional[Dict]:
    """Summarize a single article record in place and return it.

    Args:
        article: Record expected to contain a 'content' key (and optionally
            'id' / 'title', used only for logging).
        language: 'hindi' routes to the mT5+Groq pipeline; any other value
            falls through to the English summarizer.

    Returns:
        The same dict, always carrying 'summary' and 'summarized' keys.
        'summarized' is True only when a model actually produced the summary;
        'summary_generated_at' (UTC ISO timestamp) is set only in that case.
    """
    content = article.get('content', '')
    if not content:
        logger.warning(f"Skipped {article.get('id', 'unknown')} (No content)")
        # Fix: keep the output schema consistent — previously this path
        # returned the article with no 'summary'/'summarized' keys at all.
        article['summary'] = ''
        article['summarized'] = False
        return article

    cleaned_content = clean_text(content)

    # Short texts are passed through verbatim rather than model-summarized.
    if not should_summarize(cleaned_content):
        article['summary'] = cleaned_content
        article['summarized'] = False
        return article

    try:
        if language == "hindi":
            summary = _summarize_hindi(cleaned_content)
        else:
            summary = _summarize_english(cleaned_content)
        article['summary'] = summary
        article['summarized'] = True
        article['summary_generated_at'] = datetime.now(timezone.utc).isoformat()
        return article
    except Exception as e:
        # Fail soft: keep the article in the batch, explicitly flagged as
        # unsummarized so downstream counters don't rely on a missing key.
        logger.error(f"Summarization failed for {article.get('id', 'unknown')}: {e}")
        article['summarized'] = False
        return article
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def _safe_title(article: Dict) -> str:
    """Return a console-safe, truncated title for progress logging.

    Re-encodes the title to the stdout encoding with replacement characters so
    Unicode (e.g. Hindi) titles cannot crash logging on narrow terminal
    encodings. Falls back to 'utf-8' because sys.stdout.encoding is None when
    stdout is piped — the old code's bare `except:` silently masked that case.
    """
    try:
        encoding = sys.stdout.encoding or 'utf-8'
        title = article.get("title", "No Title")[:50]
        return title.encode(encoding, errors='replace').decode(encoding)
    except Exception:  # narrowed from a bare except that also ate KeyboardInterrupt
        return "Article"


def summarize_articles(articles: List[Dict], language: str) -> List[Dict]:
    """
    Summarizes a list of articles using standard NLP models for English and mT5/Groq for Hindi.
    Does not perform any disk I/O, just returns the summarized in-memory items.

    Args:
        articles: List of dictionary records containing 'content' keys.
        language: 'english' or 'hindi' (case-insensitive).

    Returns:
        List of dictionary records with the added 'summary' field.
    """
    language = language.lower()
    logger.info(f"Starting {language.upper()} Summarization pipeline for {len(articles)} articles.")
    if not articles:
        return []

    total = len(articles)
    processed = []
    # Sequential execution to avoid thread-related GPU context errors and Groq rate limits
    for idx, article in enumerate(articles, 1):
        logger.info(f"[{idx}/{total}] Summarizing: {_safe_title(article)}...")
        processed.append(process_article(article, language))

    actual_summaries = sum(1 for a in processed if a.get('summarized', False))
    logger.success(f"Summarized {actual_summaries}/{total} articles.")
    return processed