# Author: Devang1290
# feat: deploy News Whisper on-demand search API (FastAPI + Docker)
# Commit: 2cb327c
"""
News Summarization Module
Provides a clean, symmetric pipeline function `summarize_articles(articles, language)`
for both English (Transformers) and Hindi (mT5 + Groq).
"""
from typing import List, Dict, Optional
import time
from datetime import datetime, timezone
import sys
from pathlib import Path
# Add backend to path to resolve core modules
sys.path.append(str(Path(__file__).resolve().parent.parent))
from core.logger import logger
from core.config import config
from summarize_articles.utils import clean_text, should_summarize
from summarize_articles.model import get_summarizer
# For Hindi Summarization
try:
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from transformers import AutoTokenizer
from groq import Groq
except ImportError:
pass
# ─────────────────────────────────────────────
# Lazy Model Loading
# ─────────────────────────────────────────────
# Module-level singletons, populated on first use by load_english_model() /
# load_hindi_models() so that importing this module stays cheap and heavy
# model weights are only loaded when a summary is actually requested.
_english_summarizer = None  # English summarizer instance from get_summarizer()
_hindi_model = None  # mT5 ONNX seq2seq model used to draft Hindi summaries
_hindi_tokenizer = None  # tokenizer paired with the mT5 ONNX model
_groq_client = None  # Groq API client used to polish Hindi summaries
def load_english_model():
    """Initialize the shared English summarizer exactly once (lazy singleton)."""
    global _english_summarizer
    if _english_summarizer is not None:
        return  # already initialized; nothing to do
    logger.info("Loading English Summarization Model...")
    _english_summarizer = get_summarizer()
def load_hindi_models():
    """Lazily initialize the Hindi pipeline: mT5 ONNX model/tokenizer and Groq client.

    Raises:
        FileNotFoundError: if the exported mT5 ONNX artifacts are absent on disk.
        ValueError: if no Groq API key is configured.
    """
    global _hindi_model, _hindi_tokenizer, _groq_client

    # Local mT5 ONNX draft model — loaded once from the exported directory.
    if _hindi_model is None:
        onnx_dir = config.MODELS_DIR / "mt5_onnx"
        if not onnx_dir.exists():
            raise FileNotFoundError(f"mT5 ONNX model missing at {onnx_dir}. Run: python backend/export_mt5.py")
        logger.info("Loading Hindi mT5 ONNX model and tokenizer...")
        _hindi_tokenizer = AutoTokenizer.from_pretrained(onnx_dir)
        _hindi_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_dir)

    # Remote polishing client — created once, requires an API key.
    if _groq_client is None:
        if not config.GROQ_API_KEY:
            raise ValueError("GROQ_API_KEY is not set in environment or .env.")
        logger.info("Initializing Groq Client...")
        _groq_client = Groq(api_key=config.GROQ_API_KEY)
# ─────────────────────────────────────────────
# Language-Specific Summarization Logic
# ─────────────────────────────────────────────
def _summarize_english(text: str) -> str:
    """Summarize English text with the lazily-loaded Transformers summarizer."""
    load_english_model()
    summarizer = _english_summarizer  # populated by load_english_model()
    return summarizer.summarize(text, max_words=config.MAX_SUMMARY_WORDS, language="english")
def _summarize_hindi(text: str) -> str:
    """Two-stage Hindi summarization: local mT5 ONNX draft, then Groq LLM polish.

    Falls back to the raw mT5 draft when the Groq polishing call fails, so this
    function degrades gracefully rather than raising on API errors.
    """
    load_hindi_models()

    # Stage 1: draft summary from the local mT5 ONNX model.
    encoded = _hindi_tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = _hindi_model.generate(
        **encoded,
        max_length=350,
        min_length=120,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    raw_summary = _hindi_tokenizer.decode(generated[0], skip_special_tokens=True)

    # Stage 2: best-effort polish into broadcast-style Hindi via Groq.
    system_prompt = (
        "You are a Hindi news anchor. Rewrite the given summary into a natural, "
        "smooth 2-3 sentence broadcast script in Hindi. Use simple words. "
        "Write all numbers in Hindi words (e.g. ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€). "
        "Output ONLY the polished Hindi text, nothing else, no quotes."
    )
    try:
        reply = _groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": raw_summary},
            ],
            temperature=0.3,
            max_tokens=500,
        )
        time.sleep(3)  # Rate Limit buffer for Groq free tier
        return reply.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"Groq Polish failed: {e}")
        return raw_summary
def process_article(article: Dict, language: str) -> Optional[Dict]:
    """Summarize a single article record in place and return it.

    Args:
        article: Record expected to contain a 'content' key (and optionally
            'id' / 'title', used only for logging).
        language: 'hindi' routes to the mT5+Groq pipeline; any other value
            falls through to the English summarizer.

    Returns:
        The same dict, always carrying 'summary' and 'summarized' keys.
        'summarized' is True only when a model actually produced the summary;
        'summary_generated_at' (UTC ISO timestamp) is set only in that case.
    """
    content = article.get('content', '')
    if not content:
        logger.warning(f"Skipped {article.get('id', 'unknown')} (No content)")
        # Fix: keep the output schema consistent — previously this path
        # returned the article with no 'summary'/'summarized' keys at all.
        article['summary'] = ''
        article['summarized'] = False
        return article

    cleaned_content = clean_text(content)

    # Short texts are passed through verbatim rather than model-summarized.
    if not should_summarize(cleaned_content):
        article['summary'] = cleaned_content
        article['summarized'] = False
        return article

    try:
        if language == "hindi":
            summary = _summarize_hindi(cleaned_content)
        else:
            summary = _summarize_english(cleaned_content)
        article['summary'] = summary
        article['summarized'] = True
        article['summary_generated_at'] = datetime.now(timezone.utc).isoformat()
        return article
    except Exception as e:
        # Fail soft: keep the article in the batch, explicitly flagged as
        # unsummarized so downstream counters don't rely on a missing key.
        logger.error(f"Summarization failed for {article.get('id', 'unknown')}: {e}")
        article['summarized'] = False
        return article
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def _safe_title(article: Dict) -> str:
    """Return a console-safe, truncated title for progress logging.

    Re-encodes the title to the stdout encoding with replacement characters so
    Unicode (e.g. Hindi) titles cannot crash logging on narrow terminal
    encodings. Falls back to 'utf-8' because sys.stdout.encoding is None when
    stdout is piped — the old code's bare `except:` silently masked that case.
    """
    try:
        encoding = sys.stdout.encoding or 'utf-8'
        title = article.get("title", "No Title")[:50]
        return title.encode(encoding, errors='replace').decode(encoding)
    except Exception:  # narrowed from a bare except that also ate KeyboardInterrupt
        return "Article"


def summarize_articles(articles: List[Dict], language: str) -> List[Dict]:
    """
    Summarizes a list of articles using standard NLP models for English and mT5/Groq for Hindi.
    Does not perform any disk I/O, just returns the summarized in-memory items.

    Args:
        articles: List of dictionary records containing 'content' keys.
        language: 'english' or 'hindi' (case-insensitive).

    Returns:
        List of dictionary records with the added 'summary' field.
    """
    language = language.lower()
    logger.info(f"Starting {language.upper()} Summarization pipeline for {len(articles)} articles.")
    if not articles:
        return []

    total = len(articles)
    processed = []
    # Sequential execution to avoid thread-related GPU context errors and Groq rate limits
    for idx, article in enumerate(articles, 1):
        logger.info(f"[{idx}/{total}] Summarizing: {_safe_title(article)}...")
        processed.append(process_article(article, language))

    actual_summaries = sum(1 for a in processed if a.get('summarized', False))
    logger.success(f"Summarized {actual_summaries}/{total} articles.")
    return processed