Spaces:
Sleeping
Sleeping
| """ | |
| News Summarization Module | |
| Provides a clean, symmetric pipeline function `summarize_articles(articles, language)` | |
| for both English (Transformers) and Hindi (mT5 + Groq). | |
| """ | |
| from typing import List, Dict, Optional | |
| import time | |
| from datetime import datetime, timezone | |
| import sys | |
| from pathlib import Path | |
| # Add backend to path to resolve core modules | |
| sys.path.append(str(Path(__file__).resolve().parent.parent)) | |
| from core.logger import logger | |
| from core.config import config | |
| from summarize_articles.utils import clean_text, should_summarize | |
| from summarize_articles.model import get_summarizer | |
| # For Hindi Summarization | |
| try: | |
| from optimum.onnxruntime import ORTModelForSeq2SeqLM | |
| from transformers import AutoTokenizer | |
| from groq import Groq | |
| except ImportError: | |
| pass | |
# ─────────────────────────────────────────────
# Lazy Model Loading
# ─────────────────────────────────────────────
# Module-level singletons populated on first use by the load_* helpers below,
# so heavyweight models / clients are only initialized when actually needed.
_english_summarizer = None  # English summarizer returned by get_summarizer()
_hindi_model = None         # mT5 ONNX seq2seq model for Hindi drafts
_hindi_tokenizer = None     # tokenizer paired with _hindi_model
_groq_client = None         # Groq API client used to polish Hindi summaries
def load_english_model():
    """Lazily initialize the module-level English summarizer (idempotent)."""
    global _english_summarizer
    if _english_summarizer is not None:
        return  # already loaded; nothing to do
    logger.info("Loading English Summarization Model...")
    _english_summarizer = get_summarizer()
def load_hindi_models():
    """Lazily initialize the Hindi mT5 ONNX model, its tokenizer, and the Groq client.

    Raises:
        FileNotFoundError: if the exported mT5 ONNX directory is missing.
        ValueError: if GROQ_API_KEY is not configured.
    """
    global _hindi_model, _hindi_tokenizer, _groq_client

    if _hindi_model is None:
        model_dir = config.MODELS_DIR / "mt5_onnx"
        if not model_dir.exists():
            raise FileNotFoundError(f"mT5 ONNX model missing at {model_dir}. Run: python backend/export_mt5.py")
        logger.info("Loading Hindi mT5 ONNX model and tokenizer...")
        _hindi_tokenizer = AutoTokenizer.from_pretrained(model_dir)
        _hindi_model = ORTModelForSeq2SeqLM.from_pretrained(model_dir)

    if _groq_client is None:
        if not config.GROQ_API_KEY:
            raise ValueError("GROQ_API_KEY is not set in environment or .env.")
        logger.info("Initializing Groq Client...")
        _groq_client = Groq(api_key=config.GROQ_API_KEY)
# ─────────────────────────────────────────────
# Language-Specific Summarization Logic
# ─────────────────────────────────────────────
def _summarize_english(text: str) -> str:
    """Summarize English text with the lazily-loaded Transformers summarizer."""
    load_english_model()
    result = _english_summarizer.summarize(
        text,
        max_words=config.MAX_SUMMARY_WORDS,
        language="english",
    )
    return result
def _summarize_hindi(text: str) -> str:
    """Two-stage Hindi summarization: local mT5 ONNX draft, then a Groq polish pass.

    Falls back to the raw mT5 draft if the Groq call fails for any reason.
    """
    load_hindi_models()

    # Stage 1: raw abstractive summary from the local mT5 model.
    encoded = _hindi_tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = _hindi_model.generate(
        **encoded,
        max_length=350,
        min_length=120,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    draft = _hindi_tokenizer.decode(generated[0], skip_special_tokens=True)

    # Stage 2: rewrite the draft into a natural broadcast script via Groq.
    try:
        completion = _groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a Hindi news anchor. Rewrite the given summary into a natural, "
                        "smooth 2-3 sentence broadcast script in Hindi. Use simple words. "
                        "Write all numbers in Hindi words (e.g. ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€). "
                        "Output ONLY the polished Hindi text, nothing else, no quotes."
                    )
                },
                {"role": "user", "content": draft},
            ],
            temperature=0.3,
            max_tokens=500,
        )
        time.sleep(3)  # Rate Limit buffer for Groq free tier
        return completion.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort polish: on any Groq failure, log and fall back to the draft.
        logger.error(f"Groq Polish failed: {e}")
        return draft
def process_article(article: Dict, language: str) -> Optional[Dict]:
    """Summarize one article dict in place and return it.

    On success adds 'summary', 'summarized', and 'summary_generated_at' keys.
    Articles without content, or whose summarization raises, are returned
    unchanged apart from a log entry (best-effort pipeline behavior).

    Args:
        article: record expected to carry a 'content' key (and optionally 'id').
        language: 'hindi' selects the mT5/Groq path; anything else uses English.
    """
    raw_content = article.get('content', '')
    if not raw_content:
        logger.warning(f"Skipped {article.get('id', 'unknown')} (No content)")
        return article

    text = clean_text(raw_content)

    # Texts deemed too short are passed through verbatim instead of summarized.
    if not should_summarize(text):
        article['summary'] = text
        article['summarized'] = False
        return article

    try:
        summarize = _summarize_hindi if language == "hindi" else _summarize_english
        article['summary'] = summarize(text)
        article['summarized'] = True
        article['summary_generated_at'] = datetime.now(timezone.utc).isoformat()
    except Exception as e:
        logger.error(f"Summarization failed for {article.get('id', 'unknown')}: {e}")
    return article
# ─────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────
def summarize_articles(articles: List[Dict], language: str) -> List[Dict]:
    """
    Summarizes a list of articles using standard NLP models for English and mT5/Groq for Hindi.
    Does not perform any disk I/O, just returns the summarized in-memory items.
    Args:
        articles: List of dictionary records containing 'content' keys.
        language: 'english' or 'hindi'
    Returns:
        List of dictionary records with the added 'summary' field.
    """
    language = language.lower()
    logger.info(f"Starting {language.upper()} Summarization pipeline for {len(articles)} articles.")
    if not articles:
        return []

    processed = []
    # Sequential execution to avoid thread-related GPU context errors and Groq rate limits
    for idx, article in enumerate(articles, 1):
        try:
            # sys.stdout.encoding is None when stdout is redirected/piped in some
            # environments; fall back to UTF-8 so titles are still logged instead
            # of silently degrading to "Article".
            encoding = sys.stdout.encoding or "utf-8"
            safe_title = article.get("title", "No Title")[:50].encode(encoding, errors='replace').decode(encoding)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
            safe_title = "Article"
        logger.info(f"[{idx}/{len(articles)}] Summarizing: {safe_title}...")
        processed.append(process_article(article, language))

    actual_summaries = sum(1 for a in processed if a.get('summarized', False))
    logger.success(f"Summarized {actual_summaries}/{len(articles)} articles.")
    return processed