Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Hindi Article Summarizer | |
| ========================= | |
| Summarizes Hindi news articles using a two-stage approach: | |
| Stage 1: Offline summarization with mT5_multilingual_XLSum (ONNX) | |
| Stage 2: Online polishing with Groq Llama 3.3 70B API | |
| This script handles ONLY the summarization part. TTS is handled separately | |
| by backend/text_to_speech/hindi_tts.py. Both are orchestrated by main.py. | |
| Prerequisites: | |
| - mT5 ONNX model at models/mt5_onnx/ (run: python backend/models/export_mt5.py) | |
| - GROQ_API_KEY in .env (free at https://console.groq.com) | |
| Usage: | |
| python backend/summarization/hindi_summary.py --file "articles/hindi/categories/sports/6_mar_7_00_pm.json" | |
| python backend/summarization/hindi_summary.py --file "..." --no-dedup | |
| """ | |
| import sys | |
| sys.stdout.reconfigure(encoding='utf-8') | |
| import os | |
| import json | |
| import time | |
| import argparse | |
| from pathlib import Path | |
| from datetime import datetime | |
| from dotenv import load_dotenv | |
| # Load .env from project root | |
| load_dotenv(Path(__file__).parent.parent.parent / '.env') | |
| # Suppress unnecessary warnings | |
| import logging | |
| import warnings | |
| warnings.filterwarnings('ignore', category=UserWarning) | |
| warnings.filterwarnings('ignore', category=FutureWarning) | |
| logging.getLogger("httpx").setLevel(logging.WARNING) | |
| try: | |
| from optimum.onnxruntime import ORTModelForSeq2SeqLM | |
| from transformers import AutoTokenizer | |
| except ImportError: | |
| print("\nError: Required packages not installed.") | |
| print("Please install optimum[onnxruntime] and transformers") | |
| sys.exit(1) | |
| try: | |
| from groq import Groq | |
| except ImportError: | |
| print("\nError: 'groq' package is not installed.") | |
| print("Run: pip install groq") | |
| sys.exit(1) | |
| # Add project root to path | |
| sys.path.append(str(Path(__file__).parent.parent.parent)) | |
| from backend.common.colors import Colors, Log | |
| from backend.common.paths import get_project_root, get_timestamp_folder | |
| try: | |
| from backend.utils.db_utils import DatabaseManager | |
| HAS_DB = True | |
| except ImportError: | |
| HAS_DB = False | |
# ─────────────────────────────────────────────
# Output Layout Helpers
# ─────────────────────────────────────────────
def resolve_output_layout(input_file: Path, category_name: str):
    """Derive the output folder layout from the incoming file path.

    Mirrors the scraped folder shape:
      - search_queries/<query>   (takes precedence when present)
      - categories/<category>    (default)

    Returns:
        (parent_folder, folder_name) tuple; falls back to
        ("categories", category_name) when neither marker is found or the
        marker is the last path segment.
    """
    segments = input_file.parts
    # Check markers in priority order; the first one present wins.
    for marker in ("search_queries", "categories"):
        if marker not in segments:
            continue
        pos = segments.index(marker)
        # Use the segment right after the marker when there is one,
        # otherwise keep the caller-supplied category name.
        name = segments[pos + 1] if pos + 1 < len(segments) else category_name
        return marker, name
    return "categories", category_name
def build_output_dir(parent_folder: str, folder_name: str):
    """Create (if missing) and return the summarized-output directory.

    Layout: <root>/summarized-articles/hindi/{categories|search_queries}/<folder_name>/
    """
    target = (
        get_project_root()
        / "summarized-articles"
        / "hindi"
        / parent_folder
        / folder_name
    )
    # Idempotent: safe to call on every run.
    target.mkdir(parents=True, exist_ok=True)
    return target
# ─────────────────────────────────────────────
# AI Functions
# ─────────────────────────────────────────────
def summarize_with_mt5(text: str, model, tokenizer) -> str:
    """Stage 1: produce a rough Hindi summary with the offline mT5 ONNX model.

    The output is intentionally raw; stage 2 (Groq polishing) turns it into
    a broadcast-ready script.
    """
    # mT5/XLSum expects the task prefix; truncate long articles at 512 tokens.
    encoded = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    # Beam search with a length penalty biases toward fuller summaries.
    generated = model.generate(
        **encoded,
        max_length=350,
        min_length=120,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def polish_with_groq(api_key: str, raw_summary: str) -> str:
    """Stage 2: polish the raw mT5 summary via Groq API using Llama 3.3 70B.

    Rewrites the summary into a natural, smooth Hindi broadcast script.
    Falls back to the raw summary on any API failure so a network/quota
    error never hard-stops the batch.

    Fix: the Devanagari number examples in the system prompt were mojibake
    (UTF-8 misdecoded as ISO-8859-7: "ΰ€¦ΰ€Έ, ΰ€Έΰ€Ύΰ€€"); restored to
    the intended "दस, सात" so the model actually sees Hindi examples.
    """
    try:
        client = Groq(api_key=api_key)
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a Hindi news anchor. Rewrite the given summary into a natural, "
                        "smooth 2-3 sentence broadcast script in Hindi. Use simple words. "
                        "Write all numbers in Hindi words (e.g. दस, सात). "
                        "Output ONLY the polished Hindi text, nothing else, no quotes."
                    )
                },
                {"role": "user", "content": raw_summary}
            ],
            # Low temperature: we want faithful rewriting, not creativity.
            temperature=0.3,
            max_tokens=500,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberate best-effort: report and degrade to the unpolished summary.
        print(f" {Colors.RED}X Groq Polish failed: {e}{Colors.RESET}")
        return raw_summary
# ─────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────
def main():
    """CLI entry point: load scraped articles, summarize, and save JSON.

    Pipeline:
      1. Parse args and load the scraped-article JSON file.
      2. Optionally drop articles already in the dedup registry (Supabase).
      3. Stage 1: offline mT5 ONNX summary per article.
      4. Stage 2: Groq polishing per article (with a rate-limit pause).
      5. Write the summarized batch under summarized-articles/hindi/.

    Exits non-zero on missing file, missing GROQ_API_KEY, or missing model.
    Fixes vs. original: removed the unused `article_id` local, replaced
    `getattr(args, "no_dedup", False)` with `args.no_dedup` (argparse always
    sets the attribute), and dropped stray `f` prefixes on placeholder-free
    strings.
    """
    parser = argparse.ArgumentParser(description="Hindi News Summarizer (mT5 ONNX + Groq)")
    parser.add_argument("--file", "-f", required=True, help="Path to the scraped JSON file")
    parser.add_argument("--no-dedup", action="store_true", help="Skip Supabase deduplication registry check")
    args = parser.parse_args()

    input_file = Path(args.file)
    if not input_file.exists():
        Log.error(f"File not found: {args.file}")
        sys.exit(1)

    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)
    if not articles:
        Log.warning(f"No articles found in {input_file.name}")
        sys.exit(0)

    # Category comes from the first article, normalized for folder names.
    category = articles[0].get("category", "unknown").replace(" ", "_").lower()
    parent_folder, folder_name = resolve_output_layout(input_file, category)

    # Groq API key is mandatory for stage 2 polishing.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        Log.error("GROQ_API_KEY environment variable not found.")
        print("Please set it in .env or as an environment variable.")
        sys.exit(1)

    # Load the exported ONNX model for stage 1.
    model_dir = get_project_root() / "models" / "mt5_onnx"
    if not model_dir.exists():
        Log.error(f"mT5 model directory not found at {model_dir}")
        print("Run: python backend/models/export_mt5.py")
        sys.exit(1)
    print(f"\n{Colors.CYAN}Loading mT5 ONNX model and tokenizer...{Colors.RESET}", end=" ", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_dir)
    print(f"{Colors.GREEN}V Loaded{Colors.RESET}\n")

    # Set up output directories.
    timestamp_folder = get_timestamp_folder()
    summ_dir = build_output_dir(parent_folder, folder_name)

    # Deduplication: drop articles whose ids are already in the registry.
    total_scraped = len(articles)
    already_processed_count = 0
    if HAS_DB and not args.no_dedup:
        db = DatabaseManager()
        article_ids = [a.get("id") for a in articles if a.get("id")]
        existing_ids = db.check_registry(article_ids)
        if existing_ids:
            already_processed_count = len(existing_ids)
            articles = [a for a in articles if a.get("id") not in existing_ids]

    total_articles = len(articles)
    print(f"\n{Colors.BOLD}{Colors.CYAN}--- Processing Funnel ---{Colors.RESET}")
    print(f"Total scraped : {total_scraped}")
    if args.no_dedup:
        print(f"Deduplication : {Colors.YELLOW}DISABLED (--no-dedup){Colors.RESET}")
    else:
        print(f"Already processed : {already_processed_count}")
    print(f"New articles to do: {Colors.GREEN}{total_articles}{Colors.RESET}")
    print(f"{Colors.BOLD}{Colors.CYAN}-------------------------{Colors.RESET}\n")
    if total_articles == 0:
        Log.info("All articles in this batch have already been processed. Exiting.")
        sys.exit(0)

    processed_articles = []
    for idx, article in enumerate(articles, 1):
        title = article.get("title", "No Title")
        content = article.get("content", "")
        # The console may not handle Devanagari (e.g. Windows code pages);
        # degrade the displayed title gracefully instead of crashing.
        try:
            safe_title = title[:50].encode(sys.stdout.encoding, errors='replace').decode(sys.stdout.encoding)
        except Exception:
            safe_title = "Hindi Article"
        print(f"{Colors.BOLD}[{idx}/{total_articles}] Article: {Colors.CYAN}{safe_title}...{Colors.RESET}")
        if not content:
            print(f" {Colors.YELLOW}β Skipped (No content){Colors.RESET}\n")
            continue

        # Stage 1: offline mT5 summarization.
        print(" β Summarizing with mT5 offline...", end=" ", flush=True)
        raw_summary = summarize_with_mt5(content, model, tokenizer)
        print(f"{Colors.GREEN}V{Colors.RESET}")

        # Stage 2: Groq polishing.
        print(" β Polishing with Groq (Llama 3.3)...", end=" ", flush=True)
        polished_summary = polish_with_groq(api_key, raw_summary)
        print(f"{Colors.GREEN}V{Colors.RESET}")

        # Pause between API calls to respect Groq free-tier rate limits.
        time.sleep(3)

        # Attach both stages' output to the article record.
        article['raw_mt5_summary'] = raw_summary
        article['summary'] = polished_summary
        article['summarized'] = True
        article['summary_generated_at'] = datetime.now().isoformat()
        processed_articles.append(article)
        print("-" * 60)

    # Persist the summarized batch (ensure_ascii=False keeps Hindi readable).
    output_json_path = summ_dir / f"{timestamp_folder}.json"
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(processed_articles, f, indent=2, ensure_ascii=False)
    print(f"\n{Colors.GREEN}{Colors.BOLD}Hindi Summarization Complete!{Colors.RESET}")
    print(f"Summarized JSON: {Colors.CYAN}{output_json_path.relative_to(get_project_root())}{Colors.RESET}\n")
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to stop a long batch — exit cleanly.
        print(f"\n\n{Colors.YELLOW}Summarization cancelled by user{Colors.RESET}\n")
        sys.exit(0)
    except Exception as e:
        # Log the failure, then re-raise so the traceback reaches the caller.
        Log.error(f"Critical error: {str(e)}")
        raise