""" Department 3 — Translator Primary : NLLB-200-distilled-1.3B (Meta) — free local Fallback : Google Translate (deep-translator) FIXES APPLIED: - Added Telugu/Indic sentence ending (।) to sentence splitter regex - Reduced chunk size to 50 words for Indic languages (subword tokenization) - Improved summary: uses position scoring (first + last = most informative) instead of just picking longest sentences (which picked run-ons) """ import re import time import logging logger = logging.getLogger(__name__) NLLB_CODES = { "en": "eng_Latn", "te": "tel_Telu", "hi": "hin_Deva", "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan", "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn", "ru": "rus_Cyrl", } # FIX: Indic languages use subword tokenization — fewer words fit in 512 tokens INDIC_LANGS = {"te", "hi", "ta", "kn", "ar"} CHUNK_WORDS = 80 # default for Latin-script languages CHUNK_WORDS_INDIC = 50 # reduced for Indic/RTL languages MODEL_ID = "facebook/nllb-200-distilled-1.3B" MAX_TOKENS = 512 class Translator: def __init__(self): self._pipeline = None self._tokenizer = None self._model = None self._nllb_loaded = False print("[Translator] Ready (NLLB loads on first use)") # ══════════════════════════════════════════════════════════════════ # PUBLIC — TRANSLATE # ══════════════════════════════════════════════════════════════════ def translate(self, text: str, src_lang: str, tgt_lang: str): if not text or not text.strip(): return "", "skipped (empty)" if src_lang == tgt_lang: return text, "skipped (same language)" if not self._nllb_loaded: self._init_nllb() self._nllb_loaded = True # FIX: Use smaller chunks for Indic languages max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS chunks = self._chunk(text, max_words) print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars") if self._pipeline is not None or self._model is not None: try: return self._nllb_chunks(chunks, src_lang, tgt_lang) except Exception as e: logger.warning(f"NLLB failed ({e}), using Google") return self._google_chunks(chunks, src_lang, tgt_lang) # ══════════════════════════════════════════════════════════════════ # PUBLIC — SUMMARIZE — FIXED # ══════════════════════════════════════════════════════════════════ def summarize(self, text: str, max_sentences: int = 5) -> str: """ FIX: Improved extractive summary using position scoring. Old approach: picked longest sentences → grabbed run-ons / filler. New approach: scores by position (first & last = high value) + length bonus (medium-length sentences preferred). Research basis: TextRank & lead-3 heuristics consistently show that sentence position is a stronger signal than length alone. """ try: # FIX: Include Telugu sentence ending (।) in splitter sentences = re.split(r'(?<=[.!?।])\s+', text.strip()) sentences = [s.strip() for s in sentences if len(s.split()) > 5] if len(sentences) <= max_sentences: return text n = len(sentences) # Score each sentence: position + length bonus def score(idx, sent): pos_score = 0.0 if idx == 0: pos_score = 1.0 # first sentence = highest value elif idx == n - 1: pos_score = 0.7 # last sentence = conclusion elif idx <= n * 0.2: pos_score = 0.6 # early sentences else: pos_score = 0.3 # middle sentences # Prefer medium-length sentences (not too short, not run-ons) word_count = len(sent.split()) if 10 <= word_count <= 30: len_bonus = 0.3 elif word_count < 10: len_bonus = 0.0 else: len_bonus = 0.1 # penalize very long run-ons return pos_score + len_bonus scored = sorted( enumerate(sentences), key=lambda x: score(x[0], x[1]), reverse=True ) top_indices = sorted([i for i, _ in scored[:max_sentences]]) summary = " ".join(sentences[i] for i in top_indices) return summary.strip() except Exception as e: logger.warning(f"Summarize failed: {e}") return text[:800] + "..." # ══════════════════════════════════════════════════════════════════ # CHUNKING — FIXED (Telugu sentence ending added) # ══════════════════════════════════════════════════════════════════ def _chunk(self, text, max_words): # FIX: Added । (Devanagari/Telugu danda) to sentence split pattern sentences = re.split(r'(?<=[.!?।])\s+', text.strip()) chunks, cur, count = [], [], 0 for s in sentences: w = len(s.split()) if count + w > max_words and cur: chunks.append(" ".join(cur)) cur, count = [], 0 cur.append(s) count += w if cur: chunks.append(" ".join(cur)) return chunks # ══════════════════════════════════════════════════════════════════ # NLLB TRANSLATION # ══════════════════════════════════════════════════════════════════ def _nllb_chunks(self, chunks, src_lang, tgt_lang): t0 = time.time() src_code = NLLB_CODES.get(src_lang, "eng_Latn") tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu") results = [] for i, chunk in enumerate(chunks): if not chunk.strip(): continue try: if self._pipeline is not None: out = self._pipeline( chunk, src_lang=src_code, tgt_lang=tgt_code, max_length=MAX_TOKENS, ) results.append(out[0]["translation_text"]) else: import torch inputs = self._tokenizer( chunk, return_tensors="pt", padding=True, truncation=True, max_length=MAX_TOKENS, ) if torch.cuda.is_available(): inputs = {k: v.cuda() for k, v in inputs.items()} tid = self._tokenizer.convert_tokens_to_ids(tgt_code) with torch.no_grad(): ids = self._model.generate( **inputs, forced_bos_token_id=tid, max_length=MAX_TOKENS, num_beams=4, early_stopping=True, ) results.append( self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0]) except Exception as e: logger.warning(f"Chunk {i+1} NLLB failed: {e}") results.append(chunk) translated = " ".join(results) logger.info(f"NLLB done in {time.time()-t0:.2f}s") return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)" # ══════════════════════════════════════════════════════════════════ # GOOGLE FALLBACK # ══════════════════════════════════════════════════════════════════ def _google_chunks(self, chunks, src_lang, tgt_lang): t0 = time.time() try: from deep_translator import GoogleTranslator results = [] for chunk in chunks: if not chunk.strip(): continue out = GoogleTranslator( source=src_lang if src_lang != "auto" else "auto", target=tgt_lang, ).translate(chunk) results.append(out) full = " ".join(results) logger.info(f"Google done in {time.time()-t0:.2f}s") return full, f"Google Translate ({len(chunks)} chunks)" except Exception as e: logger.error(f"Google failed: {e}") return f"[Translation failed: {e}]", "error" # ══════════════════════════════════════════════════════════════════ # NLLB INIT # ══════════════════════════════════════════════════════════════════ def _init_nllb(self): try: from transformers import pipeline as hf_pipeline self._pipeline = hf_pipeline( "translation", model=MODEL_ID, device_map="auto", max_length=MAX_TOKENS, ) print(f"[Translator] ✅ {MODEL_ID} pipeline ready") except Exception as e: logger.warning(f"Pipeline init failed ({e}), trying manual load") self._init_nllb_manual() def _init_nllb_manual(self): try: from transformers import AutoTokenizer, AutoModelForSeq2SeqLM import torch self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) self._model = AutoModelForSeq2SeqLM.from_pretrained( MODEL_ID, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, ) if torch.cuda.is_available(): self._model = self._model.cuda() self._model.eval() print(f"[Translator] ✅ {MODEL_ID} manual load ready") except Exception as e: logger.error(f"NLLB manual load failed: {e}")