"""Department 3 - Translator.

Primary  : NLLB-200-distilled-1.3B (Meta)
Fallback : deep-translator (Google Translate)

Long transcripts are translated in chunks, which fixes the repetition bug
seen when NLLB is handed very long inputs:

- the text is split on sentence boundaries,
- sentences are packed into chunks of at most ``CHUNK_WORDS`` words,
- the translated chunks are re-joined into the full translation.
"""

import logging
import re
import time
from typing import List, Tuple

logger = logging.getLogger(__name__)

# Two-letter language codes accepted by Translator -> NLLB language codes.
NLLB_CODES = {
    "en": "eng_Latn",
    "te": "tel_Telu",
    "hi": "hin_Deva",
    "ta": "tam_Taml",
    "kn": "kan_Knda",
    "es": "spa_Latn",
    "fr": "fra_Latn",
    "de": "deu_Latn",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
}

MODEL_ID = "facebook/nllb-200-distilled-1.3B"
MAX_LENGTH = 512
CHUNK_WORDS = 80  # ~400 tokens, safe for NLLB

# Whitespace that directly follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


class Translator:
    """Translate text with NLLB-200, falling back to Google Translate.

    The NLLB model is loaded lazily on the first ``translate`` call. If it
    cannot be loaded, or it fails on every chunk of a request, the request is
    served by deep-translator's ``GoogleTranslator`` instead.
    """

    def __init__(self):
        self._pipeline = None      # transformers translation pipeline, if loaded
        self._tokenizer = None     # tokenizer for the manual (non-pipeline) path
        self._model = None         # seq2seq model for the manual path
        self._nllb_loaded = False  # True once a load attempt has been made
        print("[Translator] Ready (NLLB loads on first use)")

    # ── Public ───────────────────────────────────────────────────────
    def translate(self, text: str, src_lang: str, tgt_lang: str) -> Tuple[str, str]:
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Args:
            text: Text to translate.
            src_lang: Source language code (a key of ``NLLB_CODES``).
            tgt_lang: Target language code (a key of ``NLLB_CODES``).

        Returns:
            A ``(translated_text, backend_description)`` tuple. Empty input
            and identical source/target languages are returned untranslated
            with a ``"skipped (...)"`` description.
        """
        if not text or not text.strip():
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"

        # Load NLLB on first use; the flag records the attempt, not success,
        # so a failed load is not retried on every call.
        if not self._nllb_loaded:
            self._init_nllb()
            self._nllb_loaded = True

        chunks = self._split_into_chunks(text, CHUNK_WORDS)
        print(f"[Translator] Translating {len(chunks)} chunks ({len(text)} chars)")

        if self._pipeline is not None or self._model is not None:
            try:
                return self._translate_chunks_nllb(chunks, src_lang, tgt_lang)
            except Exception as e:
                logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")

        return self._translate_chunks_google(chunks, src_lang, tgt_lang)

    # ── Chunking ─────────────────────────────────────────────────────
    def _split_into_chunks(self, text: str, max_words: int) -> List[str]:
        """Split ``text`` into sentence-aware chunks of at most ``max_words`` words.

        Whole sentences are packed greedily into a chunk. A single sentence
        longer than ``max_words`` (for example an unpunctuated transcript) is
        hard-split on word boundaries so that no chunk exceeds the limit.

        Words are whitespace-delimited and sentences end in ``.``, ``!`` or
        ``?``; text in scripts without those conventions is not subdivided.

        Raises:
            ValueError: If ``max_words`` is smaller than 1.
        """
        if max_words < 1:
            raise ValueError(f"max_words must be >= 1, got {max_words}")

        chunks = []
        current = []
        count = 0

        for sentence in _SENTENCE_BOUNDARY.split(text.strip()):
            words = sentence.split()
            if not words:
                continue

            if len(words) > max_words:
                # Oversized sentence: flush what is pending, then cut the
                # sentence itself into max_words-sized pieces.
                if current:
                    chunks.append(" ".join(current))
                    current = []
                    count = 0
                for start in range(0, len(words), max_words):
                    chunks.append(" ".join(words[start:start + max_words]))
                continue

            if current and count + len(words) > max_words:
                chunks.append(" ".join(current))
                current = []
                count = 0

            current.append(sentence)
            count += len(words)

        if current:
            chunks.append(" ".join(current))

        return chunks

    # ── NLLB chunked translation ──────────────────────────────────────
    def _translate_chunks_nllb(self, chunks, src_lang, tgt_lang) -> Tuple[str, str]:
        """Translate ``chunks`` with NLLB and join the results with spaces.

        A chunk that fails is kept untranslated so one bad chunk does not
        discard the rest. If every attempted chunk fails the error is raised
        so that the caller can fall back to another backend.

        Raises:
            ValueError: If ``tgt_lang`` has no entry in ``NLLB_CODES``.
            RuntimeError: If NLLB failed on every non-blank chunk.
        """
        if tgt_lang not in NLLB_CODES:
            # Guessing a target would silently produce the wrong language.
            raise ValueError(f"NLLB has no code for target language {tgt_lang!r}")
        tgt_code = NLLB_CODES[tgt_lang]

        src_code = NLLB_CODES.get(src_lang)
        if src_code is None:
            src_code = "eng_Latn"
            logger.warning(
                f"[Translator] Unknown source language {src_lang!r}, assuming {src_code}"
            )

        t0 = time.time()
        results = []
        attempted = 0
        failed = 0
        last_error = None

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            attempted += 1
            try:
                results.append(self._translate_one_nllb(chunk, src_code, tgt_code))
            except Exception as e:
                failed += 1
                last_error = e
                logger.warning(f"[Translator] Chunk {i+1} failed: {e}")
                results.append(chunk)  # fallback: keep original

        if attempted and failed == attempted:
            # Nothing was translated; do not report source text as a success.
            raise RuntimeError("NLLB failed on every chunk") from last_error

        translated = " ".join(results)
        elapsed = time.time() - t0
        logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code}->{tgt_code}")
        print(f"[Translator] ✅ Done in {elapsed:.2f}s ({len(chunks)} chunks)")
        return translated, f"NLLB-200-distilled-1.3B ({len(chunks)} chunks)"

    def _translate_one_nllb(self, chunk, src_code, tgt_code) -> str:
        """Translate one chunk via the pipeline if loaded, else the manual model."""
        if self._pipeline is not None:
            result = self._pipeline(
                chunk,
                src_lang=src_code,
                tgt_lang=tgt_code,
                max_length=MAX_LENGTH,
            )
            return result[0]["translation_text"]

        import torch

        # Without this the tokenizer keeps its default source language and
        # the requested one is ignored.
        self._tokenizer.src_lang = src_code
        inputs = self._tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Forcing the first generated token to the target-language code is
        # what selects the output language.
        tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
        with torch.no_grad():
            output_ids = self._model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=MAX_LENGTH,
                num_beams=4,
                early_stopping=True,
            )
        return self._tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

    # ── Google chunked translation ────────────────────────────────────
    def _translate_chunks_google(self, chunks, src_lang, tgt_lang) -> Tuple[str, str]:
        """Translate ``chunks`` with deep-translator's ``GoogleTranslator``.

        Never raises: on any failure the returned text is a bracketed error
        message and the backend description is ``"error"``.
        """
        t0 = time.time()
        try:
            from deep_translator import GoogleTranslator

            translator = GoogleTranslator(source=src_lang, target=tgt_lang)
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                translated = translator.translate(chunk)
                # Keep the source text if the service returns no string, so
                # one chunk cannot break the join below.
                results.append(translated if isinstance(translated, str) else chunk)

            full = " ".join(results)
            logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error(f"[Translator] Google fallback failed: {e}")
            return f"[Translation failed: {str(e)}]", "error"

    # ── NLLB init ────────────────────────────────────────────────────
    def _init_nllb(self):
        """Load NLLB as a transformers pipeline, else try the manual load."""
        try:
            from transformers import pipeline as hf_pipeline

            self._pipeline = hf_pipeline(
                "translation",
                model=MODEL_ID,
                device_map="auto",
                max_length=MAX_LENGTH,
            )
            print(f"[Translator] ✅ {MODEL_ID} loaded")
        except Exception as e:
            logger.warning(f"[Translator] Pipeline init failed: {e}, trying manual...")
            self._init_nllb_manual()

    def _init_nllb_manual(self):
        """Load tokenizer and model directly; leave both ``None`` on failure."""
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch

            self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            if torch.cuda.is_available():
                self._model = self._model.cuda()
            self._model.eval()
            print(f"[Translator] ✅ {MODEL_ID} loaded manually")
        except Exception as e:
            logger.error(f"[Translator] NLLB manual load failed: {e}")
            # Clear both so a half-loaded state is never used.
            self._tokenizer = None
            self._model = None