"""
Department 3 - Translator
Primary : NLLB-200-distilled-1.3B (Meta)
Fallback : deep-translator (Google Translate)
UPGRADED:
- Text chunking for long transcripts (fixes repetition bug)
- Splits by sentence, translates in 400-token chunks
- Rejoins cleanly into full translation
"""
import time
import logging

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# ISO-639-1 language code -> NLLB/FLORES-200 language code.
# Codes missing from this table fall back to the defaults chosen in
# Translator._translate_chunks_nllb (eng_Latn source, tel_Telu target).
NLLB_CODES = {
    "en": "eng_Latn",
    "te": "tel_Telu",
    "hi": "hin_Deva",
    "ta": "tam_Taml",
    "kn": "kan_Knda",
    "es": "spa_Latn",
    "fr": "fra_Latn",
    "de": "deu_Latn",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
}

# Hugging Face model id of the primary translation model.
MODEL_ID = "facebook/nllb-200-distilled-1.3B"
# Hard cap on tokens per generate() call / tokenized chunk.
MAX_LENGTH = 512
CHUNK_WORDS = 80  # ~400 tokens, safe for NLLB
class Translator:
    """Department 3 translator: NLLB-200 primary, Google Translate fallback.

    The NLLB model is loaded lazily on the first translate() call. Long
    input is split into sentence-aware chunks of at most CHUNK_WORDS words
    so a single generate() call never exceeds the MAX_LENGTH token budget
    (prevents the repetition/truncation failure mode on long transcripts).
    """

    def __init__(self):
        # Lazy backends: after first use either `_pipeline` (HF translation
        # pipeline) or the `_tokenizer`/`_model` pair (manual fallback
        # loader) is populated. `_nllb_loaded` guards against re-attempting
        # a load that already failed.
        self._pipeline = None
        self._tokenizer = None
        self._model = None
        self._nllb_loaded = False
        print("[Translator] Ready (NLLB loads on first use)")

    # ── Public ───────────────────────────────────────────────────────
    def translate(self, text: str, src_lang: str, tgt_lang: str):
        """Translate `text` from `src_lang` to `tgt_lang` (ISO-639-1 codes).

        Returns a `(translated_text, engine_description)` tuple. Empty and
        same-language inputs short-circuit without loading any model.
        Language codes unknown to NLLB_CODES fall back to en -> te on the
        NLLB path; the Google path receives the codes unchanged (and
        accepts "auto" as source).
        """
        if not text or not text.strip():
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"
        # Load NLLB on first use. _init_nllb never raises, so the flag is
        # set unconditionally — a failed load is not retried on every call.
        if not self._nllb_loaded:
            self._init_nllb()
            self._nllb_loaded = True
        # Split long text into model-sized chunks.
        chunks = self._split_into_chunks(text, CHUNK_WORDS)
        print(f"[Translator] Translating {len(chunks)} chunks ({len(text)} chars)")
        if self._pipeline is not None or self._model is not None:
            try:
                return self._translate_chunks_nllb(chunks, src_lang, tgt_lang)
            except Exception as e:
                logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")
        return self._translate_chunks_google(chunks, src_lang, tgt_lang)

    # ── Chunking ─────────────────────────────────────────────────────
    def _split_into_chunks(self, text: str, max_words: int):
        """Split text into sentence-aware chunks of at most max_words words.

        Sentences are kept intact when possible. BUGFIX: a single sentence
        longer than max_words used to be emitted as one oversized chunk
        (which could blow past the token budget and reintroduce the
        repetition bug); it is now hard-split on word boundaries.
        """
        import re
        # Split on whitespace that follows sentence-ending punctuation.
        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
        chunks = []
        current = []
        count = 0
        for sentence in sentences:
            words = sentence.split()
            if len(words) > max_words:
                # Flush whatever was accumulated, then hard-split the
                # oversized sentence into max_words-sized pieces.
                if current:
                    chunks.append(" ".join(current))
                    current = []
                    count = 0
                for i in range(0, len(words), max_words):
                    chunks.append(" ".join(words[i:i + max_words]))
                continue
            if count + len(words) > max_words and current:
                chunks.append(" ".join(current))
                current = []
                count = 0
            current.append(sentence)
            count += len(words)
        if current:
            chunks.append(" ".join(current))
        return chunks

    # ── NLLB chunked translation ─────────────────────────────────────
    def _translate_chunks_nllb(self, chunks, src_lang, tgt_lang):
        """Translate chunks with NLLB; a failed chunk keeps its source text.

        Returns `(joined_translation, engine_description)`.
        """
        t0 = time.time()
        results = []
        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            try:
                if self._pipeline is not None:
                    result = self._pipeline(
                        chunk,
                        src_lang=src_code,
                        tgt_lang=tgt_code,
                        max_length=MAX_LENGTH,
                    )
                    results.append(result[0]["translation_text"])
                else:
                    import torch
                    # BUGFIX: the NLLB tokenizer must be told the source
                    # language before encoding; otherwise it defaults to
                    # eng_Latn and mistranslates non-English input.
                    self._tokenizer.src_lang = src_code
                    inputs = self._tokenizer(
                        chunk,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=MAX_LENGTH,
                    )
                    if torch.cuda.is_available():
                        inputs = {k: v.cuda() for k, v in inputs.items()}
                    # Force generation to begin with the target-language tag.
                    tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
                    with torch.no_grad():
                        output_ids = self._model.generate(
                            **inputs,
                            forced_bos_token_id=tgt_lang_id,
                            max_length=MAX_LENGTH,
                            num_beams=4,
                            early_stopping=True,
                        )
                    translated = self._tokenizer.batch_decode(
                        output_ids, skip_special_tokens=True)[0]
                    results.append(translated)
            except Exception as e:
                logger.warning(f"[Translator] Chunk {i+1} failed: {e}")
                results.append(chunk)  # fallback: keep original
        translated = " ".join(results)
        elapsed = time.time() - t0
        logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code}->{tgt_code}")
        print(f"[Translator] β Done in {elapsed:.2f}s ({len(chunks)} chunks)")
        return translated, f"NLLB-200-distilled-1.3B ({len(chunks)} chunks)"

    # ── Google chunked translation ───────────────────────────────────
    def _translate_chunks_google(self, chunks, src_lang, tgt_lang):
        """Translate chunks via deep-translator (Google).

        Returns `(joined_translation, engine_description)`, or an error
        tuple `("[Translation failed: ...]", "error")` if the whole
        fallback fails.
        """
        t0 = time.time()
        try:
            from deep_translator import GoogleTranslator
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                translated = GoogleTranslator(
                    source=src_lang if src_lang != "auto" else "auto",
                    target=tgt_lang,
                ).translate(chunk)
                # BUGFIX: deep-translator can return None for a chunk;
                # joining None raised TypeError and failed the whole call.
                # Keep the source text for that chunk instead.
                results.append(translated if translated else chunk)
            full = " ".join(results)
            logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error(f"[Translator] Google fallback failed: {e}")
            return f"[Translation failed: {str(e)}]", "error"

    # ── NLLB init ────────────────────────────────────────────────────
    def _init_nllb(self):
        """Try the HF translation pipeline; fall back to a manual load.

        Never raises — failures are logged and leave the backends unset so
        translate() routes to Google.
        """
        try:
            from transformers import pipeline as hf_pipeline
            self._pipeline = hf_pipeline(
                "translation",
                model=MODEL_ID,
                device_map="auto",
                max_length=MAX_LENGTH,
            )
            print(f"[Translator] β {MODEL_ID} loaded")
        except Exception as e:
            logger.warning(f"[Translator] Pipeline init failed: {e}, trying manual...")
            self._init_nllb_manual()

    def _init_nllb_manual(self):
        """Load tokenizer + model directly; fp16 on CUDA, fp32 on CPU."""
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch
            self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            if torch.cuda.is_available():
                self._model = self._model.cuda()
            self._model.eval()
            print(f"[Translator] β {MODEL_ID} loaded manually")
        except Exception as e:
            logger.error(f"[Translator] NLLB manual load failed: {e}")
            self._model = None