Spaces:

testingfaces
/

clearwave-ai

Running

App Files Files Community

testingfaces committed on 14 days ago

Commit

e86fdec

verified ·

1 Parent(s): e4cbaa4

Update translator.py

Browse files

Files changed (1) hide show

translator.py +121 -78

translator.py CHANGED Viewed

@@ -1,7 +1,12 @@
 """
 Department 3 - Translator
-Primary  : NLLB-200-distilled-1.3B (Meta) — ✅ UPGRADED from 600M for better accuracy
-Fallback : deep-translator (Google Translate) if NLLB fails
 """
 import time
@@ -25,10 +30,9 @@ NLLB_CODES = {
     "ru": "rus_Cyrl",
 }
-# ✅ UPGRADED: 1.3B is significantly more accurate than 600M,
-#    especially for Telugu, Tamil, Kannada — still runs free on HF CPU
-MODEL_ID   = "facebook/nllb-200-distilled-1.3B"
-MAX_LENGTH = 512
 class Translator:
@@ -37,35 +41,131 @@ class Translator:
         self._tokenizer   = None
         self._model       = None
         self._nllb_loaded = False
-        # ✅ LAZY LOAD: Don't load 2.5GB model on startup
-        # Loads automatically on first translation request instead
         print("[Translator] Ready (NLLB loads on first use)")
-    # ── Public ──────────────────────────────────────────────────────
     def translate(self, text: str, src_lang: str, tgt_lang: str):
-        """
-        Returns (translated_text, method_label).
-        src_lang / tgt_lang are 2-letter codes (en, te, hi, ...).
-        """
         if not text or not text.strip():
             return "", "skipped (empty)"
         if src_lang == tgt_lang:
             return text, "skipped (same language)"
         if not self._nllb_loaded:
             self._init_nllb()
             self._nllb_loaded = True
         if self._pipeline is not None or self._model is not None:
             try:
-                return self._translate_nllb(text, src_lang, tgt_lang)
             except Exception as e:
                 logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")
-        return self._translate_google(text, src_lang, tgt_lang)
-    # ── NLLB-200 ────────────────────────────────────────────────────
     def _init_nllb(self):
         try:
             from transformers import pipeline as hf_pipeline
@@ -75,9 +175,9 @@ class Translator:
                 device_map="auto",
                 max_length=MAX_LENGTH,
             )
-            print(f"[Translator] ✅ {MODEL_ID} loaded via pipeline")
         except Exception as e:
-            logger.warning(f"[Translator] pipeline init failed, trying manual load: {e}")
             self._init_nllb_manual()
     def _init_nllb_manual(self):
@@ -94,62 +194,5 @@ class Translator:
             self._model.eval()
             print(f"[Translator] ✅ {MODEL_ID} loaded manually")
         except Exception as e:
-            logger.error(f"[Translator] NLLB manual load also failed: {e}")
-            self._model = None
-    def _translate_nllb(self, text: str, src_lang: str, tgt_lang: str):
-        t0 = time.time()
-        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
-        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
-        if self._pipeline is not None:
-            result     = self._pipeline(
-                text,
-                src_lang=src_code,
-                tgt_lang=tgt_code,
-                max_length=MAX_LENGTH,
-            )
-            translated = result[0]["translation_text"]
-        else:
-            import torch
-            inputs = self._tokenizer(
-                text,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=MAX_LENGTH,
-            )
-            if torch.cuda.is_available():
-                inputs = {k: v.cuda() for k, v in inputs.items()}
-            tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
-            with torch.no_grad():
-                output_ids = self._model.generate(
-                    **inputs,
-                    forced_bos_token_id=tgt_lang_id,
-                    max_length=MAX_LENGTH,
-                    num_beams=4,
-                    early_stopping=True,
-                )
-            translated = self._tokenizer.batch_decode(
-                output_ids, skip_special_tokens=True
-            )[0]
-        elapsed = time.time() - t0
-        logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code} -> {tgt_code}")
-        return translated, "NLLB-200-distilled-1.3B"
-    # ── Google Translate fallback ────────────────────────────────────
-    def _translate_google(self, text: str, src_lang: str, tgt_lang: str):
-        t0 = time.time()
-        try:
-            from deep_translator import GoogleTranslator
-            translated = GoogleTranslator(
-                source=src_lang if src_lang != "auto" else "auto",
-                target=tgt_lang,
-            ).translate(text)
-            logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
-            return translated, "Google Translate (fallback)"
-        except Exception as e:
-            logger.error(f"[Translator] Google fallback also failed: {e}")
-            return f"[Translation failed: {str(e)}]", "error"

 """
 Department 3 - Translator
+Primary  : NLLB-200-distilled-1.3B (Meta)
+Fallback : deep-translator (Google Translate)
+✅ UPGRADED:
+  - Text chunking for long transcripts (fixes repetition bug)
+  - Splits by sentence, translates in 400-token chunks
+  - Rejoins cleanly into full translation
 """
 import time
     "ru": "rus_Cyrl",
 }
+MODEL_ID      = "facebook/nllb-200-distilled-1.3B"
+MAX_LENGTH    = 512
+CHUNK_WORDS   = 80  # ~400 tokens, safe for NLLB
 class Translator:
         self._tokenizer   = None
         self._model       = None
         self._nllb_loaded = False
         print("[Translator] Ready (NLLB loads on first use)")
+    # ── Public ───────────────────────────────────────────────────────
     def translate(self, text: str, src_lang: str, tgt_lang: str):
         if not text or not text.strip():
             return "", "skipped (empty)"
         if src_lang == tgt_lang:
             return text, "skipped (same language)"
+        # Load NLLB on first use
         if not self._nllb_loaded:
             self._init_nllb()
             self._nllb_loaded = True
+        # Split long text into chunks
+        chunks = self._split_into_chunks(text, CHUNK_WORDS)
+        print(f"[Translator] Translating {len(chunks)} chunks ({len(text)} chars)")
         if self._pipeline is not None or self._model is not None:
             try:
+                return self._translate_chunks_nllb(chunks, src_lang, tgt_lang)
             except Exception as e:
                 logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")
+        return self._translate_chunks_google(chunks, src_lang, tgt_lang)
+    # ── Chunking ─────────────────────────────────────────────────────
+    def _split_into_chunks(self, text: str, max_words: int):
+        """Split text into sentence-aware chunks of max_words words."""
+        # Split by sentence endings
+        import re
+        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+        chunks  = []
+        current = []
+        count   = 0
+        for sentence in sentences:
+            words = sentence.split()
+            if count + len(words) > max_words and current:
+                chunks.append(" ".join(current))
+                current = []
+                count   = 0
+            current.append(sentence)
+            count += len(words)
+        if current:
+            chunks.append(" ".join(current))
+        return chunks
+    # ── NLLB chunked translation ──────────────────────────────────────
+    def _translate_chunks_nllb(self, chunks, src_lang, tgt_lang):
+        t0       = time.time()
+        results  = []
+        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
+        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
+        for i, chunk in enumerate(chunks):
+            if not chunk.strip():
+                continue
+            try:
+                if self._pipeline is not None:
+                    result = self._pipeline(
+                        chunk,
+                        src_lang=src_code,
+                        tgt_lang=tgt_code,
+                        max_length=MAX_LENGTH,
+                    )
+                    results.append(result[0]["translation_text"])
+                else:
+                    import torch
+                    inputs = self._tokenizer(
+                        chunk,
+                        return_tensors="pt",
+                        padding=True,
+                        truncation=True,
+                        max_length=MAX_LENGTH,
+                    )
+                    if torch.cuda.is_available():
+                        inputs = {k: v.cuda() for k, v in inputs.items()}
+                    tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
+                    with torch.no_grad():
+                        output_ids = self._model.generate(
+                            **inputs,
+                            forced_bos_token_id=tgt_lang_id,
+                            max_length=MAX_LENGTH,
+                            num_beams=4,
+                            early_stopping=True,
+                        )
+                    translated = self._tokenizer.batch_decode(
+                        output_ids, skip_special_tokens=True)[0]
+                    results.append(translated)
+            except Exception as e:
+                logger.warning(f"[Translator] Chunk {i+1} failed: {e}")
+                results.append(chunk)  # fallback: keep original
+        translated = " ".join(results)
+        elapsed    = time.time() - t0
+        logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code}->{tgt_code}")
+        print(f"[Translator] ✅ Done in {elapsed:.2f}s ({len(chunks)} chunks)")
+        return translated, f"NLLB-200-distilled-1.3B ({len(chunks)} chunks)"
+    # ── Google chunked translation ────────────────────────────────────
+    def _translate_chunks_google(self, chunks, src_lang, tgt_lang):
+        t0 = time.time()
+        try:
+            from deep_translator import GoogleTranslator
+            results = []
+            for chunk in chunks:
+                if not chunk.strip():
+                    continue
+                translated = GoogleTranslator(
+                    source=src_lang if src_lang != "auto" else "auto",
+                    target=tgt_lang,
+                ).translate(chunk)
+                results.append(translated)
+            full = " ".join(results)
+            logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
+            return full, f"Google Translate ({len(chunks)} chunks)"
+        except Exception as e:
+            logger.error(f"[Translator] Google fallback failed: {e}")
+            return f"[Translation failed: {str(e)}]", "error"
+    # ── NLLB init ────────────────────────────────────────────────────
     def _init_nllb(self):
         try:
             from transformers import pipeline as hf_pipeline
                 device_map="auto",
                 max_length=MAX_LENGTH,
             )
+            print(f"[Translator] ✅ {MODEL_ID} loaded")
         except Exception as e:
+            logger.warning(f"[Translator] Pipeline init failed: {e}, trying manual...")
             self._init_nllb_manual()
     def _init_nllb_manual(self):
             self._model.eval()
             print(f"[Translator] ✅ {MODEL_ID} loaded manually")
         except Exception as e:
+            logger.error(f"[Translator] NLLB manual load failed: {e}")
+            self._model = None