Spaces:

Clearwave48
/

clearwave-api

Running

App Files Files Community

Clearwave48 committed 24 days ago

Commit

47ab527

verified ·

1 Parent(s): 36576b6

Update translator.py

Browse files

Files changed (1) hide show

translator.py +110 -47

translator.py CHANGED Viewed

@@ -1,13 +1,28 @@
 """
-Department 3 — Translator
 Primary  : NLLB-200-distilled-1.3B (Meta) — free local
 Fallback : Google Translate (deep-translator)
-FIXES APPLIED:
   - Added Telugu/Indic sentence ending (।) to sentence splitter regex
   - Reduced chunk size to 50 words for Indic languages (subword tokenization)
   - Improved summary: uses position scoring (first + last = most informative)
     instead of just picking longest sentences (which picked run-ons)
 """
 import re
@@ -21,17 +36,23 @@ NLLB_CODES = {
     "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn",
     "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan",
     "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn",
-    "ru": "rus_Cyrl",
 }
-# FIX: Indic languages use subword tokenization — fewer words fit in 512 tokens
-INDIC_LANGS    = {"te", "hi", "ta", "kn", "ar"}
-CHUNK_WORDS    = 80   # default for Latin-script languages
-CHUNK_WORDS_INDIC = 50  # reduced for Indic/RTL languages
 MODEL_ID   = "facebook/nllb-200-distilled-1.3B"
 MAX_TOKENS = 512
 class Translator:
     def __init__(self):
@@ -45,6 +66,12 @@ class Translator:
     # PUBLIC — TRANSLATE
     # ══════════════════════════════════════════════════════════════════
     def translate(self, text: str, src_lang: str, tgt_lang: str):
         if not text or not text.strip():
             return "", "skipped (empty)"
         if src_lang == tgt_lang:
@@ -54,46 +81,62 @@ class Translator:
             self._init_nllb()
             self._nllb_loaded = True
-        # FIX: Use smaller chunks for Indic languages
         max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
         chunks    = self._chunk(text, max_words)
         print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")
-        if self._pipeline is not None or self._model is not None:
-            try:
-                return self._nllb_chunks(chunks, src_lang, tgt_lang)
-            except Exception as e:
-                logger.warning(f"NLLB failed ({e}), using Google")
-        return self._google_chunks(chunks, src_lang, tgt_lang)
     # ══════════════════════════════════════════════════════════════════
-    # PUBLIC — SUMMARIZE — FIXED
     # ══════════════════════════════════════════════════════════════════
     def summarize(self, text: str, max_sentences: int = 5) -> str:
         """
-        FIX: Improved extractive summary using position scoring.
-        Old approach: picked longest sentences → grabbed run-ons / filler.
-        New approach: scores by position (first & last = high value) +
-                      length bonus (medium-length sentences preferred).
-        Research basis: TextRank & lead-3 heuristics consistently show
-        that sentence position is a stronger signal than length alone.
         """
         try:
-            # FIX: Include Telugu sentence ending (।) in splitter
             sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
             sentences = [s.strip() for s in sentences if len(s.split()) > 5]
             if len(sentences) <= max_sentences:
                 return text
             n = len(sentences)
-            # Score each sentence: position + length bonus
             def score(idx, sent):
-                pos_score = 0.0
                 if idx == 0:
                     pos_score = 1.0    # first sentence = highest value
                 elif idx == n - 1:
@@ -103,35 +146,34 @@ class Translator:
                 else:
                     pos_score = 0.3    # middle sentences
-                # Prefer medium-length sentences (not too short, not run-ons)
-                word_count  = len(sent.split())
                 if 10 <= word_count <= 30:
-                    len_bonus = 0.3
                 elif word_count < 10:
-                    len_bonus = 0.0
                 else:
-                    len_bonus = 0.1   # penalize very long run-ons
                 return pos_score + len_bonus
-            scored = sorted(
-                enumerate(sentences),
-                key=lambda x: score(x[0], x[1]),
-                reverse=True
-            )
             top_indices = sorted([i for i, _ in scored[:max_sentences]])
             summary     = " ".join(sentences[i] for i in top_indices)
             return summary.strip()
         except Exception as e:
-            logger.warning(f"Summarize failed: {e}")
-            return text[:800] + "..."
     # ══════════════════════════════════════════════════════════════════
-    # CHUNKING — FIXED (Telugu sentence ending added)
     # ══════════════════════════════════════════════════════════════════
     def _chunk(self, text, max_words):
-        # FIX: Added । (Devanagari/Telugu danda) to sentence split pattern
         sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
         chunks, cur, count = [], [], 0
         for s in sentences:
@@ -185,13 +227,14 @@ class Translator:
                             early_stopping=True,
                         )
                     results.append(
-                        self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0])
             except Exception as e:
-                logger.warning(f"Chunk {i+1} NLLB failed: {e}")
-                results.append(chunk)
         translated = " ".join(results)
-        logger.info(f"NLLB done in {time.time()-t0:.2f}s")
         return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"
     # ══════════════════════════════════════════════════════════════════
@@ -211,10 +254,10 @@ class Translator:
                 ).translate(chunk)
                 results.append(out)
             full = " ".join(results)
-            logger.info(f"Google done in {time.time()-t0:.2f}s")
             return full, f"Google Translate ({len(chunks)} chunks)"
         except Exception as e:
-            logger.error(f"Google failed: {e}")
             return f"[Translation failed: {e}]", "error"
     # ══════════════════════════════════════════════════════════════════
@@ -229,7 +272,7 @@ class Translator:
             )
             print(f"[Translator] ✅ {MODEL_ID} pipeline ready")
         except Exception as e:
-            logger.warning(f"Pipeline init failed ({e}), trying manual load")
             self._init_nllb_manual()
     def _init_nllb_manual(self):
@@ -237,7 +280,7 @@ class Translator:
             from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
             import torch
             self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                 MODEL_ID,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             )
@@ -246,4 +289,24 @@ class Translator:
             self._model.eval()
             print(f"[Translator] ✅ {MODEL_ID} manual load ready")
         except Exception as e:
-            logger.error(f"NLLB manual load failed: {e}")

 """
+ClearWave — Translator
+=======================
 Primary  : NLLB-200-distilled-1.3B (Meta) — free local
 Fallback : Google Translate (deep-translator)
+FIXES APPLIED (original):
   - Added Telugu/Indic sentence ending (।) to sentence splitter regex
   - Reduced chunk size to 50 words for Indic languages (subword tokenization)
   - Improved summary: uses position scoring (first + last = most informative)
     instead of just picking longest sentences (which picked run-ons)
+BUGS FIXED (v2):
+  [BUG-5] NLLB silently skipped with no log when both _pipeline and _model
+          are None after failed init → impossible to diagnose in production
+          → Fix: explicit warning log before falling through to Google
+  [BUG-6] Unknown src_lang codes from transcriber (e.g. "be" for Bengali
+          due to _norm() fallback) silently defaulted to "eng_Latn" in
+          NLLB_CODES.get(), causing mistranslation with no warning
+          → Fix: warn explicitly when src_lang or tgt_lang not in NLLB_CODES
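+  [BUG-9] summarize() fallback truncated at hard char index 800, cutting
+          mid-sentence and producing incomplete output
+          → Fix: truncate at the last sentence boundary ('.', '!' or '?')
+            found in the second half of the 800-char window; if none is
+            found there, hard-truncate and append "..."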
 """
 import re
     "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn",
     "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan",
     "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn",
+    "ru": "rus_Cyrl", "it": "ita_Latn", "nl": "nld_Latn",
+    "pl": "pol_Latn", "sv": "swe_Latn", "tr": "tur_Latn",
+    "bn": "ben_Beng", "ur": "urd_Arab", "ko": "kor_Hang",
+    "vi": "vie_Latn", "ms": "zsm_Latn", "id": "ind_Latn",
 }
+# Indic and Arabic-script text usually splits into more subword tokens per
+# word than Latin-script text, so fewer words fit in the 512-token limit
+INDIC_LANGS       = {"te", "hi", "ta", "kn", "ar", "bn", "ur"}
+CHUNK_WORDS       = 80   # default for Latin-script languages
+CHUNK_WORDS_INDIC = 50   # reduced for Indic/RTL languages
 MODEL_ID   = "facebook/nllb-200-distilled-1.3B"
 MAX_TOKENS = 512
+# Hard char limit for summarize() fallback truncation
+SUMMARY_FALLBACK_CHARS = 800
 class Translator:
     def __init__(self):
     # PUBLIC — TRANSLATE
     # ══════════════════════════════════════════════════════════════════
     def translate(self, text: str, src_lang: str, tgt_lang: str):
+        """
+        Returns (translated_text, method_label).
+        BUG-6 FIX: warns when src_lang or tgt_lang is not in NLLB_CODES so
+        mistranslation is visible in logs rather than silently defaulting.
+        """
         if not text or not text.strip():
             return "", "skipped (empty)"
         if src_lang == tgt_lang:
             self._init_nllb()
             self._nllb_loaded = True
+        # BUG-6 FIX: warn on unknown language codes before translation attempt
+        if src_lang not in NLLB_CODES:
+            logger.warning(
+                f"[Translator] src_lang '{src_lang}' not in NLLB_CODES — "
+                f"will default to eng_Latn. Add it to NLLB_CODES if incorrect."
+            )
+        if tgt_lang not in NLLB_CODES:
+            logger.warning(
+                f"[Translator] tgt_lang '{tgt_lang}' not in NLLB_CODES — "
+                f"will default to tel_Telu. Add it to NLLB_CODES if incorrect."
+            )
         max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
         chunks    = self._chunk(text, max_words)
         print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")
+        # BUG-5 FIX: explicit log when NLLB is unavailable, not silent skip
+        if self._pipeline is None and self._model is None:
+            logger.warning(
+                "[Translator] NLLB not loaded (init failed) — using Google Translate directly"
+            )
+            return self._google_chunks(chunks, src_lang, tgt_lang)
+        try:
+            return self._nllb_chunks(chunks, src_lang, tgt_lang)
+        except Exception as e:
+            logger.warning(f"[Translator] NLLB failed ({e}) — falling back to Google Translate")
+            return self._google_chunks(chunks, src_lang, tgt_lang)
     # ══════════════════════════════════════════════════════════════════
+    # PUBLIC — SUMMARIZE
     # ══════════════════════════════════════════════════════════════════
     def summarize(self, text: str, max_sentences: int = 5) -> str:
         """
+        Extractive summary using position scoring.
+        Scores by position (first & last = high value) + length bonus
+        (medium-length sentences preferred over run-ons).
+        BUG-9 FIX: if scoring raises, the fallback now truncates at the last
+        sentence boundary within SUMMARY_FALLBACK_CHARS (via _safe_truncate)
+        instead of at a hard char index, avoiding mid-sentence output where
+        a boundary exists.
         """
         try:
+            # Include Telugu/Indic sentence ending (।) in splitter
             sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
             sentences = [s.strip() for s in sentences if len(s.split()) > 5]
+            if not sentences:
+                return text
             if len(sentences) <= max_sentences:
                 return text
             n = len(sentences)
             def score(idx, sent):
                 if idx == 0:
                     pos_score = 1.0    # first sentence = highest value
                 elif idx == n - 1:
                 else:
                     pos_score = 0.3    # middle sentences
+                word_count = len(sent.split())
                 if 10 <= word_count <= 30:
+                    len_bonus = 0.3    # ideal length
                 elif word_count < 10:
+                    len_bonus = 0.0    # too short
                 else:
+                    len_bonus = 0.1    # penalise run-ons
                 return pos_score + len_bonus
+            scored      = sorted(enumerate(sentences), key=lambda x: score(x[0], x[1]), reverse=True)
             top_indices = sorted([i for i, _ in scored[:max_sentences]])
             summary     = " ".join(sentences[i] for i in top_indices)
             return summary.strip()
         except Exception as e:
+            logger.warning(f"[Translator] Summarize failed: {e}")
+            # BUG-9 FIX: truncate at last sentence boundary, not hard char index
+            return self._safe_truncate(text, SUMMARY_FALLBACK_CHARS)
     # ══════════════════════════════════════════════════════════════════
+    # CHUNKING
     # ══════════════════════════════════════════════════════════════════
     def _chunk(self, text, max_words):
+        """
+        Split text into word-count-bounded chunks, respecting sentence
+        boundaries where possible. Handles Indic danda (।) as sentence end.
+        """
         sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
         chunks, cur, count = [], [], 0
         for s in sentences:
                             early_stopping=True,
                         )
                     results.append(
+                        self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
+                    )
             except Exception as e:
+                logger.warning(f"[Translator] Chunk {i+1} NLLB failed: {e} — keeping original")
+                results.append(chunk)   # degrade gracefully per-chunk
         translated = " ".join(results)
+        logger.info(f"[Translator] NLLB done in {time.time()-t0:.2f}s")
         return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"
     # ══════════════════════════════════════════════════════════════════
                 ).translate(chunk)
                 results.append(out)
             full = " ".join(results)
+            logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
             return full, f"Google Translate ({len(chunks)} chunks)"
         except Exception as e:
+            logger.error(f"[Translator] Google failed: {e}")
             return f"[Translation failed: {e}]", "error"
     # ══════════════════════════════════════════════════════════════════
             )
             print(f"[Translator] ✅ {MODEL_ID} pipeline ready")
         except Exception as e:
+            logger.warning(f"[Translator] Pipeline init failed ({e}), trying manual load")
             self._init_nllb_manual()
     def _init_nllb_manual(self):
             from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
             import torch
             self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+            self._model     = AutoModelForSeq2SeqLM.from_pretrained(
                 MODEL_ID,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             )
             self._model.eval()
             print(f"[Translator] ✅ {MODEL_ID} manual load ready")
         except Exception as e:
+            logger.error(f"[Translator] NLLB manual load also failed: {e}")
+            # Both init paths exhausted — _pipeline and _model remain None.
+            # translate() will detect this and route directly to Google.
+    # ══════════════════════════════════════════════════════════════════
+    # HELPERS
+    # ══════════════════════════════════════════════════════════════════
+    @staticmethod
+    def _safe_truncate(text: str, max_chars: int) -> str:
+        """
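+        BUG-9 FIX: Truncate text at the last sentence boundary ('.', '!' or
+        '?') within max_chars, avoiding mid-sentence cuts. Text already within
+        max_chars is returned unchanged. Falls back to hard truncation plus
+        "..." when no such boundary lies past the midpoint of the limit.
+        NOTE(review): the Indic danda (।) used by the sentence splitters is
+        not treated as a boundary here — confirm whether it should be.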
+        """
+        if len(text) <= max_chars:
+            return text
+        window      = text[:max_chars]
+        last_period = max(window.rfind('.'), window.rfind('!'), window.rfind('?'))
+        if last_period > max_chars * 0.5:   # boundary found in reasonable range
+            return window[:last_period + 1]
+        return window + "..."