Clearwave48 committed
Commit 8fa57b8 · verified · 1 Parent(s): b3b2c31

Create translator.py

Files changed (1): translator.py (+249, -0)

translator.py ADDED
@@ -0,0 +1,249 @@
"""
Department 3 — Translator
Primary  : NLLB-200-distilled-1.3B (Meta) — free, local
Fallback : Google Translate (deep-translator)

FIXES APPLIED:
- Added the Devanagari/Telugu sentence ending (।) to the sentence-splitter regex
- Reduced chunk size to 50 words for Indic languages (subword tokenization)
- Improved summary: uses position scoring (first + last = most informative)
  instead of just picking the longest sentences (which picked run-ons)
"""

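# Illustration with a hypothetical input: the fixed splitter handles Latin
# terminators and the danda in one pass, e.g.
#   re.split(r'(?<=[.!?।])\s+', "నమస్కారం। How are you? Fine.")
#   -> ['నమస్కారం।', 'How are you?', 'Fine.']
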
import re
import time
import logging

logger = logging.getLogger(__name__)

NLLB_CODES = {
    "en": "eng_Latn", "te": "tel_Telu", "hi": "hin_Deva",
    "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn",
    "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan",
    "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn",
    "ru": "rus_Cyrl",
}

# FIX: these languages use heavy subword tokenization — fewer words fit in
# 512 tokens (Arabic is grouped here for the same token-budget reason)
INDIC_LANGS = {"te", "hi", "ta", "kn", "ar"}
CHUNK_WORDS = 80        # default for Latin-script languages
CHUNK_WORDS_INDIC = 50  # reduced for Indic/RTL languages

MODEL_ID = "facebook/nllb-200-distilled-1.3B"
MAX_TOKENS = 512


class Translator:
    def __init__(self):
        self._pipeline = None
        self._tokenizer = None
        self._model = None
        self._nllb_loaded = False
        print("[Translator] Ready (NLLB loads on first use)")

    # ══════════════════════════════════════════════════════════════════
    # PUBLIC — TRANSLATE
    # ══════════════════════════════════════════════════════════════════
    def translate(self, text: str, src_lang: str, tgt_lang: str):
        if not text or not text.strip():
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"

        if not self._nllb_loaded:
            self._init_nllb()
            self._nllb_loaded = True

        # FIX: use smaller chunks for Indic languages
        max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
        chunks = self._chunk(text, max_words)
        print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")

        if self._pipeline is not None or self._model is not None:
            try:
                return self._nllb_chunks(chunks, src_lang, tgt_lang)
            except Exception as e:
                logger.warning(f"NLLB failed ({e}), using Google")

        return self._google_chunks(chunks, src_lang, tgt_lang)

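    # Usage sketch (illustrative; all names are from this file): translate()
    # always returns a (text, status) pair, so callers can surface which
    # backend produced the output:
    #   text, status = Translator().translate("Hello", "en", "te")
    #   # status is e.g. "NLLB-200-1.3B (1 chunks)" or "Google Translate (1 chunks)"
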
    # ══════════════════════════════════════════════════════════════════
    # PUBLIC — SUMMARIZE — FIXED
    # ══════════════════════════════════════════════════════════════════
    def summarize(self, text: str, max_sentences: int = 5) -> str:
        """
        FIX: improved extractive summary using position scoring.

        Old approach: picked the longest sentences → grabbed run-ons / filler.
        New approach: scores by position (first & last = high value) plus
        a length bonus (medium-length sentences preferred).

        Research basis: TextRank & lead-3 heuristics consistently show
        that sentence position is a stronger signal than length alone.
        """
        try:
            # FIX: include the Devanagari/Telugu sentence ending (।) in the splitter
            sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
            sentences = [s.strip() for s in sentences if len(s.split()) > 5]

            if len(sentences) <= max_sentences:
                return text

            n = len(sentences)

            # Score each sentence: position + length bonus
            def score(idx, sent):
                pos_score = 0.0
                if idx == 0:
                    pos_score = 1.0  # first sentence = highest value
                elif idx == n - 1:
                    pos_score = 0.7  # last sentence = conclusion
                elif idx <= n * 0.2:
                    pos_score = 0.6  # early sentences
                else:
                    pos_score = 0.3  # middle sentences

                # Prefer medium-length sentences (not too short, not run-ons)
                word_count = len(sent.split())
                if 10 <= word_count <= 30:
                    len_bonus = 0.3
                elif word_count < 10:
                    len_bonus = 0.0
                else:
                    len_bonus = 0.1  # penalize very long run-ons

                return pos_score + len_bonus

            scored = sorted(
                enumerate(sentences),
                key=lambda x: score(x[0], x[1]),
                reverse=True,
            )
            top_indices = sorted([i for i, _ in scored[:max_sentences]])
            summary = " ".join(sentences[i] for i in top_indices)
            return summary.strip()

        except Exception as e:
            logger.warning(f"Summarize failed: {e}")
            return text[:800] + "..."

    # ══════════════════════════════════════════════════════════════════
    # CHUNKING — FIXED (Telugu sentence ending added)
    # ══════════════════════════════════════════════════════════════════
    def _chunk(self, text, max_words):
        # FIX: added । (Devanagari/Telugu danda) to the sentence-split pattern
        sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
        chunks, cur, count = [], [], 0
        for s in sentences:
            w = len(s.split())
            if count + w > max_words and cur:
                chunks.append(" ".join(cur))
                cur, count = [], 0
            cur.append(s)
            count += w
        if cur:
            chunks.append(" ".join(cur))
        return chunks

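    # Illustration with a hypothetical input: whole sentences are packed
    # until the word budget would overflow, so
    #   self._chunk("One two three. Four five. Six seven eight.", max_words=5)
    # yields ["One two three. Four five.", "Six seven eight."]
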
    # ══════════════════════════════════════════════════════════════════
    # NLLB TRANSLATION
    # ══════════════════════════════════════════════════════════════════
    def _nllb_chunks(self, chunks, src_lang, tgt_lang):
        t0 = time.time()
        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
        results = []

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            try:
                if self._pipeline is not None:
                    out = self._pipeline(
                        chunk,
                        src_lang=src_code,
                        tgt_lang=tgt_code,
                        max_length=MAX_TOKENS,
                    )
                    results.append(out[0]["translation_text"])
                else:
                    import torch
                    # FIX: tell the NLLB tokenizer the source language so it
                    # prepends the right language token (it defaults to eng_Latn)
                    self._tokenizer.src_lang = src_code
                    inputs = self._tokenizer(
                        chunk, return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=MAX_TOKENS,
                    )
                    if torch.cuda.is_available():
                        inputs = {k: v.cuda() for k, v in inputs.items()}
                    tid = self._tokenizer.convert_tokens_to_ids(tgt_code)
                    with torch.no_grad():
                        ids = self._model.generate(
                            **inputs,
                            forced_bos_token_id=tid,
                            max_length=MAX_TOKENS,
                            num_beams=4,
                            early_stopping=True,
                        )
                    results.append(
                        self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0])
            except Exception as e:
                logger.warning(f"Chunk {i+1} NLLB failed: {e}")
                results.append(chunk)  # keep the untranslated chunk rather than drop it

        translated = " ".join(results)
        logger.info(f"NLLB done in {time.time()-t0:.2f}s")
        return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"

    # ══════════════════════════════════════════════════════════════════
    # GOOGLE FALLBACK
    # ══════════════════════════════════════════════════════════════════
    def _google_chunks(self, chunks, src_lang, tgt_lang):
        t0 = time.time()
        try:
            from deep_translator import GoogleTranslator
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                out = GoogleTranslator(
                    source=src_lang,  # "auto" passes through unchanged
                    target=tgt_lang,
                ).translate(chunk)
                results.append(out)
            full = " ".join(results)
            logger.info(f"Google done in {time.time()-t0:.2f}s")
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error(f"Google failed: {e}")
            return f"[Translation failed: {e}]", "error"

    # ══════════════════════════════════════════════════════════════════
    # NLLB INIT
    # ══════════════════════════════════════════════════════════════════
    def _init_nllb(self):
        try:
            from transformers import pipeline as hf_pipeline
            self._pipeline = hf_pipeline(
                "translation", model=MODEL_ID,
                device_map="auto", max_length=MAX_TOKENS,
            )
            print(f"[Translator] ✅ {MODEL_ID} pipeline ready")
        except Exception as e:
            logger.warning(f"Pipeline init failed ({e}), trying manual load")
            self._init_nllb_manual()

    def _init_nllb_manual(self):
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch
            self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            if torch.cuda.is_available():
                self._model = self._model.cuda()
            self._model.eval()
            print(f"[Translator] ✅ {MODEL_ID} manual load ready")
        except Exception as e:
            logger.error(f"NLLB manual load failed: {e}")
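

# Minimal smoke test (illustrative sketch, not part of the committed API).
# It assumes either `transformers` with enough memory for the 1.3B model,
# or `deep-translator` plus network access; if neither is available,
# translate() returns an "[Translation failed: ...]" string instead.
if __name__ == "__main__":
    tr = Translator()

    text, status = tr.translate("Hello, how are you today?", "en", "te")
    print(f"[{status}] {text}")

    sample = (
        "It reviewed the annual budget in considerable detail. "
        "Several members raised concerns about rising costs this year. "
        "A revised proposal will be drafted before the next session. "
        "The final vote is scheduled for the end of the month. "
        "Attendance at the meeting was higher than in previous years. "
        "Minutes will be circulated to all departments next week."
    )
    print(tr.summarize(sample, max_sentences=2))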