# Spaces: Sleeping  (appears to be page-status text captured with the file; not part of the module)
"""
Department 3 — Translator

Primary  : NLLB-200-distilled-1.3B (Meta) — free, runs locally
Fallback : Google Translate (deep-translator)

FIXES APPLIED:
- Added the Telugu/Indic sentence ending (।, the danda) to the sentence
  splitter regex.
- Reduced the chunk size to 50 words for Indic languages (subword
  tokenization).
- Improved the summary: it now uses position scoring (first + last = most
  informative) instead of just picking the longest sentences (which picked
  run-ons).
"""
import logging
import re
import time
logger = logging.getLogger(__name__)

# Two-letter language code (as passed to Translator.translate) -> NLLB-200
# language tag. Languages missing from this table are not handled by NLLB.
NLLB_CODES = {
    "en": "eng_Latn",
    "te": "tel_Telu",
    "hi": "hin_Deva",
    "ta": "tam_Taml",
    "kn": "kan_Knda",
    "es": "spa_Latn",
    "fr": "fra_Latn",
    "de": "deu_Latn",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
}

# Source languages that get the smaller chunk budget. The original note says
# subword tokenization lets fewer of their words fit in MAX_TOKENS. Despite
# the name, the set also contains Arabic ("ar"), which is not Indic.
INDIC_LANGS = {"te", "hi", "ta", "kn", "ar"}

CHUNK_WORDS = 80        # words per chunk, default (Latin-script languages)
CHUNK_WORDS_INDIC = 50  # words per chunk for the languages in INDIC_LANGS

MODEL_ID = "facebook/nllb-200-distilled-1.3B"
MAX_TOKENS = 512        # token cap used for both tokenization and generation
class Translator:
    """Translate and summarize text with NLLB-200, falling back to Google.

    The NLLB model is loaded lazily on the first translation that can use it.
    If loading or inference fails, or the language pair is not listed in
    ``NLLB_CODES``, translation goes through deep-translator's
    ``GoogleTranslator`` instead.
    """

    # Sentence boundary: whitespace that follows ".", "!", "?" or the danda
    # (U+0964). The danda is written as an escape so the pattern survives
    # encoding round-trips of this file.
    _SENTENCE_SPLIT = re.compile(r"(?<=[.!?\u0964])\s+")

    # Codes that need renaming for the Google backend.
    # NOTE(review): deep-translator appears to list Chinese only as
    # "zh-CN"/"zh-TW"; "zh-CN" is accepted either way -- confirm.
    _GOOGLE_CODES = {"zh": "zh-CN"}

    # Character budget of the emergency summary returned when scoring fails.
    _FALLBACK_SUMMARY_CHARS = 800

    def __init__(self):
        self._pipeline = None      # transformers translation pipeline, if loaded
        self._tokenizer = None     # set only by the manual-load path
        self._model = None         # set only by the manual-load path
        self._nllb_loaded = False  # True once a load was attempted (even if it failed)
        print("[Translator] Ready (NLLB loads on first use)")

    # ------------------------------------------------------------------
    # PUBLIC - TRANSLATE
    # ------------------------------------------------------------------
    def translate(self, text: str, src_lang: str, tgt_lang: str):
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Args:
            text: Text to translate.
            src_lang: Source language code (a key of ``NLLB_CODES``, or any
                code the Google backend understands).
            tgt_lang: Target language code.

        Returns:
            A ``(translated_text, method_label)`` tuple. Empty input and
            identical source/target languages are short-circuited with a
            ``"skipped (...)"`` label.
        """
        if not text or not text.strip():
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"

        # NLLB only knows the languages in NLLB_CODES. Anything else goes
        # straight to Google rather than being silently mapped to a default.
        use_nllb = src_lang in NLLB_CODES and tgt_lang in NLLB_CODES
        if use_nllb and not self._nllb_loaded:
            self._init_nllb()
            # Mark as attempted even on failure so the load is not retried
            # on every call.
            self._nllb_loaded = True

        max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
        chunks = self._chunk(text, max_words)
        print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")

        if use_nllb and (self._pipeline is not None or self._model is not None):
            try:
                return self._nllb_chunks(chunks, src_lang, tgt_lang)
            except Exception as e:
                logger.warning("NLLB failed (%s), using Google", e)
        return self._google_chunks(chunks, src_lang, tgt_lang)

    # ------------------------------------------------------------------
    # PUBLIC - SUMMARIZE
    # ------------------------------------------------------------------
    def summarize(self, text: str, max_sentences: int = 5) -> str:
        """Return an extractive summary of at most ``max_sentences`` sentences.

        Sentences of five words or fewer are ignored. The remaining ones are
        scored by position (first, last, early, middle) plus a bonus for
        medium length, and the best-scoring sentences are returned in their
        original order.

        Args:
            text: Text to summarize.
            max_sentences: Maximum number of sentences to keep; values below
                zero are treated as zero.

        Returns:
            ``text`` unchanged when it has no more than ``max_sentences``
            scorable sentences, otherwise the selected sentences joined by
            single spaces. ``None`` yields ``""``. If scoring fails, the
            first 800 characters of the text are returned, with ``"..."``
            appended only when something was cut off.
        """
        if text is None:
            return ""
        try:
            sentences = [
                part.strip()
                for part in self._SENTENCE_SPLIT.split(text.strip())
                if len(part.split()) > 5
            ]
            if len(sentences) <= max_sentences:
                return text

            total = len(sentences)
            scores = [
                self._sentence_score(idx, sent, total)
                for idx, sent in enumerate(sentences)
            ]
            # sorted() is stable, so equally scored sentences keep text order.
            ranked = sorted(range(total), key=scores.__getitem__, reverse=True)
            keep = sorted(ranked[:max(0, max_sentences)])
            return " ".join(sentences[i] for i in keep).strip()
        except Exception as e:
            logger.warning("Summarize failed: %s", e)
            fallback = str(text)
            limit = self._FALLBACK_SUMMARY_CHARS
            if len(fallback) <= limit:
                return fallback
            return fallback[:limit] + "..."

    @staticmethod
    def _sentence_score(idx, sent, total):
        """Score one sentence: position weight plus a medium-length bonus."""
        if idx == 0:
            pos_score = 1.0   # first sentence
        elif idx == total - 1:
            pos_score = 0.7   # last sentence
        elif idx <= total * 0.2:
            pos_score = 0.6   # within the first fifth of the text
        else:
            pos_score = 0.3   # everything in between

        word_count = len(sent.split())
        if 10 <= word_count <= 30:
            len_bonus = 0.3   # medium length is preferred
        elif word_count < 10:
            len_bonus = 0.0
        else:
            len_bonus = 0.1   # long run-ons get only a small bonus
        return pos_score + len_bonus

    # ------------------------------------------------------------------
    # CHUNKING
    # ------------------------------------------------------------------
    def _chunk(self, text, max_words):
        """Split ``text`` into chunks of at most ``max_words`` words.

        Whole sentences are packed greedily into each chunk. A single
        sentence longer than ``max_words`` is cut into word windows so that
        no chunk exceeds the budget; otherwise unpunctuated text would form
        one oversized chunk. Whitespace inside such a sentence is normalized
        to single spaces.

        Args:
            text: Text to split; ``None`` or blank text yields ``[]``.
            max_words: Word budget per chunk; values below 1 are treated as 1.

        Returns:
            List of non-empty chunk strings in source order.
        """
        limit = max(1, int(max_words))
        chunks, current, count = [], [], 0

        for sentence in self._SENTENCE_SPLIT.split((text or "").strip()):
            words = sentence.split()
            if not words:
                continue

            if len(words) > limit:
                # Flush what we have, then emit full windows of the long
                # sentence; a short tail stays open for the next sentence.
                if current:
                    chunks.append(" ".join(current))
                    current, count = [], 0
                for start in range(0, len(words), limit):
                    window = words[start:start + limit]
                    if len(window) == limit:
                        chunks.append(" ".join(window))
                    else:
                        current, count = [" ".join(window)], len(window)
                continue

            if current and count + len(words) > limit:
                chunks.append(" ".join(current))
                current, count = [], 0
            current.append(sentence)
            count += len(words)

        if current:
            chunks.append(" ".join(current))
        return chunks

    # ------------------------------------------------------------------
    # NLLB TRANSLATION
    # ------------------------------------------------------------------
    def _nllb_chunks(self, chunks, src_lang, tgt_lang):
        """Translate ``chunks`` with the loaded NLLB pipeline or model.

        Args:
            chunks: Chunk strings; blank chunks are skipped.
            src_lang: Source language code; unknown codes fall back to
                ``"eng_Latn"`` here (``translate`` filters them out first).
            tgt_lang: Target language code; unknown codes fall back to
                ``"tel_Telu"`` here.

        Returns:
            ``(translated_text, method_label)``. A chunk that fails is passed
            through untranslated.

        Raises:
            RuntimeError: If no NLLB model is loaded, or if every chunk
                failed, so the caller can fall back to another backend
                instead of returning the source text labelled as translated.
        """
        t0 = time.time()
        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")

        use_pipeline = self._pipeline is not None
        if not use_pipeline:
            if self._tokenizer is None or self._model is None:
                raise RuntimeError("NLLB model is not loaded")
            import torch
            # Without this the tokenizer keeps its default source language
            # and tags every input with it, whatever src_lang was requested.
            self._tokenizer.src_lang = src_code
            tgt_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
            on_gpu = torch.cuda.is_available()

        results = []
        attempted = failed = 0
        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            attempted += 1
            try:
                if use_pipeline:
                    out = self._pipeline(
                        chunk,
                        src_lang=src_code,
                        tgt_lang=tgt_code,
                        max_length=MAX_TOKENS,
                    )
                    results.append(out[0]["translation_text"])
                else:
                    inputs = self._tokenizer(
                        chunk,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=MAX_TOKENS,
                    )
                    if on_gpu:
                        inputs = {k: v.cuda() for k, v in inputs.items()}
                    with torch.no_grad():
                        ids = self._model.generate(
                            **inputs,
                            forced_bos_token_id=tgt_id,
                            max_length=MAX_TOKENS,
                            num_beams=4,
                            early_stopping=True,
                        )
                    results.append(
                        self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
                    )
            except Exception as e:
                failed += 1
                logger.warning("Chunk %d NLLB failed: %s", i + 1, e)
                results.append(chunk)  # best effort: keep the source text

        if attempted and failed == attempted:
            raise RuntimeError(f"NLLB failed on all {attempted} chunks")

        translated = " ".join(results)
        logger.info("NLLB done in %.2fs", time.time() - t0)
        return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"

    # ------------------------------------------------------------------
    # GOOGLE FALLBACK
    # ------------------------------------------------------------------
    def _google_chunks(self, chunks, src_lang, tgt_lang):
        """Translate ``chunks`` with deep-translator's ``GoogleTranslator``.

        Args:
            chunks: Chunk strings; blank chunks are skipped.
            src_lang: Source language code (``"auto"`` is passed through).
            tgt_lang: Target language code.

        Returns:
            ``(translated_text, method_label)``. On any failure the text is
            ``"[Translation failed: ...]"`` and the label is ``"error"``.
        """
        t0 = time.time()
        try:
            from deep_translator import GoogleTranslator

            translator = GoogleTranslator(
                source=self._GOOGLE_CODES.get(src_lang, src_lang),
                target=self._GOOGLE_CODES.get(tgt_lang, tgt_lang),
            )
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                out = translator.translate(chunk)
                # Keep the source chunk if the backend hands back a
                # non-string, so the join below cannot fail.
                results.append(out if isinstance(out, str) else chunk)
            full = " ".join(results)
            logger.info("Google done in %.2fs", time.time() - t0)
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error("Google failed: %s", e)
            return f"[Translation failed: {e}]", "error"

    # ------------------------------------------------------------------
    # NLLB INIT
    # ------------------------------------------------------------------
    def _init_nllb(self):
        """Load NLLB as a transformers pipeline, else try the manual load."""
        try:
            from transformers import pipeline as hf_pipeline

            self._pipeline = hf_pipeline(
                "translation",
                model=MODEL_ID,
                device_map="auto",
                max_length=MAX_TOKENS,
            )
            # NOTE(review): the "β" below looks like a mis-encoded status
            # glyph; left byte-identical because the original is unknown.
            print(f"[Translator] β {MODEL_ID} pipeline ready")
        except Exception as e:
            logger.warning("Pipeline init failed (%s), trying manual load", e)
            self._init_nllb_manual()

    def _init_nllb_manual(self):
        """Load the NLLB tokenizer and model directly (no pipeline).

        Uses float16 on CUDA and float32 otherwise. On failure both
        ``_tokenizer`` and ``_model`` are left untouched, so a partial load
        is never visible to ``translate``.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch

            has_cuda = torch.cuda.is_available()
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if has_cuda else torch.float32,
            )
            if has_cuda:
                model = model.cuda()
            model.eval()

            # Publish only after every step succeeded.
            self._tokenizer = tokenizer
            self._model = model
            # NOTE(review): "β" is a mis-encoded glyph, see _init_nllb.
            print(f"[Translator] β {MODEL_ID} manual load ready")
        except Exception as e:
            logger.error("NLLB manual load failed: %s", e)