File size: 11,600 Bytes
9716505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
"""
Department 3 — Translator
Primary  : NLLB-200-distilled-1.3B (Meta) — free local
Fallback : Google Translate (deep-translator)

FIXES APPLIED:
  - Added Telugu/Indic sentence ending (।) to sentence splitter regex
  - Reduced chunk size to 50 words for Indic languages (subword tokenization)
  - Improved summary: uses position scoring (first + last = most informative)
    instead of just picking longest sentences (which picked run-ons)
"""

import re
import time
import logging

logger = logging.getLogger(__name__)

# Hugging Face checkpoint loaded for local translation, and the token budget
# that is passed as ``max_length`` wherever the model is invoked.
MODEL_ID = "facebook/nllb-200-distilled-1.3B"
MAX_TOKENS = 512

# Short language code -> NLLB-200 language tag.
NLLB_CODES = {
    "en": "eng_Latn",
    "te": "tel_Telu",
    "hi": "hin_Deva",
    "ta": "tam_Taml",
    "kn": "kan_Knda",
    "es": "spa_Latn",
    "fr": "fra_Latn",
    "de": "deu_Latn",
    "ja": "jpn_Jpan",
    "zh": "zho_Hans",
    "ar": "arb_Arab",
    "pt": "por_Latn",
    "ru": "rus_Cyrl",
}

# Source languages that are chunked with the smaller word budget. The original
# author's rationale: these scripts tokenize into more subword pieces, so fewer
# words fit into MAX_TOKENS. NOTE(review): "ar" (Arabic) is not an Indic
# language; it is grouped here only to get the same reduced chunk size.
INDIC_LANGS = {"te", "hi", "ta", "kn", "ar"}
CHUNK_WORDS = 80        # words per chunk for every other source language
CHUNK_WORDS_INDIC = 50  # words per chunk for languages in INDIC_LANGS


class Translator:
    def __init__(self):
        self._pipeline    = None
        self._tokenizer   = None
        self._model       = None
        self._nllb_loaded = False
        print("[Translator] Ready (NLLB loads on first use)")

    # ══════════════════════════════════════════════════════════════════
    # PUBLIC — TRANSLATE
    # ══════════════════════════════════════════════════════════════════
    def translate(self, text: str, src_lang: str, tgt_lang: str):
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Args:
            text: Text to translate.
            src_lang: Source language code.
            tgt_lang: Target language code.

        Returns:
            A ``(translated_text, label)`` tuple. Blank input yields
            ``("", "skipped (empty)")`` and identical source/target codes
            yield ``(text, "skipped (same language)")``. Otherwise the tuple
            comes from the NLLB path or, when no NLLB model is available or
            the NLLB path raises, from the Google path.
        """
        # Guard clauses: nothing to translate, or nothing to change.
        if not (text and text.strip()):
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"

        # One-time lazy load. The flag is set whether or not loading worked,
        # so a failed load is not attempted again on later calls.
        if not self._nllb_loaded:
            self._init_nllb()
            self._nllb_loaded = True

        # Sources listed in INDIC_LANGS get the smaller word budget.
        if src_lang in INDIC_LANGS:
            max_words = CHUNK_WORDS_INDIC
        else:
            max_words = CHUNK_WORDS
        chunks = self._chunk(text, max_words)
        print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")

        nllb_missing = self._pipeline is None and self._model is None
        if not nllb_missing:
            try:
                return self._nllb_chunks(chunks, src_lang, tgt_lang)
            except Exception as e:
                # Any NLLB error falls through to the Google path below.
                logger.warning(f"NLLB failed ({e}), using Google")

        return self._google_chunks(chunks, src_lang, tgt_lang)

    # ══════════════════════════════════════════════════════════════════
    # PUBLIC — SUMMARIZE — FIXED
    # ══════════════════════════════════════════════════════════════════
    def summarize(self, text: str, max_sentences: int = 5) -> str:
        """Return an extractive summary of at most ``max_sentences`` sentences.

        The text is split after ``.``, ``!``, ``?`` or a danda / double danda
        (U+0964 / U+0965) followed by whitespace. Sentences of five words or
        fewer are ignored. Each remaining sentence is scored by position
        (first 1.0, last 0.7, first 20% 0.6, otherwise 0.3) plus a length
        bonus (0.3 for 10-30 words, 0.1 for longer, 0.0 for shorter). The
        top-scoring sentences are returned in their original order, joined by
        single spaces.

        Args:
            text: Text to summarize. ``None`` or ``""`` yields ``""``.
            max_sentences: Maximum number of sentences to keep.

        Returns:
            The summary, or ``text`` unchanged when it has no more than
            ``max_sentences`` usable sentences. If scoring raises, the first
            800 characters of ``text`` followed by ``"..."``.
        """
        # Without this guard a None input would raise again inside the
        # except-branch fallback, which slices ``text``.
        if not text:
            return ""

        try:
            # The danda is written as an escape so the pattern cannot be
            # corrupted by a file re-encoding (the previous literal had
            # degraded into unrelated characters, including the euro sign).
            sentences = [
                s.strip()
                for s in re.split(r'(?<=[.!?\u0964\u0965])\s+', text.strip())
                if len(s.split()) > 5
            ]

            if len(sentences) <= max_sentences:
                return text

            n = len(sentences)

            def score(idx, sent):
                """Position score plus length bonus for one sentence."""
                if idx == 0:
                    pos_score = 1.0    # opening sentence
                elif idx == n - 1:
                    pos_score = 0.7    # closing sentence
                elif idx <= n * 0.2:
                    pos_score = 0.6    # early sentences
                else:
                    pos_score = 0.3    # everything in the middle

                word_count = len(sent.split())
                if 10 <= word_count <= 30:
                    len_bonus = 0.3    # medium length is preferred
                elif word_count < 10:
                    len_bonus = 0.0
                else:
                    len_bonus = 0.1    # very long sentences get a smaller bonus

                return pos_score + len_bonus

            # sorted() is stable, so sentences with equal scores keep their
            # document order and earlier ones win ties.
            ranked = sorted(
                range(n),
                key=lambda i: score(i, sentences[i]),
                reverse=True,
            )
            keep = sorted(ranked[:max_sentences])
            return " ".join(sentences[i] for i in keep).strip()

        except Exception as e:
            logger.warning(f"Summarize failed: {e}")
            return text[:800] + "..."

    # ══════════════════════════════════════════════════════════════════
    # CHUNKING — FIXED (Telugu sentence ending added)
    # ══════════════════════════════════════════════════════════════════
    def _chunk(self, text, max_words):
        # FIX: Added ΰ₯€ (Devanagari/Telugu danda) to sentence split pattern
        sentences = re.split(r'(?<=[.!?ΰ₯€])\s+', text.strip())
        chunks, cur, count = [], [], 0
        for s in sentences:
            w = len(s.split())
            if count + w > max_words and cur:
                chunks.append(" ".join(cur))
                cur, count = [], 0
            cur.append(s)
            count += w
        if cur:
            chunks.append(" ".join(cur))
        return chunks

    # ══════════════════════════════════════════════════════════════════
    # NLLB TRANSLATION
    # ══════════════════════════════════════════════════════════════════
    def _nllb_chunks(self, chunks, src_lang, tgt_lang):
        """Translate ``chunks`` with the loaded NLLB model.

        Uses ``self._pipeline`` when it is set, otherwise the manually loaded
        ``self._tokenizer`` / ``self._model`` pair. A chunk whose translation
        raises is logged and passed through untranslated.

        Args:
            chunks: Text chunks to translate; blank chunks are skipped.
            src_lang: Source language code; must be a key of ``NLLB_CODES``.
            tgt_lang: Target language code; must be a key of ``NLLB_CODES``.

        Returns:
            ``(translated_text, label)`` where the translated chunks are
            joined by single spaces.

        Raises:
            ValueError: If either language has no entry in ``NLLB_CODES``.
                Previously an unknown target silently became Telugu and an
                unknown source silently became English.
            RuntimeError: If every non-blank chunk failed, so the caller can
                fall back instead of receiving the source text labelled as a
                translation.
        """
        unsupported = [
            lang for lang in (src_lang, tgt_lang) if lang not in NLLB_CODES
        ]
        if unsupported:
            raise ValueError(
                "No NLLB language code configured for: "
                + ", ".join(str(lang) for lang in unsupported)
            )

        t0 = time.time()
        src_code = NLLB_CODES[src_lang]
        tgt_code = NLLB_CODES[tgt_lang]
        results = []
        attempted = 0
        failed = 0

        use_pipeline = self._pipeline is not None
        if not use_pipeline:
            # Per-call setup hoisted out of the chunk loop.
            import torch
            # The tokenizer must be told the source language; the original
            # manual path computed src_code but never applied it.
            self._tokenizer.src_lang = src_code
            tid = self._tokenizer.convert_tokens_to_ids(tgt_code)
            on_gpu = torch.cuda.is_available()

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            attempted += 1
            try:
                if use_pipeline:
                    out = self._pipeline(
                        chunk,
                        src_lang=src_code,
                        tgt_lang=tgt_code,
                        max_length=MAX_TOKENS,
                    )
                    results.append(out[0]["translation_text"])
                else:
                    inputs = self._tokenizer(
                        chunk, return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=MAX_TOKENS,
                    )
                    if on_gpu:
                        inputs = {k: v.cuda() for k, v in inputs.items()}
                    with torch.no_grad():
                        ids = self._model.generate(
                            **inputs,
                            forced_bos_token_id=tid,
                            max_length=MAX_TOKENS,
                            num_beams=4,
                            early_stopping=True,
                        )
                    results.append(
                        self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0])
            except Exception as e:
                # Best effort: keep the untranslated chunk so the output
                # stays complete, and remember that it failed.
                failed += 1
                logger.warning(f"Chunk {i+1} NLLB failed: {e}")
                results.append(chunk)

        if attempted and failed == attempted:
            raise RuntimeError(f"NLLB failed on all {attempted} chunks")

        translated = " ".join(results)
        logger.info(f"NLLB done in {time.time()-t0:.2f}s")
        return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"

    # ══════════════════════════════════════════════════════════════════
    # GOOGLE FALLBACK
    # ══════════════════════════════════════════════════════════════════
    def _google_chunks(self, chunks, src_lang, tgt_lang):
        """Translate ``chunks`` with Google Translate via deep-translator.

        Args:
            chunks: Text chunks to translate; blank chunks are skipped.
            src_lang: Source language code (``"auto"`` is passed through).
            tgt_lang: Target language code.

        Returns:
            ``(translated_text, label)``. On any error the text is
            ``"[Translation failed: <error>]"`` and the label is ``"error"``;
            this method does not raise.
        """
        t0 = time.time()
        # NOTE(review): deep-translator's Google backend lists Chinese as
        # "zh-CN" / "zh-TW" rather than the bare "zh" used by this module;
        # confirm against the installed version. Simplified matches the
        # "zho_Hans" tag this module uses for NLLB.
        google_code = {"zh": "zh-CN"}
        try:
            from deep_translator import GoogleTranslator
            # One translator is reused for all chunks instead of building a
            # new one per chunk.
            translator = GoogleTranslator(
                source=google_code.get(src_lang, src_lang),
                target=google_code.get(tgt_lang, tgt_lang),
            )
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                out = translator.translate(chunk)
                # Keep the source chunk if the service returns nothing, so
                # the join below cannot fail on None.
                results.append(chunk if out is None else out)
            full = " ".join(results)
            logger.info(f"Google done in {time.time()-t0:.2f}s")
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error(f"Google failed: {e}")
            return f"[Translation failed: {e}]", "error"

    # ══════════════════════════════════════════════════════════════════
    # NLLB INIT
    # ══════════════════════════════════════════════════════════════════
    def _init_nllb(self):
        """Load the NLLB translation pipeline, falling back to a manual load.

        On success ``self._pipeline`` holds a Hugging Face ``translation``
        pipeline for ``MODEL_ID``. If building the pipeline raises, the error
        is logged and ``_init_nllb_manual`` is tried instead. This method
        does not raise for load failures.
        """
        try:
            from transformers import pipeline as hf_pipeline
            self._pipeline = hf_pipeline(
                "translation", model=MODEL_ID,
                device_map="auto", max_length=MAX_TOKENS,
            )
        except Exception as e:
            logger.warning(f"Pipeline init failed ({e}), trying manual load")
            self._init_nllb_manual()
            return

        # Reported outside the try: a console that cannot encode the check
        # mark must not be mistaken for a failed load, which would trigger a
        # second, redundant model load.
        message = f"[Translator] \u2705 {MODEL_ID} pipeline ready"
        try:
            print(message)
        except UnicodeEncodeError:
            logger.info(message)

    def _init_nllb_manual(self):
        """Load the NLLB tokenizer and model directly, without a pipeline.

        The model is loaded in float16 and moved to the GPU when CUDA is
        available, otherwise in float32 on the CPU, and put in eval mode.
        ``self._tokenizer`` and ``self._model`` are assigned only after every
        step succeeded. On failure the error is logged and both attributes
        are left unchanged. This method does not raise for load failures.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch
            use_cuda = torch.cuda.is_available()
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if use_cuda else torch.float32,
            )
            if use_cuda:
                model = model.cuda()
            model.eval()
        except Exception as e:
            logger.error(f"NLLB manual load failed: {e}")
            return

        # Publish both objects together. Assigning them step by step left a
        # half-initialised model on self when a later step (for example the
        # move to the GPU) failed, and translate() would then try to use it.
        self._tokenizer, self._model = tokenizer, model

        message = f"[Translator] \u2705 {MODEL_ID} manual load ready"
        try:
            print(message)
        except UnicodeEncodeError:
            # A console that cannot encode the check mark is not a load error.
            logger.info(message)