Clearwave48 committed
Commit 8fa57b8 · verified · 1 Parent(s): b3b2c31

Create translator.py

Files changed (1): translator.py (+249, -0)

translator.py ADDED
@@ -0,0 +1,249 @@
"""
Department 3 — Translator
Primary  : NLLB-200-distilled-1.3B (Meta) — free, local
Fallback : Google Translate (deep-translator)

FIXES APPLIED:
- Added the Devanagari/Telugu sentence ending (।) to the sentence-splitter regex
- Reduced chunk size to 50 words for Indic languages (subword tokenization)
- Improved summary: uses position scoring (first + last = most informative)
  instead of just picking the longest sentences (which picked run-ons)
"""

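# Illustration with a hypothetical input: the fixed splitter handles Latin
# terminators and the danda in one pass, e.g.
#   re.split(r'(?<=[.!?।])\s+', "నమస్కారం। How are you? Fine.")
#   -> ['నమస్కారం।', 'How are you?', 'Fine.']
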
import re
import time
import logging

logger = logging.getLogger(__name__)

NLLB_CODES = {
    "en": "eng_Latn", "te": "tel_Telu", "hi": "hin_Deva",
    "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn",
    "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan",
    "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn",
    "ru": "rus_Cyrl",
}

# FIX: these languages use heavy subword tokenization — fewer words fit in
# 512 tokens (Arabic is grouped here for the same token-budget reason)
INDIC_LANGS = {"te", "hi", "ta", "kn", "ar"}
CHUNK_WORDS = 80        # default for Latin-script languages
CHUNK_WORDS_INDIC = 50  # reduced for Indic/RTL languages

MODEL_ID = "facebook/nllb-200-distilled-1.3B"
MAX_TOKENS = 512


class Translator:
    def __init__(self):
        self._pipeline = None
        self._tokenizer = None
        self._model = None
        self._nllb_loaded = False
        print("[Translator] Ready (NLLB loads on first use)")

    # ══════════════════════════════════════════════════════════════════
    # PUBLIC — TRANSLATE
    # ══════════════════════════════════════════════════════════════════
    def translate(self, text: str, src_lang: str, tgt_lang: str):
        if not text or not text.strip():
            return "", "skipped (empty)"
        if src_lang == tgt_lang:
            return text, "skipped (same language)"

        if not self._nllb_loaded:
            self._init_nllb()
            self._nllb_loaded = True

        # FIX: use smaller chunks for Indic languages
        max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
        chunks = self._chunk(text, max_words)
        print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")

        if self._pipeline is not None or self._model is not None:
            try:
                return self._nllb_chunks(chunks, src_lang, tgt_lang)
            except Exception as e:
                logger.warning(f"NLLB failed ({e}), using Google")

        return self._google_chunks(chunks, src_lang, tgt_lang)

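    # Usage sketch (illustrative; all names are from this file): translate()
    # always returns a (text, status) pair, so callers can surface which
    # backend produced the output:
    #   text, status = Translator().translate("Hello", "en", "te")
    #   # status is e.g. "NLLB-200-1.3B (1 chunks)" or "Google Translate (1 chunks)"
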
    # ══════════════════════════════════════════════════════════════════
    # PUBLIC — SUMMARIZE — FIXED
    # ══════════════════════════════════════════════════════════════════
    def summarize(self, text: str, max_sentences: int = 5) -> str:
        """
        FIX: improved extractive summary using position scoring.

        Old approach: picked the longest sentences → grabbed run-ons / filler.
        New approach: scores by position (first & last = high value) plus
        a length bonus (medium-length sentences preferred).

        Research basis: TextRank & lead-3 heuristics consistently show
        that sentence position is a stronger signal than length alone.
        """
        try:
            # FIX: include the Devanagari/Telugu sentence ending (।) in the splitter
            sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
            sentences = [s.strip() for s in sentences if len(s.split()) > 5]

            if len(sentences) <= max_sentences:
                return text

            n = len(sentences)

            # Score each sentence: position + length bonus
            def score(idx, sent):
                pos_score = 0.0
                if idx == 0:
                    pos_score = 1.0  # first sentence = highest value
                elif idx == n - 1:
                    pos_score = 0.7  # last sentence = conclusion
                elif idx <= n * 0.2:
                    pos_score = 0.6  # early sentences
                else:
                    pos_score = 0.3  # middle sentences

                # Prefer medium-length sentences (not too short, not run-ons)
                word_count = len(sent.split())
                if 10 <= word_count <= 30:
                    len_bonus = 0.3
                elif word_count < 10:
                    len_bonus = 0.0
                else:
                    len_bonus = 0.1  # penalize very long run-ons

                return pos_score + len_bonus

            scored = sorted(
                enumerate(sentences),
                key=lambda x: score(x[0], x[1]),
                reverse=True,
            )
            top_indices = sorted([i for i, _ in scored[:max_sentences]])
            summary = " ".join(sentences[i] for i in top_indices)
            return summary.strip()

        except Exception as e:
            logger.warning(f"Summarize failed: {e}")
            return text[:800] + "..."

    # ══════════════════════════════════════════════════════════════════
    # CHUNKING — FIXED (Telugu sentence ending added)
    # ══════════════════════════════════════════════════════════════════
    def _chunk(self, text, max_words):
        # FIX: added । (Devanagari/Telugu danda) to the sentence-split pattern
        sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
        chunks, cur, count = [], [], 0
        for s in sentences:
            w = len(s.split())
            if count + w > max_words and cur:
                chunks.append(" ".join(cur))
                cur, count = [], 0
            cur.append(s)
            count += w
        if cur:
            chunks.append(" ".join(cur))
        return chunks

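    # Illustration with a hypothetical input: whole sentences are packed
    # until the word budget would overflow, so
    #   self._chunk("One two three. Four five. Six seven eight.", max_words=5)
    # yields ["One two three. Four five.", "Six seven eight."]
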
    # ══════════════════════════════════════════════════════════════════
    # NLLB TRANSLATION
    # ══════════════════════════════════════════════════════════════════
    def _nllb_chunks(self, chunks, src_lang, tgt_lang):
        t0 = time.time()
        src_code = NLLB_CODES.get(src_lang, "eng_Latn")
        tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
        results = []

        for i, chunk in enumerate(chunks):
            if not chunk.strip():
                continue
            try:
                if self._pipeline is not None:
                    out = self._pipeline(
                        chunk,
                        src_lang=src_code,
                        tgt_lang=tgt_code,
                        max_length=MAX_TOKENS,
                    )
                    results.append(out[0]["translation_text"])
                else:
                    import torch
                    # FIX: tell the NLLB tokenizer the source language so it
                    # prepends the right language token (it defaults to eng_Latn)
                    self._tokenizer.src_lang = src_code
                    inputs = self._tokenizer(
                        chunk, return_tensors="pt",
                        padding=True, truncation=True,
                        max_length=MAX_TOKENS,
                    )
                    if torch.cuda.is_available():
                        inputs = {k: v.cuda() for k, v in inputs.items()}
                    tid = self._tokenizer.convert_tokens_to_ids(tgt_code)
                    with torch.no_grad():
                        ids = self._model.generate(
                            **inputs,
                            forced_bos_token_id=tid,
                            max_length=MAX_TOKENS,
                            num_beams=4,
                            early_stopping=True,
                        )
                    results.append(
                        self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0])
            except Exception as e:
                logger.warning(f"Chunk {i+1} NLLB failed: {e}")
                results.append(chunk)  # keep the untranslated chunk rather than drop it

        translated = " ".join(results)
        logger.info(f"NLLB done in {time.time()-t0:.2f}s")
        return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"

    # ══════════════════════════════════════════════════════════════════
    # GOOGLE FALLBACK
    # ══════════════════════════════════════════════════════════════════
    def _google_chunks(self, chunks, src_lang, tgt_lang):
        t0 = time.time()
        try:
            from deep_translator import GoogleTranslator
            results = []
            for chunk in chunks:
                if not chunk.strip():
                    continue
                out = GoogleTranslator(
                    source=src_lang,  # "auto" passes through unchanged
                    target=tgt_lang,
                ).translate(chunk)
                results.append(out)
            full = " ".join(results)
            logger.info(f"Google done in {time.time()-t0:.2f}s")
            return full, f"Google Translate ({len(chunks)} chunks)"
        except Exception as e:
            logger.error(f"Google failed: {e}")
            return f"[Translation failed: {e}]", "error"

    # ══════════════════════════════════════════════════════════════════
    # NLLB INIT
    # ══════════════════════════════════════════════════════════════════
    def _init_nllb(self):
        try:
            from transformers import pipeline as hf_pipeline
            self._pipeline = hf_pipeline(
                "translation", model=MODEL_ID,
                device_map="auto", max_length=MAX_TOKENS,
            )
            print(f"[Translator] ✅ {MODEL_ID} pipeline ready")
        except Exception as e:
            logger.warning(f"Pipeline init failed ({e}), trying manual load")
            self._init_nllb_manual()

    def _init_nllb_manual(self):
        try:
            from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
            import torch
            self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
            self._model = AutoModelForSeq2SeqLM.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            if torch.cuda.is_available():
                self._model = self._model.cuda()
            self._model.eval()
            print(f"[Translator] ✅ {MODEL_ID} manual load ready")
        except Exception as e:
            logger.error(f"NLLB manual load failed: {e}")
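

# Minimal smoke test (illustrative sketch, not part of the committed API).
# It assumes either `transformers` with enough memory for the 1.3B model,
# or `deep-translator` plus network access; if neither is available,
# translate() returns an "[Translation failed: ...]" string instead.
if __name__ == "__main__":
    tr = Translator()

    text, status = tr.translate("Hello, how are you today?", "en", "te")
    print(f"[{status}] {text}")

    sample = (
        "It reviewed the annual budget in considerable detail. "
        "Several members raised concerns about rising costs this year. "
        "A revised proposal will be drafted before the next session. "
        "The final vote is scheduled for the end of the month. "
        "Attendance at the meeting was higher than in previous years. "
        "Minutes will be circulated to all departments next week."
    )
    print(tr.summarize(sample, max_sentences=2))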