Clearwave48 committed on
Commit
47ab527
·
verified ·
1 Parent(s): 36576b6

Update translator.py

Browse files
Files changed (1) hide show
  1. translator.py +110 -47
translator.py CHANGED
@@ -1,13 +1,28 @@
1
  """
2
- Department 3 β€” Translator
 
3
  Primary : NLLB-200-distilled-1.3B (Meta) β€” free local
4
  Fallback : Google Translate (deep-translator)
5
 
6
- FIXES APPLIED:
7
  - Added Telugu/Indic sentence ending (ΰ₯€) to sentence splitter regex
8
  - Reduced chunk size to 50 words for Indic languages (subword tokenization)
9
  - Improved summary: uses position scoring (first + last = most informative)
10
  instead of just picking longest sentences (which picked run-ons)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
 
13
  import re
@@ -21,17 +36,23 @@ NLLB_CODES = {
21
  "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn",
22
  "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan",
23
  "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn",
24
- "ru": "rus_Cyrl",
 
 
 
25
  }
26
 
27
- # FIX: Indic languages use subword tokenization β€” fewer words fit in 512 tokens
28
- INDIC_LANGS = {"te", "hi", "ta", "kn", "ar"}
29
- CHUNK_WORDS = 80 # default for Latin-script languages
30
- CHUNK_WORDS_INDIC = 50 # reduced for Indic/RTL languages
31
 
32
  MODEL_ID = "facebook/nllb-200-distilled-1.3B"
33
  MAX_TOKENS = 512
34
 
 
 
 
35
 
36
  class Translator:
37
  def __init__(self):
@@ -45,6 +66,12 @@ class Translator:
45
  # PUBLIC β€” TRANSLATE
46
  # ══════════════════════════════════════════════════════════════════
47
  def translate(self, text: str, src_lang: str, tgt_lang: str):
 
 
 
 
 
 
48
  if not text or not text.strip():
49
  return "", "skipped (empty)"
50
  if src_lang == tgt_lang:
@@ -54,46 +81,62 @@ class Translator:
54
  self._init_nllb()
55
  self._nllb_loaded = True
56
 
57
- # FIX: Use smaller chunks for Indic languages
 
 
 
 
 
 
 
 
 
 
 
58
  max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
59
  chunks = self._chunk(text, max_words)
60
  print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")
61
 
62
- if self._pipeline is not None or self._model is not None:
63
- try:
64
- return self._nllb_chunks(chunks, src_lang, tgt_lang)
65
- except Exception as e:
66
- logger.warning(f"NLLB failed ({e}), using Google")
 
67
 
68
- return self._google_chunks(chunks, src_lang, tgt_lang)
 
 
 
 
69
 
70
  # ══════════════════════════════════════════════════════════════════
71
- # PUBLIC β€” SUMMARIZE β€” FIXED
72
  # ══════════════════════════════════════════════════════════════════
73
  def summarize(self, text: str, max_sentences: int = 5) -> str:
74
  """
75
- FIX: Improved extractive summary using position scoring.
76
 
77
- Old approach: picked longest sentences β†’ grabbed run-ons / filler.
78
- New approach: scores by position (first & last = high value) +
79
- length bonus (medium-length sentences preferred).
80
 
81
- Research basis: TextRank & lead-3 heuristics consistently show
82
- that sentence position is a stronger signal than length alone.
83
  """
84
  try:
85
- # FIX: Include Telugu sentence ending (ΰ₯€) in splitter
86
  sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
87
  sentences = [s.strip() for s in sentences if len(s.split()) > 5]
88
 
 
 
 
89
  if len(sentences) <= max_sentences:
90
  return text
91
 
92
  n = len(sentences)
93
 
94
- # Score each sentence: position + length bonus
95
  def score(idx, sent):
96
- pos_score = 0.0
97
  if idx == 0:
98
  pos_score = 1.0 # first sentence = highest value
99
  elif idx == n - 1:
@@ -103,35 +146,34 @@ class Translator:
103
  else:
104
  pos_score = 0.3 # middle sentences
105
 
106
- # Prefer medium-length sentences (not too short, not run-ons)
107
- word_count = len(sent.split())
108
  if 10 <= word_count <= 30:
109
- len_bonus = 0.3
110
  elif word_count < 10:
111
- len_bonus = 0.0
112
  else:
113
- len_bonus = 0.1 # penalize very long run-ons
114
 
115
  return pos_score + len_bonus
116
 
117
- scored = sorted(
118
- enumerate(sentences),
119
- key=lambda x: score(x[0], x[1]),
120
- reverse=True
121
- )
122
  top_indices = sorted([i for i, _ in scored[:max_sentences]])
123
  summary = " ".join(sentences[i] for i in top_indices)
124
  return summary.strip()
125
 
126
  except Exception as e:
127
- logger.warning(f"Summarize failed: {e}")
128
- return text[:800] + "..."
 
129
 
130
  # ══════════════════════════════════════════════════════════════════
131
- # CHUNKING β€” FIXED (Telugu sentence ending added)
132
  # ══════════════════════════════════════════════════════════════════
133
  def _chunk(self, text, max_words):
134
- # FIX: Added ΰ₯€ (Devanagari/Telugu danda) to sentence split pattern
 
 
 
135
  sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
136
  chunks, cur, count = [], [], 0
137
  for s in sentences:
@@ -185,13 +227,14 @@ class Translator:
185
  early_stopping=True,
186
  )
187
  results.append(
188
- self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0])
 
189
  except Exception as e:
190
- logger.warning(f"Chunk {i+1} NLLB failed: {e}")
191
- results.append(chunk)
192
 
193
  translated = " ".join(results)
194
- logger.info(f"NLLB done in {time.time()-t0:.2f}s")
195
  return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"
196
 
197
  # ══════════════════════════════════════════════════════════════════
@@ -211,10 +254,10 @@ class Translator:
211
  ).translate(chunk)
212
  results.append(out)
213
  full = " ".join(results)
214
- logger.info(f"Google done in {time.time()-t0:.2f}s")
215
  return full, f"Google Translate ({len(chunks)} chunks)"
216
  except Exception as e:
217
- logger.error(f"Google failed: {e}")
218
  return f"[Translation failed: {e}]", "error"
219
 
220
  # ══════════════════════════════════════════════════════════════════
@@ -229,7 +272,7 @@ class Translator:
229
  )
230
  print(f"[Translator] βœ… {MODEL_ID} pipeline ready")
231
  except Exception as e:
232
- logger.warning(f"Pipeline init failed ({e}), trying manual load")
233
  self._init_nllb_manual()
234
 
235
  def _init_nllb_manual(self):
@@ -237,7 +280,7 @@ class Translator:
237
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
238
  import torch
239
  self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
240
- self._model = AutoModelForSeq2SeqLM.from_pretrained(
241
  MODEL_ID,
242
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
243
  )
@@ -246,4 +289,24 @@ class Translator:
246
  self._model.eval()
247
  print(f"[Translator] βœ… {MODEL_ID} manual load ready")
248
  except Exception as e:
249
- logger.error(f"NLLB manual load failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ ClearWave β€” Translator
3
+ =======================
4
  Primary : NLLB-200-distilled-1.3B (Meta) β€” free local
5
  Fallback : Google Translate (deep-translator)
6
 
7
+ FIXES APPLIED (original):
8
  - Added Telugu/Indic sentence ending (ΰ₯€) to sentence splitter regex
9
  - Reduced chunk size to 50 words for Indic languages (subword tokenization)
10
  - Improved summary: uses position scoring (first + last = most informative)
11
  instead of just picking longest sentences (which picked run-ons)
12
+
13
+ BUGS FIXED (v2):
14
+ [BUG-5] NLLB silently skipped with no log when both _pipeline and _model
15
+ are None after failed init β†’ impossible to diagnose in production
16
+ β†’ Fix: explicit warning log before falling through to Google
17
+
18
+ [BUG-6] Unknown src_lang codes from transcriber (e.g. "be" for Bengali
19
+ due to _norm() fallback) silently defaulted to "eng_Latn" in
20
+ NLLB_CODES.get(), causing mistranslation with no warning
21
+ β†’ Fix: warn explicitly when src_lang or tgt_lang not in NLLB_CODES
22
+
23
+ [BUG-9] summarize() fallback truncated at hard char index 800, cutting
24
+ mid-sentence and producing incomplete output
25
+ β†’ Fix: truncate at last sentence boundary (last '.' before limit)
26
  """
27
 
28
  import re
 
36
  "ta": "tam_Taml", "kn": "kan_Knda", "es": "spa_Latn",
37
  "fr": "fra_Latn", "de": "deu_Latn", "ja": "jpn_Jpan",
38
  "zh": "zho_Hans", "ar": "arb_Arab", "pt": "por_Latn",
39
+ "ru": "rus_Cyrl", "it": "ita_Latn", "nl": "nld_Latn",
40
+ "pl": "pol_Latn", "sv": "swe_Latn", "tr": "tur_Latn",
41
+ "bn": "ben_Beng", "ur": "urd_Arab", "ko": "kor_Hang",
42
+ "vi": "vie_Latn", "ms": "zsm_Latn", "id": "ind_Latn",
43
  }
44
 
45
+ # Indic/RTL languages use subword tokenization β€” fewer words fit in 512 tokens
46
+ INDIC_LANGS = {"te", "hi", "ta", "kn", "ar", "bn", "ur"}
47
+ CHUNK_WORDS = 80 # default for Latin-script languages
48
+ CHUNK_WORDS_INDIC = 50 # reduced for Indic/RTL languages
49
 
50
  MODEL_ID = "facebook/nllb-200-distilled-1.3B"
51
  MAX_TOKENS = 512
52
 
53
+ # Hard char limit for summarize() fallback truncation
54
+ SUMMARY_FALLBACK_CHARS = 800
55
+
56
 
57
  class Translator:
58
  def __init__(self):
 
66
  # PUBLIC β€” TRANSLATE
67
  # ══════════════════════════════════════════════════════════════════
68
  def translate(self, text: str, src_lang: str, tgt_lang: str):
69
+ """
70
+ Returns (translated_text, method_label).
71
+
72
+ BUG-6 FIX: warns when src_lang or tgt_lang is not in NLLB_CODES so
73
+ mistranslation is visible in logs rather than silently defaulting.
74
+ """
75
  if not text or not text.strip():
76
  return "", "skipped (empty)"
77
  if src_lang == tgt_lang:
 
81
  self._init_nllb()
82
  self._nllb_loaded = True
83
 
84
+ # BUG-6 FIX: warn on unknown language codes before translation attempt
85
+ if src_lang not in NLLB_CODES:
86
+ logger.warning(
87
+ f"[Translator] src_lang '{src_lang}' not in NLLB_CODES β€” "
88
+ f"will default to eng_Latn. Add it to NLLB_CODES if incorrect."
89
+ )
90
+ if tgt_lang not in NLLB_CODES:
91
+ logger.warning(
92
+ f"[Translator] tgt_lang '{tgt_lang}' not in NLLB_CODES β€” "
93
+ f"will default to tel_Telu. Add it to NLLB_CODES if incorrect."
94
+ )
95
+
96
  max_words = CHUNK_WORDS_INDIC if src_lang in INDIC_LANGS else CHUNK_WORDS
97
  chunks = self._chunk(text, max_words)
98
  print(f"[Translator] {len(chunks)} chunks ({max_words} words each), {len(text)} chars")
99
 
100
+ # BUG-5 FIX: explicit log when NLLB is unavailable, not silent skip
101
+ if self._pipeline is None and self._model is None:
102
+ logger.warning(
103
+ "[Translator] NLLB not loaded (init failed) β€” using Google Translate directly"
104
+ )
105
+ return self._google_chunks(chunks, src_lang, tgt_lang)
106
 
107
+ try:
108
+ return self._nllb_chunks(chunks, src_lang, tgt_lang)
109
+ except Exception as e:
110
+ logger.warning(f"[Translator] NLLB failed ({e}) β€” falling back to Google Translate")
111
+ return self._google_chunks(chunks, src_lang, tgt_lang)
112
 
113
  # ══════════════════════════════════════════════════════════════════
114
+ # PUBLIC β€” SUMMARIZE
115
  # ══════════════════════════════════════════════════════════════════
116
  def summarize(self, text: str, max_sentences: int = 5) -> str:
117
  """
118
+ Extractive summary using position scoring.
119
 
120
+ Scores by position (first & last = high value) + length bonus
121
+ (medium-length sentences preferred over run-ons).
 
122
 
123
+ BUG-9 FIX: fallback truncation now cuts at last sentence boundary
124
+ instead of hard char index, preventing incomplete mid-sentence output.
125
  """
126
  try:
127
+ # Include Telugu/Indic sentence ending (ΰ₯€) in splitter
128
  sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
129
  sentences = [s.strip() for s in sentences if len(s.split()) > 5]
130
 
131
+ if not sentences:
132
+ return text
133
+
134
  if len(sentences) <= max_sentences:
135
  return text
136
 
137
  n = len(sentences)
138
 
 
139
  def score(idx, sent):
 
140
  if idx == 0:
141
  pos_score = 1.0 # first sentence = highest value
142
  elif idx == n - 1:
 
146
  else:
147
  pos_score = 0.3 # middle sentences
148
 
149
+ word_count = len(sent.split())
 
150
  if 10 <= word_count <= 30:
151
+ len_bonus = 0.3 # ideal length
152
  elif word_count < 10:
153
+ len_bonus = 0.0 # too short
154
  else:
155
+ len_bonus = 0.1 # penalise run-ons
156
 
157
  return pos_score + len_bonus
158
 
159
+ scored = sorted(enumerate(sentences), key=lambda x: score(x[0], x[1]), reverse=True)
 
 
 
 
160
  top_indices = sorted([i for i, _ in scored[:max_sentences]])
161
  summary = " ".join(sentences[i] for i in top_indices)
162
  return summary.strip()
163
 
164
  except Exception as e:
165
+ logger.warning(f"[Translator] Summarize failed: {e}")
166
+ # BUG-9 FIX: truncate at last sentence boundary, not hard char index
167
+ return self._safe_truncate(text, SUMMARY_FALLBACK_CHARS)
168
 
169
  # ══════════════════════════════════════════════════════════════════
170
+ # CHUNKING
171
  # ══════════════════════════════════════════════════════════════════
172
  def _chunk(self, text, max_words):
173
+ """
174
+ Split text into word-count-bounded chunks, respecting sentence
175
+ boundaries where possible. Handles Indic danda (ΰ₯€) as sentence end.
176
+ """
177
  sentences = re.split(r'(?<=[.!?।])\s+', text.strip())
178
  chunks, cur, count = [], [], 0
179
  for s in sentences:
 
227
  early_stopping=True,
228
  )
229
  results.append(
230
+ self._tokenizer.batch_decode(ids, skip_special_tokens=True)[0]
231
+ )
232
  except Exception as e:
233
+ logger.warning(f"[Translator] Chunk {i+1} NLLB failed: {e} β€” keeping original")
234
+ results.append(chunk) # degrade gracefully per-chunk
235
 
236
  translated = " ".join(results)
237
+ logger.info(f"[Translator] NLLB done in {time.time()-t0:.2f}s")
238
  return translated, f"NLLB-200-1.3B ({len(chunks)} chunks)"
239
 
240
  # ══════════════════════════════════════════════════════════════════
 
254
  ).translate(chunk)
255
  results.append(out)
256
  full = " ".join(results)
257
+ logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
258
  return full, f"Google Translate ({len(chunks)} chunks)"
259
  except Exception as e:
260
+ logger.error(f"[Translator] Google failed: {e}")
261
  return f"[Translation failed: {e}]", "error"
262
 
263
  # ══════════════════════════════════════════════════════════════════
 
272
  )
273
  print(f"[Translator] βœ… {MODEL_ID} pipeline ready")
274
  except Exception as e:
275
+ logger.warning(f"[Translator] Pipeline init failed ({e}), trying manual load")
276
  self._init_nllb_manual()
277
 
278
  def _init_nllb_manual(self):
 
280
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
281
  import torch
282
  self._tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
283
+ self._model = AutoModelForSeq2SeqLM.from_pretrained(
284
  MODEL_ID,
285
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
286
  )
 
289
  self._model.eval()
290
  print(f"[Translator] βœ… {MODEL_ID} manual load ready")
291
  except Exception as e:
292
+ logger.error(f"[Translator] NLLB manual load also failed: {e}")
293
+ # Both init paths exhausted β€” _pipeline and _model remain None.
294
+ # translate() will detect this and route directly to Google.
295
+
296
+ # ══════════════════════════════════════════════════════════════════
297
+ # HELPERS
298
+ # ══════════════════════════════════════════════════════════════════
299
+ @staticmethod
300
+ def _safe_truncate(text: str, max_chars: int) -> str:
301
+ """
302
+ BUG-9 FIX: Truncate text at the last sentence boundary within
303
+ max_chars, avoiding mid-sentence cuts. Falls back to hard truncation
304
+ only if no sentence boundary exists within the limit.
305
+ """
306
+ if len(text) <= max_chars:
307
+ return text
308
+ window = text[:max_chars]
309
+ last_period = max(window.rfind('.'), window.rfind('!'), window.rfind('?'))
310
+ if last_period > max_chars * 0.5: # boundary found in reasonable range
311
+ return window[:last_period + 1]
312
+ return window + "..."