testingfaces committed on
Commit
e86fdec
·
verified ·
1 Parent(s): e4cbaa4

Update translator.py

Browse files
Files changed (1) hide show
  1. translator.py +121 -78
translator.py CHANGED
@@ -1,7 +1,12 @@
1
  """
2
  Department 3 - Translator
3
- Primary : NLLB-200-distilled-1.3B (Meta) — ✅ UPGRADED from 600M for better accuracy
4
- Fallback : deep-translator (Google Translate) if NLLB fails
 
 
 
 
 
5
  """
6
 
7
  import time
@@ -25,10 +30,9 @@ NLLB_CODES = {
25
  "ru": "rus_Cyrl",
26
  }
27
 
28
- # ✅ UPGRADED: 1.3B is significantly more accurate than 600M,
29
- # especially for Telugu, Tamil, Kannada — still runs free on HF CPU
30
- MODEL_ID = "facebook/nllb-200-distilled-1.3B"
31
- MAX_LENGTH = 512
32
 
33
 
34
  class Translator:
@@ -37,35 +41,131 @@ class Translator:
37
  self._tokenizer = None
38
  self._model = None
39
  self._nllb_loaded = False
40
- # ✅ LAZY LOAD: Don't load 2.5GB model on startup
41
- # Loads automatically on first translation request instead
42
  print("[Translator] Ready (NLLB loads on first use)")
43
 
44
- # ── Public ──────────────────────────────────────────────────────
45
  def translate(self, text: str, src_lang: str, tgt_lang: str):
46
- """
47
- Returns (translated_text, method_label).
48
- src_lang / tgt_lang are 2-letter codes (en, te, hi, ...).
49
- """
50
  if not text or not text.strip():
51
  return "", "skipped (empty)"
52
-
53
  if src_lang == tgt_lang:
54
  return text, "skipped (same language)"
55
 
 
56
  if not self._nllb_loaded:
57
  self._init_nllb()
58
  self._nllb_loaded = True
59
 
 
 
 
 
60
  if self._pipeline is not None or self._model is not None:
61
  try:
62
- return self._translate_nllb(text, src_lang, tgt_lang)
63
  except Exception as e:
64
  logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")
65
 
66
- return self._translate_google(text, src_lang, tgt_lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # ── NLLB-200 ────────────────────────────────────────────────────
69
  def _init_nllb(self):
70
  try:
71
  from transformers import pipeline as hf_pipeline
@@ -75,9 +175,9 @@ class Translator:
75
  device_map="auto",
76
  max_length=MAX_LENGTH,
77
  )
78
- print(f"[Translator] βœ… {MODEL_ID} loaded via pipeline")
79
  except Exception as e:
80
- logger.warning(f"[Translator] pipeline init failed, trying manual load: {e}")
81
  self._init_nllb_manual()
82
 
83
  def _init_nllb_manual(self):
@@ -94,62 +194,5 @@ class Translator:
94
  self._model.eval()
95
  print(f"[Translator] βœ… {MODEL_ID} loaded manually")
96
  except Exception as e:
97
- logger.error(f"[Translator] NLLB manual load also failed: {e}")
98
- self._model = None
99
-
100
- def _translate_nllb(self, text: str, src_lang: str, tgt_lang: str):
101
- t0 = time.time()
102
- src_code = NLLB_CODES.get(src_lang, "eng_Latn")
103
- tgt_code = NLLB_CODES.get(tgt_lang, "tel_Telu")
104
-
105
- if self._pipeline is not None:
106
- result = self._pipeline(
107
- text,
108
- src_lang=src_code,
109
- tgt_lang=tgt_code,
110
- max_length=MAX_LENGTH,
111
- )
112
- translated = result[0]["translation_text"]
113
- else:
114
- import torch
115
- inputs = self._tokenizer(
116
- text,
117
- return_tensors="pt",
118
- padding=True,
119
- truncation=True,
120
- max_length=MAX_LENGTH,
121
- )
122
- if torch.cuda.is_available():
123
- inputs = {k: v.cuda() for k, v in inputs.items()}
124
-
125
- tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
126
- with torch.no_grad():
127
- output_ids = self._model.generate(
128
- **inputs,
129
- forced_bos_token_id=tgt_lang_id,
130
- max_length=MAX_LENGTH,
131
- num_beams=4,
132
- early_stopping=True,
133
- )
134
- translated = self._tokenizer.batch_decode(
135
- output_ids, skip_special_tokens=True
136
- )[0]
137
-
138
- elapsed = time.time() - t0
139
- logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code} -> {tgt_code}")
140
- return translated, "NLLB-200-distilled-1.3B"
141
-
142
- # ── Google Translate fallback ────────────────────────────────────
143
- def _translate_google(self, text: str, src_lang: str, tgt_lang: str):
144
- t0 = time.time()
145
- try:
146
- from deep_translator import GoogleTranslator
147
- translated = GoogleTranslator(
148
- source=src_lang if src_lang != "auto" else "auto",
149
- target=tgt_lang,
150
- ).translate(text)
151
- logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
152
- return translated, "Google Translate (fallback)"
153
- except Exception as e:
154
- logger.error(f"[Translator] Google fallback also failed: {e}")
155
- return f"[Translation failed: {str(e)}]", "error"
 
1
  """
2
  Department 3 - Translator
3
+ Primary : NLLB-200-distilled-1.3B (Meta)
4
+ Fallback : deep-translator (Google Translate)
5
+
6
+ ✅ UPGRADED:
7
+ - Text chunking for long transcripts (fixes repetition bug)
8
+ - Splits by sentence, translates in chunks of up to 80 words (CHUNK_WORDS, roughly 400 tokens)
9
+ - Rejoins cleanly into full translation
10
  """
11
 
12
  import time
 
30
  "ru": "rus_Cyrl",
31
  }
32
 
33
+ MODEL_ID = "facebook/nllb-200-distilled-1.3B"
34
+ MAX_LENGTH = 512
35
+ CHUNK_WORDS = 80 # ~400 tokens, safe for NLLB
 
36
 
37
 
38
  class Translator:
 
41
  self._tokenizer = None
42
  self._model = None
43
  self._nllb_loaded = False
 
 
44
  print("[Translator] Ready (NLLB loads on first use)")
45
 
46
+ # ── Public ───────────────────────────────────────────────────────
47
  def translate(self, text: str, src_lang: str, tgt_lang: str):
 
 
 
 
48
  if not text or not text.strip():
49
  return "", "skipped (empty)"
 
50
  if src_lang == tgt_lang:
51
  return text, "skipped (same language)"
52
 
53
+ # Load NLLB on first use
54
  if not self._nllb_loaded:
55
  self._init_nllb()
56
  self._nllb_loaded = True
57
 
58
+ # Split long text into chunks
59
+ chunks = self._split_into_chunks(text, CHUNK_WORDS)
60
+ print(f"[Translator] Translating {len(chunks)} chunks ({len(text)} chars)")
61
+
62
  if self._pipeline is not None or self._model is not None:
63
  try:
64
+ return self._translate_chunks_nllb(chunks, src_lang, tgt_lang)
65
  except Exception as e:
66
  logger.warning(f"[Translator] NLLB failed ({e}), trying Google...")
67
 
68
+ return self._translate_chunks_google(chunks, src_lang, tgt_lang)
69
+
70
+ # ── Chunking ─────────────────────────────────────────────────────
71
+ def _split_into_chunks(self, text: str, max_words: int):
72
+ """Split text into sentence-aware chunks of max_words words."""
73
+ # Split by sentence endings
74
+ import re
75
+ sentences = re.split(r'(?<=[.!?])\s+', text.strip())
76
+
77
+ chunks = []
78
+ current = []
79
+ count = 0
80
+
81
+ for sentence in sentences:
82
+ words = sentence.split()
83
+ if count + len(words) > max_words and current:
84
+ chunks.append(" ".join(current))
85
+ current = []
86
+ count = 0
87
+ current.append(sentence)
88
+ count += len(words)
89
+
90
+ if current:
91
+ chunks.append(" ".join(current))
92
+
93
+ return chunks
94
+
95
+ # ── NLLB chunked translation ──────────────────────────────────────
96
def _translate_chunks_nllb(self, chunks, src_lang, tgt_lang):
    """Translate *chunks* with NLLB and join the results with spaces.

    Args:
        chunks: Text chunks to translate; blank chunks are skipped.
        src_lang: Source language key looked up in ``NLLB_CODES``
            (falls back to ``eng_Latn`` when unmapped).
        tgt_lang: Target language key looked up in ``NLLB_CODES``
            (falls back to ``tel_Telu`` when unmapped).

    Returns:
        ``(translated_text, method_label)``. A chunk whose translation
        raises is kept in its original language so the output stays
        complete.

    Raises:
        RuntimeError: When every non-blank chunk failed, so the caller
            can switch to its fallback translator instead of returning
            untranslated text labelled as an NLLB result.
    """
    t0 = time.time()

    src_code = NLLB_CODES.get(src_lang)
    if src_code is None:
        src_code = "eng_Latn"
        logger.warning(
            f"[Translator] No NLLB code for source '{src_lang}', assuming {src_code}"
        )
    tgt_code = NLLB_CODES.get(tgt_lang)
    if tgt_code is None:
        tgt_code = "tel_Telu"
        logger.warning(
            f"[Translator] No NLLB code for target '{tgt_lang}', defaulting to {tgt_code}"
        )

    results = []
    failed = 0
    for i, chunk in enumerate(chunks):
        if not chunk.strip():
            continue
        try:
            results.append(self._translate_one_nllb(chunk, src_code, tgt_code))
        except Exception as e:
            failed += 1
            logger.warning(f"[Translator] Chunk {i+1} failed: {e}")
            results.append(chunk)  # fallback: keep original

    if results and failed == len(results):
        # Nothing was translated at all; surface it rather than report success.
        raise RuntimeError(f"NLLB failed on all {failed} chunks")
    if failed:
        logger.warning(
            f"[Translator] {failed}/{len(results)} chunks left untranslated"
        )

    translated = " ".join(results)
    elapsed = time.time() - t0
    logger.info(f"[Translator] NLLB done in {elapsed:.2f}s: {src_code}->{tgt_code}")
    print(f"[Translator] ✅ Done in {elapsed:.2f}s ({len(chunks)} chunks)")
    return translated, f"NLLB-200-distilled-1.3B ({len(chunks)} chunks)"

def _translate_one_nllb(self, chunk, src_code, tgt_code):
    """Translate one chunk via the pipeline, or the manual model if no pipeline."""
    if self._pipeline is not None:
        result = self._pipeline(
            chunk,
            src_lang=src_code,
            tgt_lang=tgt_code,
            max_length=MAX_LENGTH,
        )
        return result[0]["translation_text"]

    import torch

    # The NLLB tokenizer prefixes the input with its `src_lang` token;
    # without this the text is encoded with the tokenizer's default language.
    self._tokenizer.src_lang = src_code
    inputs = self._tokenizer(
        chunk,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Follow the model's actual device rather than assuming it sits on
    # CUDA whenever CUDA is available.
    device = next(self._model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    tgt_lang_id = self._tokenizer.convert_tokens_to_ids(tgt_code)
    with torch.no_grad():
        output_ids = self._model.generate(
            **inputs,
            forced_bos_token_id=tgt_lang_id,
            max_length=MAX_LENGTH,
            num_beams=4,
            early_stopping=True,
        )
    return self._tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
146
+
147
+ # ── Google chunked translation ────────────────────────────────────
148
def _translate_chunks_google(self, chunks, src_lang, tgt_lang):
    """Translate *chunks* with Google Translate (via deep-translator).

    Args:
        chunks: Text chunks to translate; blank chunks are skipped.
        src_lang: Source language passed straight to ``GoogleTranslator``
            (including ``"auto"``).
        tgt_lang: Target language passed straight to ``GoogleTranslator``.

    Returns:
        ``(translated_text, method_label)`` on success, or
        ``("[Translation failed: <error>]", "error")`` if the import,
        translator setup or any request raises.
    """
    t0 = time.time()
    try:
        from deep_translator import GoogleTranslator

        pending = [chunk for chunk in chunks if chunk.strip()]
        results = []
        if pending:
            # One translator instance serves every chunk.
            translator = GoogleTranslator(source=src_lang, target=tgt_lang)
            for chunk in pending:
                translated = translator.translate(chunk)
                # Defensive: a None result would make the join below raise
                # and discard every chunk already translated.
                results.append(translated if translated is not None else chunk)
        full = " ".join(results)
        logger.info(f"[Translator] Google done in {time.time()-t0:.2f}s")
        return full, f"Google Translate ({len(chunks)} chunks)"
    except Exception as e:
        logger.error(f"[Translator] Google fallback failed: {e}")
        return f"[Translation failed: {str(e)}]", "error"
167
 
168
+ # ── NLLB init ────────────────────────────────────────────────────
169
  def _init_nllb(self):
170
  try:
171
  from transformers import pipeline as hf_pipeline
 
175
  device_map="auto",
176
  max_length=MAX_LENGTH,
177
  )
178
+ print(f"[Translator] βœ… {MODEL_ID} loaded")
179
  except Exception as e:
180
+ logger.warning(f"[Translator] Pipeline init failed: {e}, trying manual...")
181
  self._init_nllb_manual()
182
 
183
  def _init_nllb_manual(self):
 
194
  self._model.eval()
195
  print(f"[Translator] βœ… {MODEL_ID} loaded manually")
196
  except Exception as e:
197
+ logger.error(f"[Translator] NLLB manual load failed: {e}")
198
+ self._model = None