LiamKhoaLe committed on
Commit
e138b0e
·
1 Parent(s): 62d99f6

Force translation with Llama

Browse files
Files changed (4) hide show
  1. app.py +4 -2
  2. utils/llm.py +15 -6
  3. vi/processing.py +1 -0
  4. vi/translator.py +28 -13
app.py CHANGED
@@ -394,10 +394,12 @@ def _run_job(dataset_key: str, params: ProcessParams):
394
  cache_dir = os.path.abspath("cache/huggingface")
395
  os.makedirs(cache_dir, exist_ok=True)
396
  os.environ["HF_HOME"] = cache_dir
397
-
 
 
398
  vietnamese_translator.load_model()
399
  translator = vietnamese_translator
400
- logger.info("✅ Vietnamese translator loaded successfully")
401
  except Exception as e:
402
  logger.error(f"❌ Failed to load Vietnamese translator: {e}")
403
  logger.warning("Continuing without Vietnamese translation...")
 
394
  cache_dir = os.path.abspath("cache/huggingface")
395
  os.makedirs(cache_dir, exist_ok=True)
396
  os.environ["HF_HOME"] = cache_dir
397
+
398
+ # Pass paraphraser to translator for LLM-based translation
399
+ vietnamese_translator.paraphraser = paraphraser
400
  vietnamese_translator.load_model()
401
  translator = vietnamese_translator
402
+ logger.info("✅ Vietnamese translator loaded successfully with LLM models")
403
  except Exception as e:
404
  logger.error(f"❌ Failed to load Vietnamese translator: {e}")
405
  logger.warning("Continuing without Vietnamese translation...")
utils/llm.py CHANGED
@@ -109,6 +109,7 @@ class NvidiaClient:
109
  data = r.json()
110
  text = data["choices"][0]["message"]["content"]
111
  clean = self._clean_resp(text)
 
112
  logger.info(f"[LLM][NVIDIA] out={snip(clean)}")
113
  return clean
114
  except Exception as e:
@@ -117,11 +118,13 @@ class NvidiaClient:
117
  return None
118
 
119
  class Paraphraser:
120
- """Prefers NVIDIA (cheap), falls back to Gemini. Also offers translate/backtranslate and a tiny consistency judge."""
121
  def __init__(self, nvidia_model: str, gemini_model_easy: str, gemini_model_hard: str):
122
  self.nv = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
123
  self.gm_easy = GeminiClient(KeyRotator("GEMINI_API"), gemini_model_easy)
124
- self.gm_hard = GeminiClient(KeyRotator("GEMINI_API"), gemini_model_hard)
 
 
125
 
126
  # Regex-based cleaning resp from quotes
127
  def _clean_resp(self, resp: str) -> str:
@@ -147,11 +150,17 @@ class Paraphraser:
147
  "Do not fabricate or remove factual claims.\n"
148
  "Return ONLY the rewritten text, without any introduction, commentary.\n"+ text
149
  )
 
150
  out = self.nv.generate(prompt, temperature=0.1, max_tokens=min(600, max(128, len(text)//2)))
151
- if out: return self._clean_resp(out)
152
- gm = self.gm_easy if difficulty == "easy" else self.gm_hard
153
- out = gm.generate(prompt, max_output_tokens=min(600, max(128, len(text)//2)))
154
- return self._clean_resp(out) if out else text
 
 
 
 
 
155
 
156
  # ————— Translate & Backtranslate —————
157
  def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
 
109
  data = r.json()
110
  text = data["choices"][0]["message"]["content"]
111
  clean = self._clean_resp(text)
112
+ # Log the output here
113
  logger.info(f"[LLM][NVIDIA] out={snip(clean)}")
114
  return clean
115
  except Exception as e:
 
118
  return None
119
 
120
  class Paraphraser:
121
+ """Prefers NVIDIA (cheap), falls back to Gemini EASY only. Also offers translate/backtranslate and a tiny consistency judge."""
122
  def __init__(self, nvidia_model: str, gemini_model_easy: str, gemini_model_hard: str):
123
  self.nv = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
124
  self.gm_easy = GeminiClient(KeyRotator("GEMINI_API"), gemini_model_easy)
125
+ # Only use GEMINI_MODEL_EASY, ignore hard model completely
126
+ self.gm_hard = None # Disabled - only use easy model
127
+ logger.info("Paraphraser initialized: NVIDIA -> GEMINI_EASY (GEMINI_HARD disabled)")
128
 
129
  # Regex-based cleaning resp from quotes
130
  def _clean_resp(self, resp: str) -> str:
 
150
  "Do not fabricate or remove factual claims.\n"
151
  "Return ONLY the rewritten text, without any introduction, commentary.\n"+ text
152
  )
153
+ # Always try NVIDIA first
154
  out = self.nv.generate(prompt, temperature=0.1, max_tokens=min(600, max(128, len(text)//2)))
155
+ if out:
156
+ return self._clean_resp(out)
157
+
158
+ # Only fallback to GEMINI_MODEL_EASY (ignore difficulty parameter)
159
+ out = self.gm_easy.generate(prompt, max_output_tokens=min(600, max(128, len(text)//2)))
160
+ if out:
161
+ logger.info(f"[LLM][GEMINI] out={snip(self._clean_resp(out))}")
162
+ return self._clean_resp(out)
163
+ return text
164
 
165
  # ————— Translate & Backtranslate —————
166
  def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
vi/processing.py CHANGED
@@ -145,6 +145,7 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
145
  add_translation_stats(translator._stats, f"sft_{field}", True)
146
  continue
147
 
 
148
  translated = translator.translate_text(original)
149
 
150
  # Debug logging
 
145
  add_translation_stats(translator._stats, f"sft_{field}", True)
146
  continue
147
 
148
+ # Use LLM for Vietnamese translation instead of Opus model
149
  translated = translator.translate_text(original)
150
 
151
  # Debug logging
vi/translator.py CHANGED
@@ -12,19 +12,20 @@ logger = logging.getLogger(__name__)
12
 
13
  class VietnameseTranslator:
14
  """
15
- Vietnamese translator using Helsinki-NLP/opus-mt-en-vi model.
16
 
17
- This class handles translation from English to Vietnamese using the
18
- MarianMT model from Hugging Face Transformers.
19
  """
20
 
21
- def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None):
22
  """
23
  Initialize the Vietnamese translator.
24
 
25
  Args:
26
- model_name: Hugging Face model name. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
27
- device: Device to run the model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
 
28
  """
29
  self.model_name = model_name or os.getenv("EN_VI", "Helsinki-NLP/opus-mt-en-vi")
30
  self.device = self._get_device(device)
@@ -32,8 +33,9 @@ class VietnameseTranslator:
32
  self.tokenizer = None
33
  self._is_loaded = False
34
  self._stats = {"total_translations": 0, "successful_translations": 0, "failed_translations": 0}
 
35
 
36
- logger.info(f"VietnameseTranslator initialized with model: {self.model_name}")
37
  logger.info(f"Using device: {self.device}")
38
 
39
  def _get_device(self, device: Optional[str]) -> str:
@@ -87,7 +89,7 @@ class VietnameseTranslator:
87
 
88
  def translate_text(self, text: str) -> str:
89
  """
90
- Translate a single text from English to Vietnamese.
91
 
92
  Args:
93
  text: English text to translate
@@ -95,17 +97,30 @@ class VietnameseTranslator:
95
  Returns:
96
  Translated Vietnamese text
97
  """
98
- if not self._is_loaded:
99
- self.load_model()
100
-
101
  if not text or not text.strip():
102
  return text
103
 
104
  try:
105
  self._stats["total_translations"] += 1
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Prepare input with target language token
108
- # The model requires a target language token in the format >>id<<
109
  input_text = f">>vie<< {text.strip()}"
110
 
111
  # Tokenize
@@ -130,7 +145,7 @@ class VietnameseTranslator:
130
  # Decode
131
  translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
132
 
133
- logger.debug(f"Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
134
  logger.debug(f"Are original and translated the same? {text.strip() == translated.strip()}")
135
 
136
  # Track success
 
12
 
13
  class VietnameseTranslator:
14
  """
15
+ Vietnamese translator using LLM models (NVIDIA/Gemini) with Opus as fallback.
16
 
17
+ This class handles translation from English to Vietnamese using LLM models
18
+ for better quality, with Opus model as fallback.
19
  """
20
 
21
+ def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None, paraphraser=None):
22
  """
23
  Initialize the Vietnamese translator.
24
 
25
  Args:
26
+ model_name: Hugging Face model name for fallback. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
27
+ device: Device to run the fallback model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
28
+ paraphraser: Paraphraser instance with LLM models for primary translation
29
  """
30
  self.model_name = model_name or os.getenv("EN_VI", "Helsinki-NLP/opus-mt-en-vi")
31
  self.device = self._get_device(device)
 
33
  self.tokenizer = None
34
  self._is_loaded = False
35
  self._stats = {"total_translations": 0, "successful_translations": 0, "failed_translations": 0}
36
+ self.paraphraser = paraphraser # LLM-based translator
37
 
38
+ logger.info(f"VietnameseTranslator initialized with LLM models + Opus fallback: {self.model_name}")
39
  logger.info(f"Using device: {self.device}")
40
 
41
  def _get_device(self, device: Optional[str]) -> str:
 
89
 
90
  def translate_text(self, text: str) -> str:
91
  """
92
+ Translate a single text from English to Vietnamese using LLM models first, Opus as fallback.
93
 
94
  Args:
95
  text: English text to translate
 
97
  Returns:
98
  Translated Vietnamese text
99
  """
 
 
 
100
  if not text or not text.strip():
101
  return text
102
 
103
  try:
104
  self._stats["total_translations"] += 1
105
 
106
+ # Try LLM-based translation first (NVIDIA/Gemini)
107
+ if self.paraphraser:
108
+ try:
109
+ translated = self.paraphraser.translate(text, target_lang="vi")
110
+ if translated and translated.strip() and translated.strip() != text.strip():
111
+ logger.debug(f"LLM Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
112
+ self._stats["successful_translations"] += 1
113
+ return translated.strip()
114
+ else:
115
+ logger.debug("LLM translation failed or returned identical text, trying Opus fallback")
116
+ except Exception as e:
117
+ logger.debug(f"LLM translation failed: {e}, trying Opus fallback")
118
+
119
+ # Fallback to Opus model
120
+ if not self._is_loaded:
121
+ self.load_model()
122
+
123
  # Prepare input with target language token
 
124
  input_text = f">>vie<< {text.strip()}"
125
 
126
  # Tokenize
 
145
  # Decode
146
  translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
147
 
148
+ logger.debug(f"Opus Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
149
  logger.debug(f"Are original and translated the same? {text.strip() == translated.strip()}")
150
 
151
  # Track success