Commit: e138b0e
1 Parent(s): 62d99f6
Force translation with Llama
- app.py +4 -2
- utils/llm.py +15 -6
- vi/processing.py +1 -0
- vi/translator.py +28 -13
app.py
CHANGED

@@ -394,10 +394,12 @@ def _run_job(dataset_key: str, params: ProcessParams):
         cache_dir = os.path.abspath("cache/huggingface")
         os.makedirs(cache_dir, exist_ok=True)
         os.environ["HF_HOME"] = cache_dir
-
+
+        # Pass paraphraser to translator for LLM-based translation
+        vietnamese_translator.paraphraser = paraphraser
         vietnamese_translator.load_model()
         translator = vietnamese_translator
-        logger.info("✅ Vietnamese translator loaded successfully")
+        logger.info("✅ Vietnamese translator loaded successfully with LLM models")
     except Exception as e:
         logger.error(f"❌ Failed to load Vietnamese translator: {e}")
         logger.warning("Continuing without Vietnamese translation...")
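For context, a minimal sketch of how this wiring is meant to be used end to end. The class and attribute names follow the diff; the model identifiers passed to Paraphraser are hypothetical placeholders, not values taken from this repository.

# Illustrative sketch only; model names below are placeholders.
from utils.llm import Paraphraser
from vi.translator import VietnameseTranslator

paraphraser = Paraphraser(
    nvidia_model="meta/llama-3.1-70b-instruct",  # placeholder
    gemini_model_easy="gemini-1.5-flash",        # placeholder
    gemini_model_hard="gemini-1.5-pro",          # accepted but ignored after this commit
)

vietnamese_translator = VietnameseTranslator()
vietnamese_translator.paraphraser = paraphraser  # LLM path becomes the primary translator
vietnamese_translator.load_model()               # Opus model stays available as a fallback
print(vietnamese_translator.translate_text("Hello, how are you?"))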
utils/llm.py
CHANGED

@@ -109,6 +109,7 @@ class NvidiaClient:
             data = r.json()
             text = data["choices"][0]["message"]["content"]
             clean = self._clean_resp(text)
+            # Log the output here
             logger.info(f"[LLM][NVIDIA] out={snip(clean)}")
             return clean
         except Exception as e:

@@ -117,11 +118,13 @@ class NvidiaClient:
             return None

 class Paraphraser:
-    """Prefers NVIDIA (cheap), falls back to Gemini. Also offers translate/backtranslate and a tiny consistency judge."""
+    """Prefers NVIDIA (cheap), falls back to Gemini EASY only. Also offers translate/backtranslate and a tiny consistency judge."""
     def __init__(self, nvidia_model: str, gemini_model_easy: str, gemini_model_hard: str):
         self.nv = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
         self.gm_easy = GeminiClient(KeyRotator("GEMINI_API"), gemini_model_easy)
-
+        # Only use GEMINI_MODEL_EASY, ignore hard model completely
+        self.gm_hard = None  # Disabled - only use easy model
+        logger.info("Paraphraser initialized: NVIDIA -> GEMINI_EASY (GEMINI_HARD disabled)")

     # Regex-based cleaning resp from quotes
     def _clean_resp(self, resp: str) -> str:

@@ -147,11 +150,17 @@ class Paraphraser:
             "Do not fabricate or remove factual claims.\n"
             "Return ONLY the rewritten text, without any introduction, commentary.\n"+ text
         )
+        # Always try NVIDIA first
         out = self.nv.generate(prompt, temperature=0.1, max_tokens=min(600, max(128, len(text)//2)))
-        if out:
-
-
-
+        if out:
+            return self._clean_resp(out)
+
+        # Only fallback to GEMINI_MODEL_EASY (ignore difficulty parameter)
+        out = self.gm_easy.generate(prompt, max_output_tokens=min(600, max(128, len(text)//2)))
+        if out:
+            logger.info(f"[LLM][GEMINI] out={snip(self._clean_resp(out))}")
+            return self._clean_resp(out)
+        return text

     # ————— Translate & Backtranslate —————
     def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
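Read together, the two Paraphraser hunks reduce the routing to a fixed two-step chain. A condensed, illustrative view of that order follows; the helper name is ours, and the generate() calls mirror the ones visible in the diff, not a verbatim copy of the method.

# Condensed sketch of the new routing inside Paraphraser (illustrative).
def _route_with_fallback(self, prompt: str, text: str) -> str:
    budget = min(600, max(128, len(text) // 2))
    out = self.nv.generate(prompt, temperature=0.1, max_tokens=budget)  # 1. NVIDIA first
    if out:
        return self._clean_resp(out)
    out = self.gm_easy.generate(prompt, max_output_tokens=budget)       # 2. Gemini EASY only
    if out:
        return self._clean_resp(out)
    return text                                                         # 3. give up: return the input unchanged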
vi/processing.py
CHANGED

@@ -145,6 +145,7 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
             add_translation_stats(translator._stats, f"sft_{field}", True)
             continue

+        # Use LLM for Vietnamese translation instead of Opus model
         translated = translator.translate_text(original)

         # Debug logging
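The touched function is only annotated here, but its call shape is visible in the hunk header. A hypothetical invocation is sketched below; the field names and the default text_fields list are not shown in this commit.

# Hypothetical example; real SFT rows and field names may differ.
row = {"instruction": "Summarize the text.", "output": "A short summary."}
translate_sft_row(row, translator, text_fields=["instruction", "output"])
# Each listed field now goes through translator.translate_text(), i.e. the
# LLM path first and the Opus model only as a fallback.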
vi/translator.py
CHANGED

@@ -12,19 +12,20 @@ logger = logging.getLogger(__name__)

 class VietnameseTranslator:
     """
-    Vietnamese translator using
+    Vietnamese translator using LLM models (NVIDIA/Gemini) with Opus as fallback.

-    This class handles translation from English to Vietnamese using
-
+    This class handles translation from English to Vietnamese using LLM models
+    for better quality, with Opus model as fallback.
     """

-    def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None):
+    def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None, paraphraser=None):
         """
         Initialize the Vietnamese translator.

         Args:
-            model_name: Hugging Face model name. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
-            device: Device to run the model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
+            model_name: Hugging Face model name for fallback. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
+            device: Device to run the fallback model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
+            paraphraser: Paraphraser instance with LLM models for primary translation
         """
         self.model_name = model_name or os.getenv("EN_VI", "Helsinki-NLP/opus-mt-en-vi")
         self.device = self._get_device(device)

@@ -32,8 +33,9 @@ class VietnameseTranslator:
         self.tokenizer = None
         self._is_loaded = False
         self._stats = {"total_translations": 0, "successful_translations": 0, "failed_translations": 0}
+        self.paraphraser = paraphraser  # LLM-based translator

-        logger.info(f"VietnameseTranslator initialized with
+        logger.info(f"VietnameseTranslator initialized with LLM models + Opus fallback: {self.model_name}")
         logger.info(f"Using device: {self.device}")

     def _get_device(self, device: Optional[str]) -> str:

@@ -87,7 +89,7 @@ class VietnameseTranslator:

     def translate_text(self, text: str) -> str:
         """
-        Translate a single text from English to Vietnamese.
+        Translate a single text from English to Vietnamese using LLM models first, Opus as fallback.

         Args:
             text: English text to translate

@@ -95,17 +97,30 @@ class VietnameseTranslator:
         Returns:
             Translated Vietnamese text
         """
-        if not self._is_loaded:
-            self.load_model()
-
         if not text or not text.strip():
             return text

         try:
             self._stats["total_translations"] += 1

+            # Try LLM-based translation first (NVIDIA/Gemini)
+            if self.paraphraser:
+                try:
+                    translated = self.paraphraser.translate(text, target_lang="vi")
+                    if translated and translated.strip() and translated.strip() != text.strip():
+                        logger.debug(f"LLM Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
+                        self._stats["successful_translations"] += 1
+                        return translated.strip()
+                    else:
+                        logger.debug("LLM translation failed or returned identical text, trying Opus fallback")
+                except Exception as e:
+                    logger.debug(f"LLM translation failed: {e}, trying Opus fallback")
+
+            # Fallback to Opus model
+            if not self._is_loaded:
+                self.load_model()
+
             # Prepare input with target language token
-            # The model requires a target language token in the format >>id<<
             input_text = f">>vie<< {text.strip()}"

             # Tokenize

@@ -130,7 +145,7 @@ class VietnameseTranslator:
             # Decode
             translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

-            logger.debug(f"Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
+            logger.debug(f"Opus Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
             logger.debug(f"Are original and translated the same? {text.strip() == translated.strip()}")

             # Track success
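One way to exercise the new LLM-first branch without real API keys is a stub that mimics Paraphraser.translate. This is an illustrative sketch, not part of the commit; the stub class and its return values are made up.

# Illustrative sketch: a stub paraphraser drives the LLM-first path.
from vi.translator import VietnameseTranslator

class StubParaphraser:
    def translate(self, text, target_lang="vi"):
        # Return a Vietnamese string for one known input, None otherwise.
        return "Xin chào" if text.strip() == "Hello" else None

translator = VietnameseTranslator(paraphraser=StubParaphraser())
print(translator.translate_text("Hello"))  # "Xin chào", served by the stubbed LLM path
# Any other input makes the stub return None, so translate_text falls through
# to the Opus model, which is loaded lazily via load_model() on first use.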
|