Commit: e138b0e
1 Parent(s): 62d99f6
Force translation with Llama
- app.py +4 -2
- utils/llm.py +15 -6
- vi/processing.py +1 -0
- vi/translator.py +28 -13
app.py
CHANGED

@@ -394,10 +394,12 @@ def _run_job(dataset_key: str, params: ProcessParams):
         cache_dir = os.path.abspath("cache/huggingface")
         os.makedirs(cache_dir, exist_ok=True)
         os.environ["HF_HOME"] = cache_dir
-
+
+        # Pass paraphraser to translator for LLM-based translation
+        vietnamese_translator.paraphraser = paraphraser
         vietnamese_translator.load_model()
         translator = vietnamese_translator
-        logger.info("✅ Vietnamese translator loaded successfully")
+        logger.info("✅ Vietnamese translator loaded successfully with LLM models")
     except Exception as e:
         logger.error(f"❌ Failed to load Vietnamese translator: {e}")
         logger.warning("Continuing without Vietnamese translation...")
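For context, a minimal sketch of how this wiring is meant to be used end to end. The class and attribute names follow the diff; the model identifiers passed to Paraphraser are hypothetical placeholders, not values taken from this repository.

# Illustrative sketch only; model names below are placeholders.
from utils.llm import Paraphraser
from vi.translator import VietnameseTranslator

paraphraser = Paraphraser(
    nvidia_model="meta/llama-3.1-70b-instruct",  # placeholder
    gemini_model_easy="gemini-1.5-flash",        # placeholder
    gemini_model_hard="gemini-1.5-pro",          # accepted but ignored after this commit
)

vietnamese_translator = VietnameseTranslator()
vietnamese_translator.paraphraser = paraphraser  # LLM path becomes the primary translator
vietnamese_translator.load_model()               # Opus model stays available as a fallback
print(vietnamese_translator.translate_text("Hello, how are you?"))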
utils/llm.py
CHANGED

@@ -109,6 +109,7 @@ class NvidiaClient:
             data = r.json()
             text = data["choices"][0]["message"]["content"]
             clean = self._clean_resp(text)
+            # Log the output here
             logger.info(f"[LLM][NVIDIA] out={snip(clean)}")
             return clean
         except Exception as e:

@@ -117,11 +118,13 @@ class NvidiaClient:
             return None

 class Paraphraser:
-    """Prefers NVIDIA (cheap), falls back to Gemini. Also offers translate/backtranslate and a tiny consistency judge."""
+    """Prefers NVIDIA (cheap), falls back to Gemini EASY only. Also offers translate/backtranslate and a tiny consistency judge."""
     def __init__(self, nvidia_model: str, gemini_model_easy: str, gemini_model_hard: str):
         self.nv = NvidiaClient(KeyRotator("NVIDIA_API"), nvidia_model)
         self.gm_easy = GeminiClient(KeyRotator("GEMINI_API"), gemini_model_easy)
-
+        # Only use GEMINI_MODEL_EASY, ignore hard model completely
+        self.gm_hard = None  # Disabled - only use easy model
+        logger.info("Paraphraser initialized: NVIDIA -> GEMINI_EASY (GEMINI_HARD disabled)")

     # Regex-based cleaning resp from quotes
     def _clean_resp(self, resp: str) -> str:

@@ -147,11 +150,17 @@ class Paraphraser:
             "Do not fabricate or remove factual claims.\n"
             "Return ONLY the rewritten text, without any introduction, commentary.\n"+ text
         )
+        # Always try NVIDIA first
         out = self.nv.generate(prompt, temperature=0.1, max_tokens=min(600, max(128, len(text)//2)))
-        if out:
-
-
-
+        if out:
+            return self._clean_resp(out)
+
+        # Only fallback to GEMINI_MODEL_EASY (ignore difficulty parameter)
+        out = self.gm_easy.generate(prompt, max_output_tokens=min(600, max(128, len(text)//2)))
+        if out:
+            logger.info(f"[LLM][GEMINI] out={snip(self._clean_resp(out))}")
+            return self._clean_resp(out)
+        return text

     # ————— Translate & Backtranslate —————
     def translate(self, text: str, target_lang: str = "vi") -> Optional[str]:
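Read together, the two Paraphraser hunks reduce the routing to a fixed two-step chain. A condensed, illustrative view of that order follows; the helper name is ours, and the generate() calls mirror the ones visible in the diff, not a verbatim copy of the method.

# Condensed sketch of the new routing inside Paraphraser (illustrative).
def _route_with_fallback(self, prompt: str, text: str) -> str:
    budget = min(600, max(128, len(text) // 2))
    out = self.nv.generate(prompt, temperature=0.1, max_tokens=budget)  # 1. NVIDIA first
    if out:
        return self._clean_resp(out)
    out = self.gm_easy.generate(prompt, max_output_tokens=budget)       # 2. Gemini EASY only
    if out:
        return self._clean_resp(out)
    return text                                                         # 3. give up: return the input unchanged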
vi/processing.py
CHANGED

@@ -145,6 +145,7 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
             add_translation_stats(translator._stats, f"sft_{field}", True)
             continue

+        # Use LLM for Vietnamese translation instead of Opus model
         translated = translator.translate_text(original)

         # Debug logging
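The touched function is only annotated here, but its call shape is visible in the hunk header. A hypothetical invocation is sketched below; the field names and the default text_fields list are not shown in this commit.

# Hypothetical example; real SFT rows and field names may differ.
row = {"instruction": "Summarize the text.", "output": "A short summary."}
translate_sft_row(row, translator, text_fields=["instruction", "output"])
# Each listed field now goes through translator.translate_text(), i.e. the
# LLM path first and the Opus model only as a fallback.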
vi/translator.py
CHANGED

@@ -12,19 +12,20 @@ logger = logging.getLogger(__name__)

 class VietnameseTranslator:
     """
-    Vietnamese translator using
+    Vietnamese translator using LLM models (NVIDIA/Gemini) with Opus as fallback.

-    This class handles translation from English to Vietnamese using
-
+    This class handles translation from English to Vietnamese using LLM models
+    for better quality, with Opus model as fallback.
     """

-    def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None):
+    def __init__(self, model_name: Optional[str] = None, device: Optional[str] = None, paraphraser=None):
         """
         Initialize the Vietnamese translator.

         Args:
-            model_name: Hugging Face model name. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
-            device: Device to run the model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
+            model_name: Hugging Face model name for fallback. Defaults to EN_VI env var or Helsinki-NLP/opus-mt-en-vi
+            device: Device to run the fallback model on ('cpu', 'cuda', 'auto'). Defaults to 'auto'
+            paraphraser: Paraphraser instance with LLM models for primary translation
         """
         self.model_name = model_name or os.getenv("EN_VI", "Helsinki-NLP/opus-mt-en-vi")
         self.device = self._get_device(device)

@@ -32,8 +33,9 @@ class VietnameseTranslator:
         self.tokenizer = None
         self._is_loaded = False
         self._stats = {"total_translations": 0, "successful_translations": 0, "failed_translations": 0}
+        self.paraphraser = paraphraser  # LLM-based translator

-        logger.info(f"VietnameseTranslator initialized with
+        logger.info(f"VietnameseTranslator initialized with LLM models + Opus fallback: {self.model_name}")
         logger.info(f"Using device: {self.device}")

     def _get_device(self, device: Optional[str]) -> str:

@@ -87,7 +89,7 @@ class VietnameseTranslator:

     def translate_text(self, text: str) -> str:
         """
-        Translate a single text from English to Vietnamese.
+        Translate a single text from English to Vietnamese using LLM models first, Opus as fallback.

         Args:
             text: English text to translate

@@ -95,17 +97,30 @@ class VietnameseTranslator:
         Returns:
             Translated Vietnamese text
         """
-        if not self._is_loaded:
-            self.load_model()
-
         if not text or not text.strip():
             return text

         try:
             self._stats["total_translations"] += 1

+            # Try LLM-based translation first (NVIDIA/Gemini)
+            if self.paraphraser:
+                try:
+                    translated = self.paraphraser.translate(text, target_lang="vi")
+                    if translated and translated.strip() and translated.strip() != text.strip():
+                        logger.debug(f"LLM Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
+                        self._stats["successful_translations"] += 1
+                        return translated.strip()
+                    else:
+                        logger.debug("LLM translation failed or returned identical text, trying Opus fallback")
+                except Exception as e:
+                    logger.debug(f"LLM translation failed: {e}, trying Opus fallback")
+
+            # Fallback to Opus model
+            if not self._is_loaded:
+                self.load_model()
+
             # Prepare input with target language token
-            # The model requires a target language token in the format >>id<<
             input_text = f">>vie<< {text.strip()}"

             # Tokenize

@@ -130,7 +145,7 @@ class VietnameseTranslator:
             # Decode
             translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

-            logger.debug(f"Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
+            logger.debug(f"Opus Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
             logger.debug(f"Are original and translated the same? {text.strip() == translated.strip()}")

             # Track success
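One way to exercise the new LLM-first branch without real API keys is a stub that mimics Paraphraser.translate. This is an illustrative sketch, not part of the commit; the stub class and its return values are made up.

# Illustrative sketch: a stub paraphraser drives the LLM-first path.
from vi.translator import VietnameseTranslator

class StubParaphraser:
    def translate(self, text, target_lang="vi"):
        # Return a Vietnamese string for one known input, None otherwise.
        return "Xin chào" if text.strip() == "Hello" else None

translator = VietnameseTranslator(paraphraser=StubParaphraser())
print(translator.translate_text("Hello"))  # "Xin chào", served by the stubbed LLM path
# Any other input makes the stub return None, so translate_text falls through
# to the Opus model, which is loaded lazily via load_model() on first use.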
|