Spaces:
Sleeping
Sleeping
Commit
·
e2fd976
1
Parent(s):
4056c2c
Upd valid output SFT saver
Browse files
- vi/processing.py +10 -2
- vi/translator.py +3 -1
vi/processing.py
CHANGED
|
@@ -104,12 +104,20 @@ def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] =
|
|
| 104 |
original = sft_data[field]
|
| 105 |
translated = translator.translate_text(original)
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Validate and sanitize translated field
|
| 108 |
if _validate_vi_translation(original, translated):
|
| 109 |
translated_sft[field] = _vi_sanitize_text(translated)
|
| 110 |
-
logger.debug(f"
|
| 111 |
else:
|
| 112 |
-
logger.warning(f"Invalid Vietnamese translation for field {field}, keeping original")
|
|
|
|
|
|
|
| 113 |
translated_sft[field] = original
|
| 114 |
except Exception as e:
|
| 115 |
logger.error(f"Failed to translate field '{field}': {e}")
|
|
|
|
| 104 |
original = sft_data[field]
|
| 105 |
translated = translator.translate_text(original)
|
| 106 |
|
| 107 |
+
# Debug logging
|
| 108 |
+
logger.debug(f"Translation attempt for field '{field}':")
|
| 109 |
+
logger.debug(f" Original: '{original[:50]}...'")
|
| 110 |
+
logger.debug(f" Translated: '{translated[:50]}...'")
|
| 111 |
+
logger.debug(f" Are they the same? {original == translated}")
|
| 112 |
+
|
| 113 |
# Validate and sanitize translated field
|
| 114 |
if _validate_vi_translation(original, translated):
|
| 115 |
translated_sft[field] = _vi_sanitize_text(translated)
|
| 116 |
+
logger.debug(f"✅ Successfully translated field '{field}'")
|
| 117 |
else:
|
| 118 |
+
logger.warning(f"❌ Invalid Vietnamese translation for field {field}, keeping original")
|
| 119 |
+
logger.warning(f" Original: '{original[:50]}...'")
|
| 120 |
+
logger.warning(f" Translated: '{translated[:50]}...'")
|
| 121 |
translated_sft[field] = original
|
| 122 |
except Exception as e:
|
| 123 |
logger.error(f"Failed to translate field '{field}': {e}")
|
vi/translator.py
CHANGED
|
@@ -127,7 +127,9 @@ class VietnameseTranslator:
|
|
| 127 |
# Decode
|
| 128 |
translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 129 |
|
| 130 |
-
logger.debug(f"
|
|
|
|
|
|
|
| 131 |
return translated.strip()
|
| 132 |
|
| 133 |
except Exception as e:
|
|
|
|
| 127 |
# Decode
|
| 128 |
translated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 129 |
|
| 130 |
+
logger.debug(f"Translation result: '{text[:50]}...' -> '{translated[:50]}...'")
|
| 131 |
+
logger.debug(f"Are original and translated the same? {text.strip() == translated.strip()}")
|
| 132 |
+
|
| 133 |
return translated.strip()
|
| 134 |
|
| 135 |
except Exception as e:
|