processing / vi /processing.py
LiamKhoaLe's picture
Add Vietnamese translation functionality with Helsinki-NLP/opus-mt-en-vi model
95a3120
raw
history blame
3.16 kB
"""
Processing utilities for Vietnamese translation integration
"""
import logging
from typing import Dict, Any, List, Optional, Callable
logger = logging.getLogger(__name__)
def translate_sft_row(
    row: Dict[str, Any],
    translator,
    text_fields: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Translate specific text fields in an SFT row from English to Vietnamese.

    Translation is best-effort: if the translator is missing, reports that it
    is not loaded, or raises while translating, the original row is returned
    unchanged and the problem is logged.

    Args:
        row: SFT row dictionary
        translator: VietnameseTranslator instance; must provide
            ``is_loaded()`` and ``translate_dict(row, fields)``.
        text_fields: List of field names to translate. If None, defaults to
            ``["instruction", "input", "output"]``.

    Returns:
        Whatever ``translator.translate_dict`` returns on success, otherwise
        the original ``row`` object.
    """
    if not translator or not translator.is_loaded():
        logger.warning("Translator not available, skipping translation")
        return row

    if text_fields is None:
        # Default fields to translate in SFT format
        text_fields = ["instruction", "input", "output"]

    # Keep the try body down to the one call that is expected to fail.
    try:
        translated_row = translator.translate_dict(row, text_fields)
    except Exception as e:
        # Deliberate best-effort boundary: one bad row must not abort the
        # caller. logger.exception records the traceback, which a plain
        # logger.error call would drop.
        logger.exception("Failed to translate SFT row: %s", e)
        return row

    # Lazy %-formatting: the message is only built when DEBUG is enabled.
    logger.debug("Translated SFT row with fields: %s", text_fields)
    return translated_row
def translate_rag_row(
    row: Dict[str, Any],
    translator,
    text_fields: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Translate specific text fields in a RAG row from English to Vietnamese.

    Translation is best-effort: if the translator is missing, reports that it
    is not loaded, or raises while translating, the original row is returned
    unchanged and the problem is logged.

    Args:
        row: RAG row dictionary
        translator: VietnameseTranslator instance; must provide
            ``is_loaded()`` and ``translate_dict(row, fields)``.
        text_fields: List of field names to translate. If None, defaults to
            ``["instruction", "input", "output"]``.

    Returns:
        Whatever ``translator.translate_dict`` returns on success, otherwise
        the original ``row`` object.
    """
    if not translator or not translator.is_loaded():
        logger.warning("Translator not available, skipping translation")
        return row

    if text_fields is None:
        # Default fields to translate in RAG format
        # NOTE(review): these are the same names as the SFT defaults --
        # confirm that RAG rows really use instruction/input/output.
        text_fields = ["instruction", "input", "output"]

    # Keep the try body down to the one call that is expected to fail.
    try:
        translated_row = translator.translate_dict(row, text_fields)
    except Exception as e:
        # Deliberate best-effort boundary: one bad row must not abort the
        # caller. logger.exception records the traceback, which a plain
        # logger.error call would drop.
        logger.exception("Failed to translate RAG row: %s", e)
        return row

    # Lazy %-formatting: the message is only built when DEBUG is enabled.
    logger.debug("Translated RAG row with fields: %s", text_fields)
    return translated_row
def should_translate(vietnamese_translation: bool, translator) -> bool:
    """
    Decide whether Vietnamese translation should run.

    Translation runs only when the user asked for it and a loaded translator
    is available. A warning is logged when it was requested but the
    translator cannot be used.

    Args:
        vietnamese_translation: Flag from user input
        translator: VietnameseTranslator instance

    Returns:
        True if translation should be performed, False otherwise
    """
    if vietnamese_translation:
        # Truthiness is checked first so is_loaded() is never called on a
        # missing translator.
        translator_ready = bool(translator) and translator.is_loaded()
        if translator_ready:
            return True
        logger.warning("Vietnamese translation requested but translator not available")
    return False
def log_translation_stats(stats: Dict[str, Any], translated_count: int) -> None:
    """
    Record the translation count and report it in the log.

    The count is written into ``stats`` under the ``"vietnamese_translated"``
    key (the dictionary is modified in place), then an info-level summary is
    logged.

    Args:
        stats: Statistics dictionary to update
        translated_count: Number of items translated
    """
    stats_key = "vietnamese_translated"
    stats[stats_key] = translated_count

    summary = f"Vietnamese translation completed: {translated_count} items translated"
    logger.info(summary)