Spaces:
Sleeping
Sleeping
| """ | |
| Processing utilities for Vietnamese translation integration | |
| """ | |
| import logging | |
| from typing import Dict, Any, List, Optional, Callable | |
| logger = logging.getLogger(__name__) | |
| def _vi_sanitize_text(s: str) -> str: | |
| """Light Vietnamese sanitization for finetuning and RAG: strip extra spaces, limit repetition, preserve numbers/units.""" | |
| if not isinstance(s, str): | |
| return s | |
| t = s.strip() | |
| # Collapse repeated punctuation and whitespace | |
| import re | |
| t = re.sub(r"\s+", " ", t) | |
| t = re.sub(r"([.?!]){3,}", r"..", t) | |
| # Remove obvious repetition chunks (very heuristic) | |
| parts = t.split() | |
| if len(parts) > 20: | |
| window = 6 | |
| seen = set() | |
| filtered = [] | |
| for i in range(len(parts)): | |
| ngram = " ".join(parts[max(0, i-window):i+1]) | |
| if ngram in seen: | |
| continue | |
| seen.add(ngram) | |
| filtered.append(parts[i]) | |
| t = " ".join(filtered) | |
| return t | |
| def _validate_vi_translation(original: str, translated: str) -> bool: | |
| """Validate Vietnamese translation quality""" | |
| if not translated or not isinstance(translated, str): | |
| return False | |
| # Check if translation is too short or too different in length | |
| if len(translated.strip()) < 3: | |
| return False | |
| # Check if translation contains too much English (should be mostly Vietnamese) | |
| import re | |
| english_chars = len(re.findall(r'[a-zA-Z]', translated)) | |
| total_chars = len(re.sub(r'\s', '', translated)) | |
| if total_chars > 0 and english_chars / total_chars > 0.7: | |
| return False | |
| # Check for common translation failure patterns | |
| failure_patterns = [ | |
| "translation", "error", "failed", "unable", "cannot", | |
| "not available", "not found", "invalid", "error" | |
| ] | |
| translated_lower = translated.lower() | |
| for pattern in failure_patterns: | |
| if pattern in translated_lower: | |
| return False | |
| return True | |
def translate_sft_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
    """Translate the text fields of an SFT row and vet each translated value.

    The row is passed to ``translator.translate_dict``; afterwards every
    requested field found under the result's ``"sft"`` mapping is validated.
    Valid translations are sanitized, invalid ones are replaced by the value
    from the input row's ``"sft"`` mapping (or ``""`` if it is absent there).

    Args:
        row: SFT row dictionary.
        translator: VietnameseTranslator instance; must provide
            ``is_loaded()`` and ``translate_dict(row, fields)``.
        text_fields: Names of the fields to translate. When None, defaults
            to ``["instruction", "input", "output"]``.

    Returns:
        The translated row, or the input ``row`` unchanged when the
        translator is unavailable or any exception occurs.
    """
    translator_ready = translator and translator.is_loaded()
    if not translator_ready:
        logger.warning("Translator not available, skipping translation")
        return row

    # Fall back to the default SFT fields when the caller gave none.
    text_fields = ["instruction", "input", "output"] if text_fields is None else text_fields

    try:
        translated_row = translator.translate_dict(row, text_fields)

        # Vet each translated field; keep the source text on failure.
        for f in text_fields:
            if f not in translated_row.get("sft", {}):
                continue
            source_value = row.get("sft", {}).get(f, "")
            candidate = translated_row["sft"][f]
            if _validate_vi_translation(source_value, candidate):
                translated_row["sft"][f] = _vi_sanitize_text(candidate)
            else:
                logger.warning(f"Invalid Vietnamese translation for field {f}, keeping original")
                translated_row["sft"][f] = source_value

        logger.debug(f"Translated SFT row with fields: {text_fields}")
        return translated_row
    except Exception as e:
        # Best effort: any failure falls back to the untranslated row.
        logger.error(f"Failed to translate SFT row: {e}")
        return row
def translate_rag_row(row: Dict[str, Any], translator, text_fields: List[str] = None) -> Dict[str, Any]:
    """Translate the text fields of a RAG row and vet each translated value.

    The row is passed to ``translator.translate_dict``; afterwards every
    requested field present as a top-level key of the result is validated.
    Valid translations are sanitized, invalid ones are replaced by the value
    from the input row (or ``""`` if the key is absent there).

    Args:
        row: RAG row dictionary.
        translator: VietnameseTranslator instance; must provide
            ``is_loaded()`` and ``translate_dict(row, fields)``.
        text_fields: Names of the fields to translate. When None, defaults
            to ``["question", "answer", "context"]``.

    Returns:
        The translated row, or the input ``row`` unchanged when the
        translator is unavailable or any exception occurs.
    """
    translator_ready = translator and translator.is_loaded()
    if not translator_ready:
        logger.warning("Translator not available, skipping translation")
        return row

    # Fall back to the default RAG fields (Q, A, C) when the caller gave none.
    text_fields = ["question", "answer", "context"] if text_fields is None else text_fields

    try:
        translated_row = translator.translate_dict(row, text_fields)

        # Vet each translated field; keep the source text on failure.
        for f in text_fields:
            if f not in translated_row:
                continue
            source_value = row.get(f, "")
            candidate = translated_row[f]
            if _validate_vi_translation(source_value, candidate):
                translated_row[f] = _vi_sanitize_text(candidate)
            else:
                logger.warning(f"Invalid Vietnamese translation for field {f}, keeping original")
                translated_row[f] = source_value

        logger.debug(f"Translated RAG row with fields: {text_fields}")
        return translated_row
    except Exception as e:
        # Best effort: any failure falls back to the untranslated row.
        logger.error(f"Failed to translate RAG row: {e}")
        return row
def should_translate(vietnamese_translation: bool, translator) -> bool:
    """Tell whether Vietnamese translation should run.

    Args:
        vietnamese_translation: Flag from user input.
        translator: VietnameseTranslator instance; must provide
            ``is_loaded()``.

    Returns:
        True only when the flag is set and the translator is present and
        reports itself loaded. A warning is logged when translation was
        requested but the translator is unavailable.
    """
    if vietnamese_translation:
        if translator and translator.is_loaded():
            return True
        logger.warning("Vietnamese translation requested but translator not available")
    return False
def log_translation_stats(stats: Dict[str, Any], translated_count: int) -> None:
    """Record the translated-item count in ``stats`` and log it.

    Args:
        stats: Statistics dictionary; its ``"vietnamese_translated"`` entry
            is set in place.
        translated_count: Number of items translated.
    """
    summary = f"Vietnamese translation completed: {translated_count} items translated"
    stats["vietnamese_translated"] = translated_count
    logger.info(summary)