Spaces:
Sleeping
Sleeping
| # translation.py | |
| from transformers import pipeline | |
| import logging | |
| import re | |
| from collections import Counter | |
| logger = logging.getLogger("translation-agent") | |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader | |
| # To use lazy model loader | |
| vi_en = None | |
| zh_en = None | |
| def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str: | |
| """Collapse excessive repeated n-grams and repeated phrases with improved logic.""" | |
| if not s: | |
| return s | |
| # Collapse repeated spaces/newlines | |
| s = re.sub(r"\s+", " ", s).strip() | |
| # More aggressive repetition detection | |
| # Check for simple word repetition (like "a lot of people do not" repeated) | |
| words = s.split() | |
| if len(words) > 20: # Only check if text is long enough | |
| # Look for repeated sequences of 3-8 words | |
| for seq_len in range(8, 2, -1): | |
| if len(words) < seq_len * 3: # Need at least 3 repetitions | |
| continue | |
| # Check each possible starting position | |
| for start in range(len(words) - seq_len * 2): | |
| sequence = words[start:start + seq_len] | |
| # Count how many times this sequence repeats | |
| repeat_count = 1 | |
| pos = start + seq_len | |
| while pos + seq_len <= len(words): | |
| if words[pos:pos + seq_len] == sequence: | |
| repeat_count += 1 | |
| pos += seq_len | |
| else: | |
| break | |
| # If we found 3+ repetitions, remove the excess | |
| if repeat_count >= 3: | |
| # Keep only the first occurrence | |
| new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:] | |
| s = " ".join(new_words) | |
| words = s.split() | |
| break | |
| else: | |
| continue | |
| break # Break outer loop if we found and fixed a repetition | |
| # Additional cleanup for remaining patterns | |
| # Remove consecutive identical word | |
| tokens = s.split() | |
| out = [] | |
| last = None | |
| for t in tokens: | |
| if last is None or t.lower() != last.lower(): | |
| out.append(t) | |
| last = t | |
| s = " ".join(out) | |
| # Limit consecutive duplicate n-grams | |
| for n in range(n_max, n_min - 1, -1): | |
| pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE) | |
| s = pattern.sub(r"\1", s) | |
| return s | |
| def _normalize_and_cap(s: str, cap: int = 512) -> str: | |
| if not s: | |
| return s | |
| s = s.strip() | |
| if len(s) > cap: | |
| s = s[:cap] | |
| return s | |
| def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool: | |
| if not s: | |
| return False | |
| tokens = [t.lower() for t in s.split()] | |
| if len(tokens) < 10: | |
| return False | |
| counts = Counter(tokens) | |
| top = counts.most_common(1)[0][1] | |
| return (top / max(1, len(tokens))) >= threshold | |
| def translate_query(text: str, lang_code: str) -> str: | |
| global vi_en, zh_en | |
| if not text or not text.strip(): | |
| return text | |
| try: | |
| if lang_code == "vi": | |
| if vi_en is None: | |
| logger.info("[Translation] Loading Vietnamese-English model...") | |
| vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1) | |
| # Limit input length to prevent model issues | |
| input_text = text[:1000] if len(text) > 1000 else text | |
| raw = vi_en(input_text, max_length=512)[0]["translation_text"] | |
| cleaned = _dedupe_repeats(raw) | |
| norm = _normalize_and_cap(cleaned, cap=512) | |
| if _is_too_repetitive(norm) or len(norm.strip()) < 10: | |
| logger.warning("[En-Vi] Translation repetitive or too short; falling back to original text") | |
| return text | |
| logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {norm[:100]}...") | |
| return norm | |
| elif lang_code == "zh": | |
| if zh_en is None: | |
| logger.info("[Translation] Loading Chinese-English model...") | |
| zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1) | |
| # Limit input length to prevent model issues | |
| input_text = text[:1000] if len(text) > 1000 else text | |
| raw = zh_en(input_text, max_length=512)[0]["translation_text"] | |
| cleaned = _dedupe_repeats(raw) | |
| norm = _normalize_and_cap(cleaned, cap=512) | |
| if _is_too_repetitive(norm) or len(norm.strip()) < 10: | |
| logger.warning("[En-Zh] Translation repetitive or too short; falling back to original text") | |
| return text | |
| logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {norm[:100]}...") | |
| return norm | |
| except Exception as e: | |
| logger.error(f"[Translation] Translation failed for {lang_code}: {e}") | |
| return text # Fallback to original text | |
| return text | |