Cooking_Tutor / utils /translation.py
LiamKhoaLe's picture
Init commit
21446aa
# translation.py
from transformers import pipeline
import logging
import re
from collections import Counter
# Module-level logger shared by every helper in this file.
logger = logging.getLogger("translation-agent")
# NOTE: this configures the ROOT logger at import time; per the logging docs,
# force=True removes and closes any handlers already attached to the root
# logger before applying this configuration.
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
# Lazily loaded translation pipelines: both start as None and are populated by
# translate_query() the first time the corresponding language is requested.
vi_en = None
zh_en = None
def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
"""Collapse excessive repeated n-grams and repeated phrases with improved logic."""
if not s:
return s
# Collapse repeated spaces/newlines
s = re.sub(r"\s+", " ", s).strip()
# More aggressive repetition detection
# Check for simple word repetition (like "a lot of people do not" repeated)
words = s.split()
if len(words) > 20: # Only check if text is long enough
# Look for repeated sequences of 3-8 words
for seq_len in range(8, 2, -1):
if len(words) < seq_len * 3: # Need at least 3 repetitions
continue
# Check each possible starting position
for start in range(len(words) - seq_len * 2):
sequence = words[start:start + seq_len]
# Count how many times this sequence repeats
repeat_count = 1
pos = start + seq_len
while pos + seq_len <= len(words):
if words[pos:pos + seq_len] == sequence:
repeat_count += 1
pos += seq_len
else:
break
# If we found 3+ repetitions, remove the excess
if repeat_count >= 3:
# Keep only the first occurrence
new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:]
s = " ".join(new_words)
words = s.split()
break
else:
continue
break # Break outer loop if we found and fixed a repetition
# Additional cleanup for remaining patterns
# Remove consecutive identical word
tokens = s.split()
out = []
last = None
for t in tokens:
if last is None or t.lower() != last.lower():
out.append(t)
last = t
s = " ".join(out)
# Limit consecutive duplicate n-grams
for n in range(n_max, n_min - 1, -1):
pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
s = pattern.sub(r"\1", s)
return s
def _normalize_and_cap(s: str, cap: int = 512) -> str:
if not s:
return s
s = s.strip()
if len(s) > cap:
s = s[:cap]
return s
def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
if not s:
return False
tokens = [t.lower() for t in s.split()]
if len(tokens) < 10:
return False
counts = Counter(tokens)
top = counts.most_common(1)[0][1]
return (top / max(1, len(tokens))) >= threshold
def _finalize_translation(raw: str, original: str, lang_code: str, tag: str) -> str:
    """Clean raw model output, or return *original* if the result looks degenerate."""
    norm = _normalize_and_cap(_dedupe_repeats(raw), cap=512)
    if _is_too_repetitive(norm) or len(norm.strip()) < 10:
        logger.warning(f"[{tag}] Translation repetitive or too short; falling back to original text")
        return original
    logger.info(f"[{tag}] Query in `{lang_code}` translated to: {norm[:100]}...")
    return norm


def translate_query(text: str, lang_code: str) -> str:
    """Translate a Vietnamese (``"vi"``) or Chinese (``"zh"``) query to English.

    The matching translation pipeline is created on first use and cached in
    the module-level ``vi_en`` / ``zh_en`` slots. The model output is
    de-duplicated and capped at 512 characters. This function is
    best-effort and never raises for translation problems: the original
    text is returned instead.

    Args:
        text: The user query. Empty or whitespace-only values are returned
            unchanged.
        lang_code: Source language code. Anything other than ``"vi"`` or
            ``"zh"`` leaves the text untouched.

    Returns:
        The cleaned English translation, or *text* itself when the language
        is unsupported, the result is repetitive or shorter than 10
        characters, or loading/inference fails.
    """
    global vi_en, zh_en
    if not text or not text.strip():
        return text
    try:
        # NOTE(review): the log tags read "En-Vi"/"En-Zh" although the
        # direction is vi->en / zh->en; kept as-is for log compatibility.
        if lang_code == "vi":
            if vi_en is None:
                logger.info("[Translation] Loading Vietnamese-English model...")
                # device=-1 runs the pipeline on CPU.
                vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
            translator, tag = vi_en, "En-Vi"
        elif lang_code == "zh":
            if zh_en is None:
                logger.info("[Translation] Loading Chinese-English model...")
                zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
            translator, tag = zh_en, "En-Zh"
        else:
            # Unsupported language: pass the query through unchanged.
            return text
        # Limit input length to prevent model issues.
        raw = translator(text[:1000], max_length=512)[0]["translation_text"]
        return _finalize_translation(raw, text, lang_code, tag)
    except Exception as e:
        # Best-effort boundary: keep the traceback in the log so load and
        # inference failures can be diagnosed, then fall back to the input.
        logger.exception(f"[Translation] Translation failed for {lang_code}: {e}")
        return text  # Fallback to original text