Spaces:

BinKhoaLe1812
/

Cooking_Tutor

Sleeping

App Files Files Community

Cooking_Tutor / utils /translation.py

LiamKhoaLe

Init commit

21446aa 3 months ago

raw

history blame contribute delete

5.28 kB

	# translation.py
	from transformers import pipeline
	import logging
	import re
	from collections import Counter

	# Module logger; "translation-agent" is the value rendered by %(name)s in the format below.
	logger = logging.getLogger("translation-agent")
	# NOTE(review): this runs at import time, and force=True makes basicConfig replace
	# whatever handlers are already attached to the root logger, so importing this
	# module resets the host application's logging setup — confirm that is intended.
	# Change INFO to DEBUG for full-ctx JSON loader.
	logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True)

	# Lazily created translation pipelines: both stay None until translate_query()
	# first needs the corresponding language, so no model is loaded at import time.
	vi_en = None
	zh_en = None

	def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
	"""Collapse excessive repeated n-grams and repeated phrases with improved logic."""
	if not s:
	return s

	# Collapse repeated spaces/newlines
	s = re.sub(r"\s+", " ", s).strip()

	# More aggressive repetition detection
	# Check for simple word repetition (like "a lot of people do not" repeated)
	words = s.split()
	if len(words) > 20: # Only check if text is long enough
	# Look for repeated sequences of 3-8 words
	for seq_len in range(8, 2, -1):
	if len(words) < seq_len * 3: # Need at least 3 repetitions
	continue

	# Check each possible starting position
	for start in range(len(words) - seq_len * 2):
	sequence = words[start:start + seq_len]
	# Count how many times this sequence repeats
	repeat_count = 1
	pos = start + seq_len
	while pos + seq_len <= len(words):
	if words[pos:pos + seq_len] == sequence:
	repeat_count += 1
	pos += seq_len
	else:
	break

	# If we found 3+ repetitions, remove the excess
	if repeat_count >= 3:
	# Keep only the first occurrence
	new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:]
	s = " ".join(new_words)
	words = s.split()
	break
	else:
	continue
	break # Break outer loop if we found and fixed a repetition

	# Additional cleanup for remaining patterns
	# Remove consecutive identical word
	tokens = s.split()
	out = []
	last = None
	for t in tokens:
	if last is None or t.lower() != last.lower():
	out.append(t)
	last = t
	s = " ".join(out)

	# Limit consecutive duplicate n-grams
	for n in range(n_max, n_min - 1, -1):
	pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
	s = pattern.sub(r"\1", s)

	return s


	def _normalize_and_cap(s: str, cap: int = 512) -> str:
	if not s:
	return s
	s = s.strip()
	if len(s) > cap:
	s = s[:cap]
	return s


	def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
	if not s:
	return False
	tokens = [t.lower() for t in s.split()]
	if len(tokens) < 10:
	return False
	counts = Counter(tokens)
	top = counts.most_common(1)[0][1]
	return (top / max(1, len(tokens))) >= threshold


	def _load_translator(lang_code: str):
	    """Return the translation pipeline for *lang_code*, creating it on first use.

	    The pipelines are kept in the module-level ``vi_en`` / ``zh_en`` globals so
	    each model is loaded at most once. Returns ``None`` for any language code
	    other than ``"vi"`` or ``"zh"``.
	    """
	    global vi_en, zh_en

	    if lang_code == "vi":
	        if vi_en is None:
	            logger.info("[Translation] Loading Vietnamese-English model...")
	            vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
	        return vi_en

	    if lang_code == "zh":
	        if zh_en is None:
	            logger.info("[Translation] Loading Chinese-English model...")
	            zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
	        return zh_en

	    return None


	def translate_query(text: str, lang_code: str) -> str:
	    """Translate a Vietnamese (``"vi"``) or Chinese (``"zh"``) query to English.

	    The model output is cleaned with ``_dedupe_repeats`` and capped at 512
	    characters with ``_normalize_and_cap``. The original *text* is returned
	    unchanged when:

	    * *text* is empty or whitespace-only,
	    * *lang_code* is neither ``"vi"`` nor ``"zh"``,
	    * the cleaned translation is judged repetitive or is shorter than 10
	      characters,
	    * loading or running the model raises any exception (logged, not raised).

	    Args:
	        text: The user query in the source language.
	        lang_code: Source language code.

	    Returns:
	        The English translation, or *text* itself on any fallback path.
	    """
	    if not text or not text.strip():
	        return text

	    try:
	        translator = _load_translator(lang_code)
	        if translator is None:
	            # Unsupported language: pass the query through untouched.
	            return text

	        is_vietnamese = lang_code == "vi"
	        # The tag names the real direction (source -> English).
	        tag = "[Vi-En]" if is_vietnamese else "[Zh-En]"

	        # Limit input length to prevent model issues
	        source = text[:1000]
	        # NOTE(review): the VietAI/envit5-translation model card shows inputs
	        # tagged with "vi: " and outputs tagged with "en: " — confirm against
	        # the card. The tag is only added when the text does not carry it yet.
	        if is_vietnamese and not source.lstrip().startswith("vi:"):
	            source = "vi: " + source

	        raw = translator(source, max_length=512)[0]["translation_text"]
	        if is_vietnamese and raw.lstrip().startswith("en:"):
	            # Drop the language tag so callers receive plain English text.
	            raw = raw.lstrip()[len("en:"):]

	        norm = _normalize_and_cap(_dedupe_repeats(raw), cap=512)

	        # NOTE(review): the 10-character floor also rejects short but valid
	        # translations (e.g. a one-word dish name); kept as in the original.
	        if _is_too_repetitive(norm) or len(norm.strip()) < 10:
	            logger.warning("%s Translation repetitive or too short; falling back to original text", tag)
	            return text

	        logger.info("%s Query in `%s` translated to: %s...", tag, lang_code, norm[:100])
	        return norm

	    except Exception as exc:
	        # Best-effort boundary: never let a model failure break the caller,
	        # but keep the traceback in the log for diagnosis.
	        logger.exception("[Translation] Translation failed for %s: %s", lang_code, exc)
	        return text  # Fallback to original text