Spaces:

Cong123779
/

tienhiep-api

Running

tienhiep-api / backend /engine /engine.py

Cong123779

deploy: update backend production to new Space

d9bfc2d 1 day ago

39.3 kB

	import os
	import sys
	import re
	import jieba.posseg as pseg

	from backend.config import Config

	# Matches a token made up solely of Arabic digits and/or Chinese numeral characters.
	NUM_RE = re.compile(r'^[0-9一二三四五六七八九十百千万几数多半两]+$')

	# Single-character punctuation tokens: the ASCII marks, then the CJK / full-width marks.
	PUNCT_SET = set(',.!?;:"\'()[]{}') | set('，。！？；：“”‘’（）【】《》、—～')


	class SimpleToken:
	    """Mutable token record: surface ``word``, part-of-speech ``tag``, and ``translated`` text.

	    ``flag`` is a read/write alias of ``tag``; it is the attribute name read
	    from the tokens that ``pseg.cut`` yields elsewhere in this module.
	    """

	    def __init__(self, word, tag):
	        self.word = word
	        self.tag = tag
	        # Stays None until a translation step assigns it.
	        self.translated = None

	    def _read_flag(self):
	        return self.tag

	    def _write_flag(self, value):
	        self.tag = value

	    flag = property(_read_flag, _write_flag, doc="Alias of ``tag``.")

	class VietphraseEngine:
	def __init__(self, config=None):
	    """Load the dictionaries, choose the translation mode and prime jieba.

	    Args:
	        config: Optional mapping. ``config["translation"]["mode"]`` selects
	            the mode (default ``"advanced"``); a truthy
	            ``config["translation"]["fast_mode"]`` forces ``"fast"``.
	            ``config["paths"]`` is read by ``load_dictionaries``.
	    """
	    self.config = config or {}
	    self.load_dictionaries()

	    # Resolve the translation mode; fast_mode wins over an explicit mode.
	    translation_cfg = self.config.get("translation", {})
	    if translation_cfg.get("fast_mode", False):
	        self.translation_mode = "fast"
	    else:
	        self.translation_mode = translation_cfg.get("mode", "advanced")

	    import jieba
	    try:
	        # Work around the segmenter mis-splitting overlapping words such as 重生于.
	        jieba.add_word("重生", tag="v")
	        for pair in (("生", "于"), ("着", "重"), ("醉", "人")):
	            jieba.suggest_freq(pair, True)
	        # School grades are tagged as common nouns rather than proper names (nr).
	        for grade in ("高一", "高二", "高三"):
	            jieba.add_word(grade, tag="n")
	    except Exception as e:
	        # Best effort: the engine still works without these tweaks.
	        print("Error initializing custom word splits in Jieba:", e)

	    # Both tokenizers are always set up so the mode can be switched per call.
	    self.jieba_tokenizer = jieba.dt
	    self.pseg_dict = pseg.dt.word_tag_tab
	    # Warm-up: push one word through each tokenizer.
	    list(self.jieba_tokenizer.cut("暖洋洋"))
	    list(pseg.cut("暖洋洋"))

	def load_dictionaries(self):
	    """Read all dictionary files, build the lookup tries and register names in jieba.

	    The dictionary directory is the parent directory of
	    ``config["paths"]["dictionaries"]["vietphrase"]``; a missing or relative
	    path is resolved against ``Config.ROOT_DIR`` (default
	    ``dictionaries/Vietphrase.txt``). Files are then looked up in that
	    directory by fixed base names.

	    Sets ``char_dict``, ``proper_names``, ``vietphrase``, ``vietphrase_trie``
	    and ``hanviet_trie`` on the instance.
	    """
	    dictionary_paths = self.config.get("paths", {}).get("dictionaries", {})
	    vp_path = dictionary_paths.get("vietphrase", "")
	    if not (vp_path and os.path.isabs(vp_path)):
	        vp_path = os.path.join(Config.ROOT_DIR, vp_path or "dictionaries/Vietphrase.txt")
	    dict_dir = os.path.dirname(vp_path)

	    def read_dictionary(stem):
	        """Return the text of ``<stem>.bin`` (XOR-obfuscated) or ``<stem>.txt``; "" if neither exists."""
	        obfuscated_path = os.path.join(dict_dir, stem + ".bin")
	        plain_path = os.path.join(dict_dir, stem + ".txt")

	        # The .bin variant takes precedence over the plain-text file.
	        if os.path.exists(obfuscated_path):
	            with open(obfuscated_path, "rb") as fh:
	                blob = fh.read()
	            key = "quick_translator_secret_key_2026".encode("utf-8")
	            # XOR every byte with the key, cycling the key over the data.
	            plain = bytes(b ^ key[pos % len(key)] for pos, b in enumerate(blob))
	            return plain.decode("utf-8")
	        if os.path.exists(plain_path):
	            with open(plain_path, "r", encoding="utf-8") as fh:
	                return fh.read()
	        return ""

	    print("Loading dictionaries in VietphraseEngine...")
	    self.char_dict = self.parse_dict_content(read_dictionary("HanViet_CharDict"))

	    # Hán Nôm fallback: fill in readings only for characters the main
	    # character dictionary does not already cover.
	    import csv
	    han_csv_path = os.path.join(dict_dir, "han_all_readings.csv")
	    if os.path.exists(han_csv_path):
	        try:
	            with open(han_csv_path, 'r', encoding='utf-8') as fh:
	                for row in csv.DictReader(fh):
	                    char = row.get("Ký_tự", "").strip()
	                    reading = row.get("Hán_Việt", "").strip()
	                    if char and reading and char not in self.char_dict:
	                        self.char_dict[char] = reading.replace("~", "")
	            print("Loaded han_all_readings.csv as fallback for missing Chinese characters.")
	        except Exception as e:
	            # Best effort: a bad CSV must not stop the engine from starting.
	            print("Could not load han_all_readings.csv:", e)

	    self.proper_names = self.parse_dict_content(
	        read_dictionary("Aligned_HanViet"), convert_to_simplified=True
	    )
	    self.vietphrase = self.parse_vietphrase_content(read_dictionary("Vietphrase"))
	    print("Dictionaries loaded successfully.")

	    # Tries used by the "vietphrase" and "hanviet" translation modes.
	    from .trie import Trie
	    print("Building Tries for fast translation modes...")
	    self.vietphrase_trie = Trie()
	    # Proper names go in with priority 1, Vietphrase entries with priority 2
	    # (described as the higher one by the original author; Trie semantics
	    # are defined in .trie and not visible here).
	    for source, priority in ((self.proper_names, 1), (self.vietphrase, 2)):
	        for key, value in source.items():
	            self.vietphrase_trie.insert(key, value, priority)

	    self.hanviet_trie = Trie()
	    for key, value in self.proper_names.items():
	        self.hanviet_trie.insert(key, value, 2)
	    print("Tries built successfully.")

	    # Make jieba aware of every proper name so the tokenizer keeps them whole.
	    import jieba
	    for name in self.proper_names:
	        jieba.add_word(name)

	def parse_dict_content(self, content, convert_to_simplified=False):
	dictionary = {}
	if content:
	to_simplified = lambda s: s
	if convert_to_simplified:
	try:
	from hanziconv import HanziConv
	to_simplified = HanziConv.toSimplified
	except ImportError:
	pass

	for line in content.splitlines():
	line = line.strip()
	if not line or "=" not in line or line.startswith('#'):
	continue
	parts = line.split("=", 1)
	key = parts[0].strip()
	val = self.clean_annotation(parts[1].strip())
	dictionary[to_simplified(key)] = val
	return dictionary

	def parse_vietphrase_content(self, content):
	dictionary = {}
	if content:
	for line in content.splitlines():
	line = line.strip()
	if not line or "=" not in line or line.startswith('#'):
	continue
	parts = line.split("=", 1)
	left = parts[0].strip()
	right = self.clean_annotation(parts[1].strip())

	if "," in left and "," in right:
	keys = [k.strip() for k in left.split(",") if k.strip()]
	vals = [v.strip() for v in right.split(",") if v.strip()]
	if len(keys) == len(vals):
	for k, v in zip(keys, vals):
	dictionary[k] = v
	continue
	if left:
	dictionary[left] = right
	return dictionary

	def is_number(self, word):
	    """Return True when ``word`` consists only of digits and/or Chinese numeral characters."""
	    numeral_pattern = r'^[0-9一二三四五六七八九十百千万几数多半两]+$'
	    return re.match(numeral_pattern, word) is not None

	def capitalize_phrase(self, phrase):
	    """Capitalize every alphabetic word in ``phrase``.

	    Each run of ASCII / Vietnamese letters is replaced by its
	    ``str.capitalize()`` form (first letter upper-case, the rest lower-case).
	    Digits, spaces and punctuation separate words and are left untouched.

	    Args:
	        phrase: Text to capitalize.

	    Returns:
	        The phrase with each word capitalized.
	    """
	    chars = 'a-zA-ZàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđĐ'
	    pattern = f'[{chars}]+'
	    # The class lists only lower-case accented letters, so the match must be
	    # case-insensitive; otherwise a leading capital such as "Â" is left out
	    # of the word and "Âu" is rewritten as "ÂU".
	    return re.sub(pattern, lambda m: m.group(0).capitalize(), phrase, flags=re.IGNORECASE)

	def clean_annotation(self, text, mode='vietphrase'):
	    """Strip dictionary annotations from a translation value.

	    Two kinds of markup are removed:

	    1. ``{meaning:reading}`` groups are replaced by ``meaning`` when
	       ``mode`` is ``'vietphrase'`` and by ``reading`` for any other mode;
	       a group without ``:`` is replaced by its content.
	    2. ``(*...)`` notes are deleted together with the whitespace in front
	       of them. Ordinary parentheses are kept.

	    Args:
	        text: Raw dictionary value (may be empty or None).
	        mode: ``'vietphrase'`` (default) keeps the meaning part of curly
	            groups; anything else keeps the reading part.

	    Returns:
	        The cleaned, stripped text; "" for empty input.
	    """
	    if not text:
	        return ""

	    def repl_curly(match):
	        meaning, separator, reading = match.group(1).partition(':')
	        if not separator:
	            return meaning.strip()
	        return meaning.strip() if mode == 'vietphrase' else reading.strip()

	    text = re.sub(r'\{([^{}]+)\}', repl_curly, text)

	    # The pattern previously stored here was corrupted (unbalanced
	    # parenthesis) and made re.sub raise on every non-empty value.
	    text = re.sub(r'\s*\(\*[^)]*\)', '', text)

	    return text.strip()

	def format_translation(self, raw_value, multi_option, word=None, prefer_hanviet=False):
	if not raw_value:
	return ""
	options = [o for o in raw_value.split("/") if o.strip()]

	# Deduplicate options while preserving order
	seen = set()
	deduped = []
	for o in options:
	if o not in seen:
	seen.add(o)
	deduped.append(o)

	if not deduped:
	return ""

	if multi_option and len(deduped) > 1:
	return f"{deduped[0]}[{'/'.join(deduped[1:])}]"

	# If multi-option is False, we have a word of length >= 2, and prefer_hanviet is True, prefer Hán Việt alignment
	if prefer_hanviet and word and len(word) >= 2 and len(deduped) > 1:
	hv_sets = []
	for char in word:
	readings = set()
	if char in self.char_dict:
	for r in self.char_dict[char].split('/'):
	r_clean = r.strip().lower()
	if r_clean:
	readings.add(r_clean)
	if readings:
	hv_sets.append(readings)

	best_option = deduped[0]
	best_score = -1

	for opt in deduped:
	opt_syllables = [w.strip().lower() for w in opt.split() if w.strip()]
	score = 0
	for r_set in hv_sets:
	if any(r in opt_syllables for r in r_set):
	score += 1
	if score > best_score:
	best_score = score
	best_option = opt
	if best_score > 0:
	return best_option

	return deduped[0]

	def clean_punctuation_spacing(self, text):
	    """Normalize the whitespace around punctuation and brackets.

	    Args:
	        text: Text to tidy (empty or None is returned unchanged).

	    Returns:
	        The text with punctuation spacing normalized, runs of whitespace
	        collapsed to a single space, and both ends stripped.
	    """
	    if not text:
	        return text

	    # Applied strictly in this order: each step runs on the previous output.
	    rewrite_steps = (
	        # A space after , ; . : ! ? unless a space, closing bracket or closing quote follows.
	        (r'([,;.:!?])(?=[^\s)\]}』】”"’])', r'\1 '),
	        # No whitespace in front of those marks.
	        (r'\s+([,;.:!?])', r'\1'),
	        # No whitespace just inside brackets.
	        (r'([(\[{『【«])\s+', r'\1'),
	        (r'\s+([)\]}』】»])', r'\1'),
	        # A space before an opening bracket / after a closing bracket that touches text.
	        (r'(?<=[^\s(\[{『【«])([(\[{『【«])', r' \1'),
	        (r'([)\]}』】»])(?=[^\s.,;:!?)\]}』】»])', r'\1 '),
	        # Separator dashes get exactly one space on each side.
	        (r'\s-\s', ' - '),
	        # Collapse whatever runs of whitespace remain.
	        (r'\s+', ' '),
	    )
	    for pattern, replacement in rewrite_steps:
	        text = re.sub(pattern, replacement, text)
	    return text.strip()

	def translate_sentence(self, sentence, multi_option=False, mode=None):
	if not sentence or sentence.isspace():
	return ""

	# If the sentence doesn't contain any Chinese characters or symbols, preserve it as-is
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df\u3000-\u303f\uff00-\uffef]', sentence):
	return sentence

	# Segment into Chinese text blocks and non-Chinese text blocks
	# Keep Chinese characters and Chinese specific punctuations in the translation segment
	chinese_pattern = re.compile(r'([\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]+)')
	parts = chinese_pattern.split(sentence)

	# Merge simple alphanumeric non-Chinese blocks into adjacent Chinese blocks
	i = 1
	while i < len(parts) - 1:
	non_chinese = parts[i+1]
	if re.match(r'^\s[a-zA-Z0-9]+\s$', non_chinese):
	parts[i] = parts[i] + non_chinese + parts[i+2]
	parts.pop(i+1)
	parts.pop(i+1)
	else:
	i += 2

	translated_parts = []
	capitalize_next = True

	for part in parts:
	if not part:
	continue
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df\u3000-\u303f\uff00-\uffef]', part):
	# Non-Chinese segment -> preserve exactly
	translated_parts.append(part)
	# Check if it ends with sentence terminator
	if re.search(r'[.!?]\s*$', part):
	capitalize_next = True
	elif part.strip():
	capitalize_next = False
	else:
	# Chinese segment -> translate
	trans = self._translate_pure_chinese_sentence(part, multi_option, mode, capitalize_first=capitalize_next)
	translated_parts.append(trans)
	# Check if it ends with sentence terminator
	if re.search(r'[.!?]\s$', part) or re.search(r'[.!?]\s$', trans):
	capitalize_next = True
	else:
	capitalize_next = False

	return "".join(translated_parts)

	def _translate_pure_chinese_sentence(self, sentence, multi_option=False, mode=None, capitalize_first=True):
	if not sentence or sentence.isspace():
	return ""

	active_mode = mode or self.translation_mode

	# Tokenization & Tagging depending on mode
	if active_mode in ("advanced", "advanced_hanviet"):
	raw_tokens = [SimpleToken(t.word, t.flag) for t in pseg.cut(sentence)]
	else:
	# "fast", "vietphrase", "hanviet" modes use the fast tokenizer
	words = list(self.jieba_tokenizer.cut(sentence))
	raw_tokens = []
	for w in words:
	if w in PUNCT_SET:
	tag = 'x'
	elif NUM_RE.match(w):
	tag = 'm'
	else:
	tag = self.pseg_dict.get(w, 'n')
	raw_tokens.append(SimpleToken(w, tag))

	if not raw_tokens:
	return ""

	NUM_KEYWORDS = {"重", "阶", "品", "级", "层", "剑", "星", "转", "天", "色", "关", "重天"}
	HANVIET_NUMBERS = {
	'0': 'Không', '1': 'Nhất', '2': 'Nhị', '3': 'Tam', '4': 'Tứ', '5': 'Ngũ', '6': 'Lục', '7': 'Thất', '8': 'Bát', '9': 'Cửu', '10': 'Thập',
	'一': 'Nhất', '二': 'Nhị', '三': 'Tam', '四': 'Tứ', '五': 'Ngũ', '六': 'Lục', '七': 'Thất', '八': 'Bát', '九': 'Cửu', '十': 'Thập',
	'百': 'Bách', '千': 'Thiên', '万': 'Vạn', '萬': 'Vạn', '几': 'Vài', '数': 'Số', '多': 'Đa', '半': 'Bán', '两': 'Lưỡng', '兩': 'Lưỡng'
	}
	# Helper function to translate a single token
	def translate_single_token(idx, tok, list_of_tokens):
	word = tok.word
	tag = tok.tag

	# Punctuation
	is_punct = (tag == 'x' or word in {',', '.', '!', '?', ';', ':', '"', '(', ')', '[', ']', '{', '}'})
	if is_punct:
	has_chinese = False
	for char in word:
	if char in self.char_dict:
	has_chinese = True
	break
	if not has_chinese:
	punct_map = {
	'，': ',', '。': '.', '「': '"', '」': '"', '、': ',', '？': '?', '！': '!',
	'：': ':', '；': ';', '“': '"', '”': '"', '（': '(', '）': ')'
	}
	tok.translated = punct_map.get(word, word)
	return

	# Rule for number + 人 (e.g. 几十人, 三人)
	if len(word) > 1 and word.endswith('人') and self.is_number(word[:-1]):
	num_part = word[:-1]
	if num_part in self.vietphrase:
	num_trans = self.format_translation(self.vietphrase[num_part], multi_option, num_part)
	else:
	num_trans = " ".join([self.char_dict.get(c, c).split("/")[0] for c in num_part])
	tok.translated = f"{num_trans} người"
	return

	# Special rule for 了 (le vs liao)
	if word == 'l' or word == '了':
	is_at_end = True
	for next_tok in list_of_tokens[idx+1:]:
	if next_tok.word in {'"', '\'', '(', ')', '[', ']', '{', '}', '“', '”', '‘', '’', '（', '）', '【', '】', '《', '》'}:
	continue
	if next_tok.word in {',', '.', '!', '?', ';', ':', '，', '。', '！', '？', '；', '：', '、'}:
	is_at_end = True
	break
	is_at_end = False
	break
	if is_at_end:
	tok.translated = "rồi"
	else:
	tok.translated = "được"
	return

	# Cultivation Realm (cultivation)
	if tag == 'cultivation':
	result = []
	for char in word:
	if char in HANVIET_NUMBERS:
	result.append(HANVIET_NUMBERS[char])
	else:
	cap_val = self.char_dict.get(char, char).split("/")[0].capitalize()
	result.append(cap_val)
	tok.translated = " ".join(result)
	return

	# Determine if it's a noun or an adjective
	is_proper = (tag in {'nr', 'ns', 'nt'} if tag else False)
	is_noun = (tag.startswith('n') if tag else False) or tag in {'n', 'nz', 'ng'} if tag else False
	is_adj = tag in {'a', 'b', 'ad', 'an', 'z'} if tag else False
	is_noun_or_adj = is_proper or is_noun or is_adj

	# --- Chốt chặn cuối cùng cho Tên riêng (Proper Names Guard) ---
	if is_proper:
	if word in self.proper_names:
	tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
	else:
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
	tok.translated = word
	else:
	result = []
	for char in word:
	val = self.char_dict.get(char, char).split("/")[0]
	result.append(val)
	tok.translated = " ".join(result)
	if tok.translated:
	tok.translated = self.capitalize_phrase(tok.translated)

	# --- Translate lookup strategy depending on active_mode (for non-proper names) ---
	else:
	if active_mode == 'hanviet':
	# Mode 4: Pure Hán Việt (NO Vietphrase)
	if word in self.proper_names:
	tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
	else:
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
	tok.translated = word
	else:
	result = []
	for char in word:
	val = self.char_dict.get(char, char).split("/")[0]
	result.append(val)
	tok.translated = " ".join(result)

	elif active_mode == 'vietphrase':
	# Mode 3: Prioritize Vietphrase (Traditional)
	if word in self.vietphrase:
	tok.translated = self.format_translation(self.vietphrase[word], multi_option, word, prefer_hanviet=False)
	elif word in self.proper_names:
	tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
	else:
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
	tok.translated = word
	else:
	result = []
	for char in word:
	val = self.char_dict.get(char, char).split("/")[0]
	result.append(val)
	tok.translated = " ".join(result)

	else:
	# Modes 1, 2 & 5: 'fast', 'advanced', or 'advanced_hanviet' (POS-based noun/adjective Hán Việt override)
	if is_noun_or_adj:
	# Nouns/Adjectives: Bypasses vietphrase
	if word in self.proper_names:
	tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
	else:
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
	tok.translated = word
	else:
	result = []
	for char in word:
	val = self.char_dict.get(char, char).split("/")[0]
	result.append(val)
	tok.translated = " ".join(result)
	else:
	# Verbs and other parts of speech
	if active_mode == 'advanced_hanviet':
	# Prefer HanViet dictionary (proper_names) over Vietphrase
	if word in self.proper_names:
	tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
	elif word in self.vietphrase:
	tok.translated = self.format_translation(self.vietphrase[word], multi_option, word, prefer_hanviet=False)
	else:
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
	tok.translated = word
	else:
	result = []
	for char in word:
	val = self.char_dict.get(char, char).split("/")[0]
	result.append(val)
	tok.translated = " ".join(result)
	else:
	# Standard fast/advanced: vietphrase -> proper_names -> character fallback
	if word in self.vietphrase:
	tok.translated = self.format_translation(self.vietphrase[word], multi_option, word, prefer_hanviet=False)
	elif word in self.proper_names:
	tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
	else:
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
	tok.translated = word
	else:
	result = []
	for char in word:
	val = self.char_dict.get(char, char).split("/")[0]
	result.append(val)
	tok.translated = " ".join(result)

	# Strip trailing "đích" / "Đích" from modifier translations
	if tok.translated and word.endswith('的') and len(word) > 1:
	val = tok.translated
	if val.lower().endswith(' đích'):
	tok.translated = val[:-5]
	elif val.lower().endswith('đích'):
	tok.translated = val[:-4]

	# Step 1: Group numeral phrases and cultivation terms FIRST
	grouped = []
	i = 0
	while i < len(raw_tokens):
	tok = raw_tokens[i]
	word = tok.word
	tag = tok.tag

	if self.is_number(word) and i + 1 < len(raw_tokens) and raw_tokens[i+1].word in NUM_KEYWORDS:
	grouped_word = word + raw_tokens[i+1].word
	i_next = i + 2
	if i_next < len(raw_tokens) and raw_tokens[i_next].tag in {'n', 'nr', 'ns', 'nt', 'nz'}:
	grouped_word += raw_tokens[i_next].word
	i_next += 1
	grouped.append(SimpleToken(grouped_word, 'cultivation'))
	i = i_next
	else:
	grouped.append(SimpleToken(word, tag))
	i += 1

	# Step 2: Translate individual tokens on the cultivation-grouped tokens
	for idx, tok in enumerate(grouped):
	translate_single_token(idx, tok, grouped)

	# Step 3: Greedy merge adjacent tokens if their combination exists in dictionaries
	i = 0
	merged = []
	while i < len(grouped):
	matched = False
	for length in range(min(4, len(grouped) - i), 1, -1):
	combined_word = "".join([grouped[i+k].word for k in range(length)])

	# Prevent merging across '的' particle to preserve root Hán Việt translation and allow reordering
	should_skip = False
	if 'đích' in combined_word or '的' in combined_word and combined_word.find('的') > 0:
	should_skip = True
	elif i + length < len(grouped) and grouped[i+length].word == '的':
	# If next token is 'de' (de/的), don't merge if it would swallow a pronoun/noun/verb
	last_tok = grouped[i+length-1]
	if last_tok.flag in {'r', 'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v'}:
	should_skip = True
	elif '是' in combined_word and any(p in combined_word for p in {'我', '你', 'he', 'she', 'it', '们', '您', '自己'}):
	# Prevent merging copula + pronoun phrases (like '这是他', '那是我') to allow proper clause reordering
	should_skip = True

	# Dict check strategy depends on active_mode
	if active_mode == 'hanviet':
	in_dicts = (combined_word in self.proper_names)
	else:
	in_dicts = (combined_word in self.vietphrase or combined_word in self.proper_names)

	if not should_skip and in_dicts:
	combined_tag = None
	try:
	cut_res = list(pseg.cut(combined_word))
	if cut_res:
	combined_tag = cut_res[0].flag
	except Exception:
	pass
	if not combined_tag:
	combined_tag = grouped[i].flag
	for k in range(length):
	if grouped[i+k].flag in {'nr', 'ns', 'nt', 'nz'}:
	combined_tag = grouped[i+k].flag
	break
	new_tok = SimpleToken(combined_word, combined_tag)
	# Translate the new merged token immediately
	translate_single_token(0, new_tok, [new_tok])
	merged.append(new_tok)
	i += length
	matched = True
	break
	if not matched:
	merged.append(grouped[i])
	i += 1

	# Step 4: Reordering Grammar Rules
	if active_mode != 'hanviet':
	# Pass 1: Adjective + Noun reordering
	changed = True
	while changed:
	changed = False
	i = 0
	while i < len(merged) - 1:
	t_a = merged[i]
	t_n = merged[i+1]

	# Do not swap with prepositions/conjunctions/copulas/particles
	if t_n.word in {'跟', '和', '与', '與', '同', '在', '从', '從', '自', '由', '向', '往', '朝', '对', '對', '给', '給', '比', '是', '叫', '让', '讓', '被', '把', '使', '令', '到', '了', '的', '而', '&', '并', '並', '以', '或', '者'}:
	i += 1
	continue

	if (t_a.tag in {'a', 'b'} or (t_a.word.endswith('的') and t_a.word != '的')) and t_n.tag in {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v', 'vd', 'vg', 'vi', 'vn'}:
	combined = t_n.translated + " " + t_a.translated
	new_tok = SimpleToken(t_a.word + t_n.word, t_n.tag)
	new_tok.translated = combined
	merged[i:i+2] = [new_tok]
	changed = True
	break
	i += 1

	# Pass 2: "的" reordering (with multi-token noun/verb phrase lookahead)
	NOUN_PHRASE_TAGS = {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'a', 'b', 'm', 'q', 'j', 'i'}
	LOOKAHEAD_TAGS = NOUN_PHRASE_TAGS \| {'v', 'vd', 'vg', 'vi', 'vn'}
	i = 1
	while i < len(merged) - 1:
	tok = merged[i]
	if tok.word in {'de', '的'}:
	t_x = merged[i-1]
	# Scan forward to collect all consecutive noun or verb phrase tokens
	k = i + 1
	has_noun = False
	while k < len(merged):
	tok_k = merged[k]
	# Stop collecting if we hit a locality word/orientation noun
	if tok_k.word in {'下', '上', '中', '里', '外', '内', '內', '后', '後', '前', '旁', '侧', '側', '底', '间', '間'}:
	break

	# If we already encountered a noun/verb in the phrase,
	# we cannot have a subsequent adjective modifying that noun from the right.
	if has_noun and tok_k.tag in {'a', 'b'}:
	break

	# Do not collect a verb tag if we already have a noun/verb head
	is_verb_tag = tok_k.tag in {'v', 'vd', 'vg', 'vi', 'vn'}
	if has_noun and is_verb_tag:
	break

	if tok_k.tag in LOOKAHEAD_TAGS or tok_k.word == '色':
	if tok_k.tag in {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v', 'vd', 'vg', 'vi', 'vn'}:
	has_noun = True
	k += 1
	else:
	break

	is_verb_modifier = t_x.tag in {'v', 'vd', 'vg', 'vi', 'vn'}

	# If we collected at least one token AND the modifier is not a verb clause
	if k > i + 1 and not is_verb_modifier:
	y_tokens = merged[i+1:k]
	y_translated = " ".join([t.translated for t in y_tokens if t.translated])
	y_word = "".join([t.word for t in y_tokens])

	if t_x.tag != 'x':
	start_idx = i - 1
	j_back = i - 2
	while j_back >= 0:
	tag_back = merged[j_back].tag
	if tag_back in {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'a', 'b', 'm', 'q', 'j', 'i', 's', 't'}:
	start_idx = j_back
	j_back -= 1
	else:
	break

	modifier_tokens = merged[start_idx:i]
	modifier_translated = " ".join([t.translated for t in modifier_tokens if t.translated])
	modifier_word = "".join([t.word for t in modifier_tokens])

	is_proper_or_pronoun = (
	t_x.tag in {'nr', 'r'}
	)
	is_noun_modifier = is_proper_or_pronoun and not t_x.word.endswith('色')
	if is_noun_modifier and start_idx == i - 1:
	combined = y_translated + " của " + modifier_translated
	else:
	combined = y_translated + " " + modifier_translated

	new_tok = SimpleToken(modifier_word + tok.word + y_word, 'n')
	new_tok.translated = combined
	merged[start_idx:k] = [new_tok]
	continue
	else:
	# If we didn't reorder, set the '的' translation to empty string to avoid translating as 'đấy' / 'đích'
	tok.translated = ""
	i += 1

	# Join words
	translated_text = " ".join([t.translated for t in merged if t.translated])

	# Clean spacing and punctuation
	translated_text = self.clean_punctuation_spacing(translated_text)

	# Capitalize sentences
	sentences = re.split(r'([.!?]\s*)', translated_text)
	start_idx = 0 if capitalize_first else 1
	for idx in range(start_idx, len(sentences)):
	s = sentences[idx]
	if s and not s.isspace() and not s[0] in {'.', '!', '?'}:
	for c_idx, char in enumerate(s):
	if char.isalpha():
	sentences[idx] = s[:c_idx] + char.upper() + s[c_idx+1:]
	break
	return "".join(sentences).strip()

	def translate_paragraph(self, paragraph, multi_option=False, mode=None):
	if not paragraph or paragraph.isspace():
	return paragraph

	active_mode = mode or self.translation_mode

	if active_mode in ('vietphrase', 'hanviet'):
	# Ultra-fast Trie-based translation path (50M+ characters/minute)
	trie = self.vietphrase_trie if active_mode == 'vietphrase' else self.hanviet_trie
	prefer_hanviet = (active_mode == 'hanviet')

	i = 0
	text_length = len(paragraph)
	result_words = []

	while i < text_length:
	length, translation, priority = trie.search_longest_match(paragraph, i)
	if length > 0:
	word = paragraph[i:i+length]
	formatted = self.format_translation(translation, multi_option, word, prefer_hanviet=prefer_hanviet)
	# Capitalize if it is a proper name
	if priority == 1 or word in self.proper_names:
	formatted = self.capitalize_phrase(formatted)
	result_words.append(formatted)
	i += length
	else:
	char = paragraph[i]
	if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', char):
	punct_map = {
	'，': ',', '。': '.', '「': '"', '」': '"', '、': ',', '？': '?', '！': '!',
	'：': ':', '；': ';', '“': '"', '”': '"', '（': '(', '）': ')',
	'『': '"', '』': '"', '【': '[', '】': ']'
	}
	result_words.append(punct_map.get(char, char))
	else:
	val = self.char_dict.get(char, char).split("/")[0]
	result_words.append(val)
	i += 1

	translated_text = " ".join(result_words)
	translated_text = self.clean_punctuation_spacing(translated_text)

	# Sentence Capitalization
	sentences = re.split(r'([.!?]\s*)', translated_text)
	for idx in range(len(sentences)):
	s = sentences[idx]
	if s and not s.isspace() and not s[0] in {'.', '!', '?'}:
	for c_idx, char in enumerate(s):
	if char.isalpha():
	sentences[idx] = s[:c_idx] + char.upper() + s[c_idx+1:]
	break
	return "".join(sentences).strip()

	# For advanced & fast modes, use normal sentence splitting & tokenization
	sentence_ends = re.compile(r'([。！？!?]+)')
	parts = sentence_ends.split(paragraph)

	translated_parts = []
	for part in parts:
	if not part:
	continue
	if sentence_ends.match(part):
	punct_map = {
	'。': '.', '！': '!', '？': '?', '，': ','
	}
	translated_parts.append(punct_map.get(part, part))
	else:
	translated_parts.append(self.translate_sentence(part, multi_option, mode=mode))

	return self.clean_punctuation_spacing("".join(translated_parts))

	def translate_text_node(self, text, multi_option=False, mode=None):
	"""
	Dich mot text node tu DOM.
	BAO TOAN HOAN TOAN cau truc: xuong dong \n, khoang trang dau/cuoi tung dong.
	"""
	if not text:
	return text

	# Tach theo \n truoc -> dich tung dong doc lap -> gop lai
	lines = text.split('\n')
	translated_lines = []
	for line in lines:
	leading = re.match(r'^\s*', line).group(0)
	trailing = re.search(r'\s*$', line).group(0)
	body = line.strip()

	if not body:
	translated_lines.append(line) # dong rong -> giu nguyen
	elif not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', body):
	translated_lines.append(line) # khong co chu Han -> giu nguyen
	else:
	translated_body = self.translate_paragraph(body, multi_option, mode=mode)
	translated_lines.append(leading + translated_body + trailing)

	return '\n'.join(translated_lines)

	def translate(self, text, multi_option=False, mode=None):
	return self.translate_text_node(text, multi_option=multi_option, mode=mode)