tienhiep-api / backend /engine /engine.py
Cong123779
deploy: update backend production to new Space
d9bfc2d
Raw
History Blame Contribute Delete
39.3 kB
import os
import sys
import re
import jieba.posseg as pseg
from backend.config import Config
# Matches a token that consists only of ASCII digits and/or Chinese numeral /
# quantity characters. Used by the fast tokenizer path to tag a token as 'm'.
NUM_RE = re.compile(r'^[0-9一二三四五六七八九十百千万几数多半两]+$')
# Tokens treated as punctuation; the fast tokenizer path tags them as 'x'.
# NOTE(review): the second row repeats several ASCII entries from the first row;
# presumably these were full-width forms originally — verify against the repo.
PUNCT_SET = {',', '.', '!', '?', ';', ':', '"', '\'', '(', ')', '[', ']', '{', '}',
',', '。', '!', '?', ';', ':', '“', '”', '‘', '’', '(', ')', '【', '】', '《', '》', '、', '—', '~'}
class SimpleToken:
    """Minimal mutable token: surface form, POS tag and its translation.

    ``flag`` is a read/write alias of ``tag`` so the object can be used where
    a jieba-style ``.flag`` attribute is expected.
    """

    def __init__(self, word, tag):
        self.word, self.tag = word, tag
        # Filled in later by the translation pipeline.
        self.translated = None

    def _read_flag(self):
        return self.tag

    def _write_flag(self, value):
        self.tag = value

    flag = property(_read_flag, _write_flag)
class VietphraseEngine:
def __init__(self, config=None):
    """Create the engine: load dictionaries, choose the mode, prime jieba.

    Args:
        config: optional nested dict. ``config["translation"]["mode"]``
            selects the default mode ("advanced" when absent); a truthy
            ``config["translation"]["fast_mode"]`` forces "fast".
    """
    self.config = config or {}
    self.load_dictionaries()

    # Resolve the default translation mode; "fast_mode" wins over "mode".
    chosen_mode = self.config.get("translation", {}).get("mode", "advanced")
    if self.config.get("translation", {}).get("fast_mode", False):
        chosen_mode = "fast"
    self.translation_mode = chosen_mode

    import jieba
    try:
        # Keep the segmenter from splitting overlapping words such as 重生于.
        jieba.add_word("重生", tag="v")
        for char_pair in (("生", "于"), ("着", "重"), ("醉", "人")):
            jieba.suggest_freq(char_pair, True)
        # School grades should be tagged as plain nouns, not person names (nr).
        for grade in ("高一", "高二", "高三"):
            jieba.add_word(grade, tag="n")
    except Exception as e:
        print("Error initializing custom word splits in Jieba:", e)

    # Both tokenizers are always set up so the mode can be switched per call.
    self.jieba_tokenizer = jieba.dt
    self.pseg_dict = pseg.dt.word_tag_tab
    # Warm-up: run each cutter once on a throwaway word.
    for cutter in (self.jieba_tokenizer.cut, pseg.cut):
        list(cutter("暖洋洋"))
def load_dictionaries(self):
    """Load all dictionaries from disk and build the lookup structures.

    The dictionary directory is the directory of the configured Vietphrase
    path (``config["paths"]["dictionaries"]["vietphrase"]``); relative or
    missing paths are resolved against ``Config.ROOT_DIR``.

    Populates ``self.char_dict``, ``self.proper_names``, ``self.vietphrase``,
    ``self.vietphrase_trie`` and ``self.hanviet_trie``, and registers every
    proper name with jieba. A missing dictionary file yields an empty dict.
    """
    paths = self.config.get("paths", {}).get("dictionaries", {})
    vp_path = paths.get("vietphrase", "")
    if not vp_path or not os.path.isabs(vp_path):
        vp_path = os.path.join(Config.ROOT_DIR, vp_path or "dictionaries/Vietphrase.txt")
    dict_dir = os.path.dirname(vp_path)

    def load_file_content(base_name):
        """Return the text of ``base_name``: XOR-obfuscated .bin first, then .txt, else ""."""
        bin_file = os.path.join(dict_dir, base_name + ".bin")
        txt_file = os.path.join(dict_dir, base_name + ".txt")
        if os.path.exists(bin_file):
            with open(bin_file, "rb") as f:
                data = f.read()
            # Repeating-key XOR with a key embedded in the source: this is
            # obfuscation only, not real encryption.
            key_bytes = "quick_translator_secret_key_2026".encode("utf-8")
            key_len = len(key_bytes)
            repeated_key = (key_bytes * (len(data) // key_len + 1))[:len(data)]
            decrypted = bytes(a ^ b for a, b in zip(data, repeated_key))
            return decrypted.decode("utf-8")
        if os.path.exists(txt_file):
            with open(txt_file, "r", encoding="utf-8") as f:
                return f.read()
        return ""

    print("Loading dictionaries in VietphraseEngine...")
    self.char_dict = self.parse_dict_content(load_file_content("HanViet_CharDict"))

    # Fallback readings for characters missing from the main character dict.
    import csv
    han_csv_path = os.path.join(dict_dir, "han_all_readings.csv")
    if os.path.exists(han_csv_path):
        try:
            # utf-8-sig: a BOM would otherwise be glued to the first header
            # name, so the "Ký_tự" column would never be found.
            # newline='' is what the csv module expects for file objects.
            with open(han_csv_path, 'r', encoding='utf-8-sig', newline='') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    # DictReader fills missing cells of short rows with None;
                    # "or ''" keeps one bad row from aborting the whole file.
                    char = (row.get("Ký_tự") or "").strip()
                    hv = (row.get("Hán_Việt") or "").strip()
                    if char and hv and char not in self.char_dict:
                        self.char_dict[char] = hv.replace("~", "")
            print("Loaded han_all_readings.csv as fallback for missing Chinese characters.")
        except Exception as e:
            # Best effort: the fallback file is optional.
            print("Could not load han_all_readings.csv:", e)

    self.proper_names = self.parse_dict_content(load_file_content("Aligned_HanViet"), convert_to_simplified=True)
    vp_content = load_file_content("Vietphrase")
    self.vietphrase = self.parse_vietphrase_content(vp_content)
    print("Dictionaries loaded successfully.")

    # Build Tries for the "vietphrase" and "hanviet" modes.
    from .trie import Trie
    print("Building Tries for fast translation modes...")
    self.vietphrase_trie = Trie()
    # Proper names go in with priority 1, Vietphrase entries with priority 2.
    # NOTE(review): the original comments call 2 the higher priority; the
    # Trie implementation is not visible here — confirm.
    for k, v in self.proper_names.items():
        self.vietphrase_trie.insert(k, v, 1)
    for k, v in self.vietphrase.items():
        self.vietphrase_trie.insert(k, v, 2)
    self.hanviet_trie = Trie()
    # The Hán Việt trie holds proper names only (priority 2).
    for k, v in self.proper_names.items():
        self.hanviet_trie.insert(k, v, 2)
    print("Tries built successfully.")

    # Register proper names with jieba so the segmenter keeps them whole.
    import jieba
    for name in self.proper_names:
        jieba.add_word(name)
def parse_dict_content(self, content, convert_to_simplified=False):
    """Parse ``key=value`` dictionary text into a dict.

    Blank lines, lines without ``=`` and lines starting with ``#`` are
    skipped. Values are passed through ``self.clean_annotation``. When
    ``convert_to_simplified`` is true and ``hanziconv`` is importable, keys
    are converted with ``HanziConv.toSimplified``; otherwise keys are kept.

    Args:
        content: the raw dictionary text (may be empty or None).
        convert_to_simplified: whether to normalise keys to Simplified Chinese.

    Returns:
        dict mapping key -> cleaned value; later lines overwrite earlier ones.
    """
    entries = {}
    if not content:
        return entries

    def normalize_key(raw_key):
        return raw_key

    if convert_to_simplified:
        try:
            from hanziconv import HanziConv
        except ImportError:
            # Optional dependency: fall back to the identity mapping.
            pass
        else:
            normalize_key = HanziConv.toSimplified

    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#') or "=" not in entry:
            continue
        lhs, _, rhs = entry.partition("=")
        entries[normalize_key(lhs.strip())] = self.clean_annotation(rhs.strip())
    return entries
def parse_vietphrase_content(self, content):
    """Parse Vietphrase ``key=value`` text into a dict.

    Blank lines, lines without ``=`` and ``#`` lines are skipped; values go
    through ``self.clean_annotation``. When both sides contain commas and
    split into the same number of non-empty pieces, each piece pair becomes
    its own entry; otherwise the whole left side maps to the whole right
    side (entries with an empty left side are dropped).

    Args:
        content: the raw dictionary text (may be empty or None).

    Returns:
        dict mapping source phrase -> cleaned translation.
    """
    table = {}
    for raw_line in (content or "").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#') or "=" not in entry:
            continue
        lhs, _, rhs = entry.partition("=")
        left = lhs.strip()
        right = self.clean_annotation(rhs.strip())
        if "," in left and "," in right:
            keys = [piece.strip() for piece in left.split(",") if piece.strip()]
            vals = [piece.strip() for piece in right.split(",") if piece.strip()]
            if len(keys) == len(vals):
                table.update(zip(keys, vals))
                continue
        if left:
            table[left] = right
    return table
def is_number(self, word):
    """Return True when ``word`` is made only of digits / Chinese numeral characters."""
    numeral_pattern = r'^[0-9一二三四五六七八九十百千万几数多半两]+$'
    return re.match(numeral_pattern, word) is not None
def capitalize_phrase(self, phrase):
    """Capitalize every Latin/Vietnamese word in ``phrase``.

    Each run of letters is replaced by ``str.capitalize()`` of itself (first
    letter upper-cased, the rest lower-cased); all other characters are
    left untouched.

    Args:
        phrase: text to capitalize.

    Returns:
        The phrase with each word capitalized.
    """
    chars = 'a-zA-ZàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđĐ'
    pattern = f'[{chars}]+'
    # The class lists only lowercase accented letters. Without IGNORECASE an
    # uppercase accented letter ends the run, so an already-capitalised word
    # such as "Âu" was matched from "u" onward and came out as "ÂU".
    return re.sub(pattern, lambda m: m.group(0).capitalize(), phrase, flags=re.IGNORECASE)
def clean_annotation(self, text, mode='vietphrase'):
    """Strip dictionary annotations from a raw translation value.

    * ``{meaning:reading}`` keeps the part before the colon when ``mode`` is
      ``'vietphrase'`` and the part after it otherwise; ``{text}`` without a
      colon becomes ``text``.
    * ``(*...)`` notes, with any whitespace before them, are removed.

    Args:
        text: the raw value (may be empty or None).
        mode: which side of a ``{a:b}`` pair to keep.

    Returns:
        The cleaned, stripped string ("" for empty input).
    """
    if not text:
        return ""

    def pick_side(match):
        inner = match.group(1)
        meaning, colon, reading = inner.partition(':')
        if not colon:
            return inner.strip()
        chosen = meaning if mode == 'vietphrase' else reading
        return chosen.strip()

    without_braces = re.sub(r'\{([^{}]+)\}', pick_side, text)
    without_notes = re.sub(r'\s*\(\*[^)]*\)', '', without_braces)
    return without_notes.strip()
def format_translation(self, raw_value, multi_option, word=None, prefer_hanviet=False):
    """Choose (or list) one translation out of a ``/``-separated value.

    Args:
        raw_value: dictionary value; alternatives are separated by ``/``.
        multi_option: when true and several alternatives exist, return
            ``first[second/third/...]``.
        word: the source word; used only for Hán Việt alignment.
        prefer_hanviet: when true (and ``word`` has at least 2 characters and
            several alternatives exist), prefer the alternative whose
            syllables match the most per-character readings in
            ``self.char_dict``.

    Returns:
        The selected translation, or "" when there is nothing usable.
    """
    if not raw_value:
        return ""

    # Drop blank alternatives and duplicates while keeping first-seen order.
    candidates = list(dict.fromkeys(piece for piece in raw_value.split("/") if piece.strip()))
    if not candidates:
        return ""

    primary = candidates[0]
    several = len(candidates) > 1
    if multi_option and several:
        return f"{primary}[{'/'.join(candidates[1:])}]"

    if not (prefer_hanviet and word and len(word) >= 2 and several):
        return primary

    # One set of lower-cased readings per source character that has any.
    reading_sets = []
    for han_char in word:
        if han_char not in self.char_dict:
            continue
        readings = {r.strip().lower() for r in self.char_dict[han_char].split('/') if r.strip()}
        if readings:
            reading_sets.append(readings)

    def alignment_score(option):
        syllables = [s.strip().lower() for s in option.split() if s.strip()]
        return sum(1 for readings in reading_sets if any(r in syllables for r in readings))

    # max() returns the first of several equally good alternatives.
    best = max(candidates, key=alignment_score)
    return best if alignment_score(best) > 0 else primary
def clean_punctuation_spacing(self, text):
    """Normalise whitespace around punctuation, brackets and hyphens.

    The substitutions run in a fixed order; each one sees the output of the
    previous one. Empty / None input is returned unchanged.

    Args:
        text: the text to clean.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    if not text:
        return text

    rules = (
        # 1. One space after , ; . : ! ? unless a closer, quote or space follows.
        (r'([,;.:!?])(?=[^\s)\]}』】”"’])', r'\1 '),
        # 2. No whitespace before those marks.
        (r'\s+([,;.:!?])', r'\1'),
        # 3. No whitespace just inside brackets.
        (r'([(\[{『【«])\s+', r'\1'),
        (r'\s+([)\]}』】»])', r'\1'),
        #    A space before an opening bracket and after a closing one when
        #    they touch other text.
        (r'(?<=[^\s(\[{『【«])([(\[{『【«])', r' \1'),
        (r'([)\]}』】»])(?=[^\s.,;:!?)\]}』】»])', r'\1 '),
        # 4. Hyphens used as separators get exactly one space on each side.
        (r'\s*-\s*', ' - '),
        # 5. Collapse runs of whitespace.
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def translate_sentence(self, sentence, multi_option=False, mode=None):
    """Translate one sentence, leaving non-Chinese stretches untouched.

    The sentence is split into alternating non-Chinese / Chinese blocks. A
    purely alphanumeric block sitting between two Chinese blocks is merged
    into them so the translator sees it in context. Chinese blocks go
    through ``self._translate_pure_chinese_sentence``; everything else is
    copied verbatim.

    Args:
        sentence: the source sentence.
        multi_option: forwarded to the per-block translator.
        mode: translation mode override, forwarded to the per-block translator.

    Returns:
        The translated sentence; "" for empty or whitespace-only input; the
        input itself when it contains no Chinese characters or CJK symbols.
    """
    if not sentence or sentence.isspace():
        return ""

    # Han characters plus CJK / full-width symbol ranges.
    has_chinese = re.compile(r'[\u3400-\u9fff\U00020000-\U0002a6df\u3000-\u303f\uff00-\uffef]')
    if not has_chinese.search(sentence):
        return sentence

    # The capturing group puts Chinese runs at odd indexes and everything
    # else at even indexes.
    chinese_pattern = re.compile(r'([\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]+)')
    parts = chinese_pattern.split(sentence)

    # Merge "Chinese + alnum + Chinese" triples into one Chinese block.
    # The bound requires parts[i + 2] to exist: the previous bound
    # (i < len(parts) - 1) raised IndexError for input such as "中文abc",
    # where the alphanumeric run is the last block.
    i = 1
    while i + 2 < len(parts):
        bridge = parts[i + 1]
        if re.match(r'^\s*[a-zA-Z0-9]+\s*$', bridge):
            parts[i:i + 3] = [parts[i] + bridge + parts[i + 2]]
        else:
            i += 2

    translated_parts = []
    capitalize_next = True
    for part in parts:
        if not part:
            continue
        if not has_chinese.search(part):
            # Non-Chinese segment: keep it exactly as written.
            translated_parts.append(part)
            if re.search(r'[.!?]\s*$', part):
                capitalize_next = True
            elif part.strip():
                # Whitespace-only segments leave the flag as it was.
                capitalize_next = False
        else:
            trans = self._translate_pure_chinese_sentence(part, multi_option, mode, capitalize_first=capitalize_next)
            translated_parts.append(trans)
            # The next block starts a new sentence if this one ended with a
            # terminator, in either the source or the translation.
            capitalize_next = bool(
                re.search(r'[.!?]\s*$', part) or re.search(r'[.!?]\s*$', trans)
            )
    return "".join(translated_parts)
def _translate_pure_chinese_sentence(self, sentence, multi_option=False, mode=None, capitalize_first=True):
    """Translate one block of Chinese text.

    Pipeline, in order:
      1. Tokenize and tag (``pseg.cut`` for the "advanced" modes, otherwise
         the plain jieba tokenizer plus a tag lookup).
      2. Group "number + keyword (+ noun)" runs into 'cultivation' tokens.
      3. Translate every token individually.
      4. Greedily merge up to 4 adjacent tokens when their concatenation is
         a dictionary entry, and translate the merged token.
      5. Unless the mode is 'hanviet', apply the reordering passes
         (adjective + noun, and "的" phrases).
      6. Join, clean punctuation spacing, and capitalize sentences.

    Args:
        sentence: the Chinese text to translate.
        multi_option: forwarded to ``format_translation``.
        mode: translation mode; defaults to ``self.translation_mode``.
        capitalize_first: when False, the first piece of the result is not
            capitalized (the capitalization loop starts at index 1).

    Returns:
        The translated text, or "" for empty / whitespace-only input or when
        tokenization yields nothing.

    NOTE(review): this file reached review with all leading indentation
    removed, so block nesting here is reconstructed from context. The spots
    where more than one nesting is plausible are marked below — confirm
    against the repository copy.
    NOTE(review): some set / dict literals below contain duplicate ASCII
    entries or identity mappings (e.g. ',' -> ','); presumably full-width
    forms were lost — confirm.
    """
    if not sentence or sentence.isspace():
        return ""
    active_mode = mode or self.translation_mode
    # Tokenization & Tagging depending on mode
    if active_mode in ("advanced", "advanced_hanviet"):
        raw_tokens = [SimpleToken(t.word, t.flag) for t in pseg.cut(sentence)]
    else:
        # "fast", "vietphrase", "hanviet" modes use the fast tokenizer
        words = list(self.jieba_tokenizer.cut(sentence))
        raw_tokens = []
        for w in words:
            if w in PUNCT_SET:
                tag = 'x'
            elif NUM_RE.match(w):
                tag = 'm'
            else:
                # Words unknown to the tag table default to noun ('n').
                tag = self.pseg_dict.get(w, 'n')
            raw_tokens.append(SimpleToken(w, tag))
    if not raw_tokens:
        return ""
    # A number directly followed by one of these words is grouped in Step 1.
    NUM_KEYWORDS = {"重", "阶", "品", "级", "层", "剑", "星", "转", "天", "色", "关", "重天"}
    # Per-character readings used when translating 'cultivation' tokens.
    HANVIET_NUMBERS = {
        '0': 'Không', '1': 'Nhất', '2': 'Nhị', '3': 'Tam', '4': 'Tứ', '5': 'Ngũ', '6': 'Lục', '7': 'Thất', '8': 'Bát', '9': 'Cửu', '10': 'Thập',
        '一': 'Nhất', '二': 'Nhị', '三': 'Tam', '四': 'Tứ', '五': 'Ngũ', '六': 'Lục', '七': 'Thất', '八': 'Bát', '九': 'Cửu', '十': 'Thập',
        '百': 'Bách', '千': 'Thiên', '万': 'Vạn', '萬': 'Vạn', '几': 'Vài', '数': 'Số', '多': 'Đa', '半': 'Bán', '两': 'Lưỡng', '兩': 'Lưỡng'
    }
    # Helper function to translate a single token
    def translate_single_token(idx, tok, list_of_tokens):
        """Set ``tok.translated`` for one token; ``idx`` is its position in ``list_of_tokens``."""
        word = tok.word
        tag = tok.tag
        # Punctuation
        is_punct = (tag == 'x' or word in {',', '.', '!', '?', ';', ':', '"', '(', ')', '[', ']', '{', '}'})
        if is_punct:
            # A token tagged as punctuation that still contains a known
            # character falls through to the normal lookup below.
            has_chinese = False
            for char in word:
                if char in self.char_dict:
                    has_chinese = True
                    break
            if not has_chinese:
                punct_map = {
                    ',': ',', '。': '.', '「': '"', '」': '"', '、': ',', '?': '?', '!': '!',
                    ':': ':', ';': ';', '“': '"', '”': '"', '(': '(', ')': ')'
                }
                tok.translated = punct_map.get(word, word)
                return
        # Rule for number + 人 (e.g. 几十人, 三人)
        if len(word) > 1 and word.endswith('人') and self.is_number(word[:-1]):
            num_part = word[:-1]
            if num_part in self.vietphrase:
                num_trans = self.format_translation(self.vietphrase[num_part], multi_option, num_part)
            else:
                num_trans = " ".join([self.char_dict.get(c, c).split("/")[0] for c in num_part])
            tok.translated = f"{num_trans} người"
            return
        # Special rule for 了 (le vs liao)
        # NOTE(review): the 'l' alternative looks like extraction residue — confirm.
        if word == 'l' or word == '了':
            # "rồi" when only quotes/brackets separate 了 from the next
            # punctuation mark or the end of the token list; otherwise "được".
            is_at_end = True
            for next_tok in list_of_tokens[idx+1:]:
                if next_tok.word in {'"', '\'', '(', ')', '[', ']', '{', '}', '“', '”', '‘', '’', '(', ')', '【', '】', '《', '》'}:
                    continue
                if next_tok.word in {',', '.', '!', '?', ';', ':', ',', '。', '!', '?', ';', ':', '、'}:
                    is_at_end = True
                    break
                is_at_end = False
                break
            if is_at_end:
                tok.translated = "rồi"
            else:
                tok.translated = "được"
            return
        # Cultivation Realm (cultivation)
        if tag == 'cultivation':
            # Character by character: number reading first, else the first
            # char_dict reading, capitalized.
            result = []
            for char in word:
                if char in HANVIET_NUMBERS:
                    result.append(HANVIET_NUMBERS[char])
                else:
                    cap_val = self.char_dict.get(char, char).split("/")[0].capitalize()
                    result.append(cap_val)
            tok.translated = " ".join(result)
            return
        # Determine if it's a noun or an adjective
        is_proper = (tag in {'nr', 'ns', 'nt'} if tag else False)
        is_noun = (tag.startswith('n') if tag else False) or tag in {'n', 'nz', 'ng'} if tag else False
        is_adj = tag in {'a', 'b', 'ad', 'an', 'z'} if tag else False
        is_noun_or_adj = is_proper or is_noun or is_adj
        # --- Final safeguard for proper names (Proper Names Guard) ---
        if is_proper:
            if word in self.proper_names:
                tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
            else:
                if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
                    # No Han characters: keep the word as-is.
                    tok.translated = word
                else:
                    # Character-by-character fallback using the first reading.
                    result = []
                    for char in word:
                        val = self.char_dict.get(char, char).split("/")[0]
                        result.append(val)
                    tok.translated = " ".join(result)
            if tok.translated:
                tok.translated = self.capitalize_phrase(tok.translated)
        # --- Translate lookup strategy depending on active_mode (for non-proper names) ---
        else:
            if active_mode == 'hanviet':
                # Mode 4: Pure Hán Việt (NO Vietphrase)
                if word in self.proper_names:
                    tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
                else:
                    if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
                        tok.translated = word
                    else:
                        result = []
                        for char in word:
                            val = self.char_dict.get(char, char).split("/")[0]
                            result.append(val)
                        tok.translated = " ".join(result)
            elif active_mode == 'vietphrase':
                # Mode 3: Prioritize Vietphrase (Traditional)
                if word in self.vietphrase:
                    tok.translated = self.format_translation(self.vietphrase[word], multi_option, word, prefer_hanviet=False)
                elif word in self.proper_names:
                    tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
                else:
                    if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
                        tok.translated = word
                    else:
                        result = []
                        for char in word:
                            val = self.char_dict.get(char, char).split("/")[0]
                            result.append(val)
                        tok.translated = " ".join(result)
            else:
                # Modes 1, 2 & 5: 'fast', 'advanced', or 'advanced_hanviet' (POS-based noun/adjective Hán Việt override)
                if is_noun_or_adj:
                    # Nouns/Adjectives: Bypasses vietphrase
                    if word in self.proper_names:
                        tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
                    else:
                        if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
                            tok.translated = word
                        else:
                            result = []
                            for char in word:
                                val = self.char_dict.get(char, char).split("/")[0]
                                result.append(val)
                            tok.translated = " ".join(result)
                else:
                    # Verbs and other parts of speech
                    if active_mode == 'advanced_hanviet':
                        # Prefer HanViet dictionary (proper_names) over Vietphrase
                        if word in self.proper_names:
                            tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
                        elif word in self.vietphrase:
                            tok.translated = self.format_translation(self.vietphrase[word], multi_option, word, prefer_hanviet=False)
                        else:
                            if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
                                tok.translated = word
                            else:
                                result = []
                                for char in word:
                                    val = self.char_dict.get(char, char).split("/")[0]
                                    result.append(val)
                                tok.translated = " ".join(result)
                    else:
                        # Standard fast/advanced: vietphrase -> proper_names -> character fallback
                        if word in self.vietphrase:
                            tok.translated = self.format_translation(self.vietphrase[word], multi_option, word, prefer_hanviet=False)
                        elif word in self.proper_names:
                            tok.translated = self.format_translation(self.proper_names[word], multi_option, word, prefer_hanviet=True)
                        else:
                            if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', word):
                                tok.translated = word
                            else:
                                result = []
                                for char in word:
                                    val = self.char_dict.get(char, char).split("/")[0]
                                    result.append(val)
                                tok.translated = " ".join(result)
        # Strip trailing "đích" / "Đích" from modifier translations
        if tok.translated and word.endswith('的') and len(word) > 1:
            val = tok.translated
            if val.lower().endswith(' đích'):
                tok.translated = val[:-5]
            elif val.lower().endswith('đích'):
                tok.translated = val[:-4]
    # Step 1: Group numeral phrases and cultivation terms FIRST
    grouped = []
    i = 0
    while i < len(raw_tokens):
        tok = raw_tokens[i]
        word = tok.word
        tag = tok.tag
        if self.is_number(word) and i + 1 < len(raw_tokens) and raw_tokens[i+1].word in NUM_KEYWORDS:
            grouped_word = word + raw_tokens[i+1].word
            i_next = i + 2
            # Optionally absorb one following noun-like token into the group.
            if i_next < len(raw_tokens) and raw_tokens[i_next].tag in {'n', 'nr', 'ns', 'nt', 'nz'}:
                grouped_word += raw_tokens[i_next].word
                i_next += 1
            grouped.append(SimpleToken(grouped_word, 'cultivation'))
            i = i_next
        else:
            grouped.append(SimpleToken(word, tag))
            i += 1
    # Step 2: Translate individual tokens on the cultivation-grouped tokens
    for idx, tok in enumerate(grouped):
        translate_single_token(idx, tok, grouped)
    # Step 3: Greedy merge adjacent tokens if their combination exists in dictionaries
    i = 0
    merged = []
    while i < len(grouped):
        matched = False
        # Try the longest window first (4 tokens), down to 2.
        for length in range(min(4, len(grouped) - i), 1, -1):
            combined_word = "".join([grouped[i+k].word for k in range(length)])
            # Prevent merging across '的' particle to preserve root Hán Việt translation and allow reordering
            should_skip = False
            # Precedence: A or (B and C).
            if 'đích' in combined_word or '的' in combined_word and combined_word.find('的') > 0:
                should_skip = True
            elif i + length < len(grouped) and grouped[i+length].word == '的':
                # If next token is 'de' (de/的), don't merge if it would swallow a pronoun/noun/verb
                last_tok = grouped[i+length-1]
                if last_tok.flag in {'r', 'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v'}:
                    should_skip = True
            # NOTE(review): 'he', 'she', 'it' look like mangled Chinese pronouns
            # (the comment below mentions '这是他'); as written they can only match
            # Latin text — confirm against the repository copy.
            elif '是' in combined_word and any(p in combined_word for p in {'我', '你', 'he', 'she', 'it', '们', '您', '自己'}):
                # Prevent merging copula + pronoun phrases (like '这是他', '那是我') to allow proper clause reordering
                should_skip = True
            # Dict check strategy depends on active_mode
            if active_mode == 'hanviet':
                in_dicts = (combined_word in self.proper_names)
            else:
                in_dicts = (combined_word in self.vietphrase or combined_word in self.proper_names)
            if not should_skip and in_dicts:
                # Tag the merged word with the first tag pseg assigns to it.
                combined_tag = None
                try:
                    cut_res = list(pseg.cut(combined_word))
                    if cut_res:
                        combined_tag = cut_res[0].flag
                except Exception:
                    pass
                # NOTE(review): nesting reconstructed — the proper-name scan is placed
                # inside this fallback; it may instead belong one level out, where it
                # would always override the pseg tag. Confirm.
                if not combined_tag:
                    combined_tag = grouped[i].flag
                    for k in range(length):
                        if grouped[i+k].flag in {'nr', 'ns', 'nt', 'nz'}:
                            combined_tag = grouped[i+k].flag
                            break
                new_tok = SimpleToken(combined_word, combined_tag)
                # Translate the new merged token immediately
                translate_single_token(0, new_tok, [new_tok])
                merged.append(new_tok)
                i += length
                matched = True
                break
        if not matched:
            merged.append(grouped[i])
            i += 1
    # Step 4: Reordering Grammar Rules
    if active_mode != 'hanviet':
        # Pass 1: Adjective + Noun reordering
        # Restarts from the beginning after every swap until a full scan
        # makes no change.
        changed = True
        while changed:
            changed = False
            i = 0
            while i < len(merged) - 1:
                t_a = merged[i]
                t_n = merged[i+1]
                # Do not swap with prepositions/conjunctions/copulas/particles
                if t_n.word in {'跟', '和', '与', '與', '同', '在', '从', '從', '自', '由', '向', '往', '朝', '对', '對', '给', '給', '比', '是', '叫', '让', '讓', '被', '把', '使', '令', '到', '了', '的', '而', '&', '并', '並', '以', '或', '者'}:
                    i += 1
                    continue
                if (t_a.tag in {'a', 'b'} or (t_a.word.endswith('的') and t_a.word != '的')) and t_n.tag in {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v', 'vd', 'vg', 'vi', 'vn'}:
                    # Head first, modifier second.
                    combined = t_n.translated + " " + t_a.translated
                    new_tok = SimpleToken(t_a.word + t_n.word, t_n.tag)
                    new_tok.translated = combined
                    merged[i:i+2] = [new_tok]
                    changed = True
                    break
                i += 1
        # Pass 2: "的" reordering (with multi-token noun/verb phrase lookahead)
        NOUN_PHRASE_TAGS = {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'a', 'b', 'm', 'q', 'j', 'i'}
        LOOKAHEAD_TAGS = NOUN_PHRASE_TAGS | {'v', 'vd', 'vg', 'vi', 'vn'}
        # Starts at 1 and stops before the last token, so '的' always has a
        # token on both sides.
        i = 1
        while i < len(merged) - 1:
            tok = merged[i]
            # NOTE(review): the 'de' alternative looks like extraction residue — confirm.
            if tok.word in {'de', '的'}:
                t_x = merged[i-1]
                # Scan forward to collect all consecutive noun or verb phrase tokens
                k = i + 1
                has_noun = False
                while k < len(merged):
                    tok_k = merged[k]
                    # Stop collecting if we hit a locality word/orientation noun
                    if tok_k.word in {'下', '上', '中', '里', '外', '内', '內', '后', '後', '前', '旁', '侧', '側', '底', '间', '間'}:
                        break
                    # If we already encountered a noun/verb in the phrase,
                    # we cannot have a subsequent adjective modifying that noun from the right.
                    if has_noun and tok_k.tag in {'a', 'b'}:
                        break
                    # Do not collect a verb tag if we already have a noun/verb head
                    is_verb_tag = tok_k.tag in {'v', 'vd', 'vg', 'vi', 'vn'}
                    if has_noun and is_verb_tag:
                        break
                    if tok_k.tag in LOOKAHEAD_TAGS or tok_k.word == '色':
                        if tok_k.tag in {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v', 'vd', 'vg', 'vi', 'vn'}:
                            has_noun = True
                        k += 1
                    else:
                        break
                is_verb_modifier = t_x.tag in {'v', 'vd', 'vg', 'vi', 'vn'}
                # If we collected at least one token AND the modifier is not a verb clause
                if k > i + 1 and not is_verb_modifier:
                    y_tokens = merged[i+1:k]
                    y_translated = " ".join([t.translated for t in y_tokens if t.translated])
                    y_word = "".join([t.word for t in y_tokens])
                    if t_x.tag != 'x':
                        # Extend the modifier leftwards over noun-phrase-like tokens.
                        start_idx = i - 1
                        j_back = i - 2
                        while j_back >= 0:
                            tag_back = merged[j_back].tag
                            if tag_back in {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'a', 'b', 'm', 'q', 'j', 'i', 's', 't'}:
                                start_idx = j_back
                                j_back -= 1
                            else:
                                break
                        modifier_tokens = merged[start_idx:i]
                        modifier_translated = " ".join([t.translated for t in modifier_tokens if t.translated])
                        modifier_word = "".join([t.word for t in modifier_tokens])
                        is_proper_or_pronoun = (
                            t_x.tag in {'nr', 'r'}
                        )
                        is_noun_modifier = is_proper_or_pronoun and not t_x.word.endswith('色')
                        # "của" is used only for a single-token person-name / pronoun modifier.
                        if is_noun_modifier and start_idx == i - 1:
                            combined = y_translated + " của " + modifier_translated
                        else:
                            combined = y_translated + " " + modifier_translated
                        new_tok = SimpleToken(modifier_word + tok.word + y_word, 'n')
                        new_tok.translated = combined
                        merged[start_idx:k] = [new_tok]
                        # i is left unchanged: the list shrank, so index i now
                        # holds a token that has not been examined yet.
                        continue
                # NOTE(review): nesting reconstructed — this else is paired with the
                # "collected at least one token" test above; it may instead belong to
                # the inner `t_x.tag != 'x'` test. Confirm.
                else:
                    # If we didn't reorder, set the '的' translation to empty string to avoid translating as 'đấy' / 'đích'
                    tok.translated = ""
            i += 1
    # Join words
    translated_text = " ".join([t.translated for t in merged if t.translated])
    # Clean spacing and punctuation
    translated_text = self.clean_punctuation_spacing(translated_text)
    # Capitalize sentences
    # The capturing group keeps the terminators as their own list items.
    sentences = re.split(r'([.!?]\s*)', translated_text)
    start_idx = 0 if capitalize_first else 1
    for idx in range(start_idx, len(sentences)):
        s = sentences[idx]
        if s and not s.isspace() and not s[0] in {'.', '!', '?'}:
            # Upper-case the first alphabetic character of the piece.
            for c_idx, char in enumerate(s):
                if char.isalpha():
                    sentences[idx] = s[:c_idx] + char.upper() + s[c_idx+1:]
                    break
    return "".join(sentences).strip()
def translate_paragraph(self, paragraph, multi_option=False, mode=None):
    """Translate one paragraph (a single line of text).

    * 'vietphrase' / 'hanviet' modes scan the paragraph with a longest-match
      trie lookup and fall back to a per-character reading.
    * Every other mode splits the paragraph on sentence terminators and
      sends each sentence through ``self.translate_sentence``.

    Args:
        paragraph: the source text.
        multi_option: forwarded to ``format_translation`` / ``translate_sentence``.
        mode: translation mode; defaults to ``self.translation_mode``.

    Returns:
        The translated paragraph; empty / whitespace-only input is returned
        unchanged.
    """
    if not paragraph or paragraph.isspace():
        return paragraph
    active_mode = mode or self.translation_mode

    if active_mode in ('vietphrase', 'hanviet'):
        # Trie-based path: no tokenizer involved.
        trie = self.vietphrase_trie if active_mode == 'vietphrase' else self.hanviet_trie
        prefer_hanviet = (active_mode == 'hanviet')
        # Loop invariants, built once instead of once per character.
        han_char = re.compile(r'[\u3400-\u9fff\U00020000-\U0002a6df]')
        punct_map = {
            ',': ',', '。': '.', '「': '"', '」': '"', '、': ',', '?': '?', '!': '!',
            ':': ':', ';': ';', '“': '"', '”': '"', '(': '(', ')': ')',
            '『': '"', '』': '"', '【': '[', '】': ']'
        }
        i = 0
        text_length = len(paragraph)
        result_words = []
        while i < text_length:
            length, translation, priority = trie.search_longest_match(paragraph, i)
            if length > 0:
                word = paragraph[i:i+length]
                formatted = self.format_translation(translation, multi_option, word, prefer_hanviet=prefer_hanviet)
                # Capitalize if it is a proper name
                if priority == 1 or word in self.proper_names:
                    formatted = self.capitalize_phrase(formatted)
                result_words.append(formatted)
                i += length
            else:
                char = paragraph[i]
                if not han_char.search(char):
                    result_words.append(punct_map.get(char, char))
                else:
                    # Unknown Han character: first reading, or the character itself.
                    result_words.append(self.char_dict.get(char, char).split("/")[0])
                i += 1
        translated_text = self.clean_punctuation_spacing(" ".join(result_words))
        # Sentence capitalization: upper-case the first letter of each piece
        # that is not itself a terminator.
        sentences = re.split(r'([.!?]\s*)', translated_text)
        for idx, s in enumerate(sentences):
            if s and not s.isspace() and s[0] not in {'.', '!', '?'}:
                for c_idx, char in enumerate(s):
                    if char.isalpha():
                        sentences[idx] = s[:c_idx] + char.upper() + s[c_idx+1:]
                        break
        return "".join(sentences).strip()

    # For advanced & fast modes, use normal sentence splitting & tokenization
    sentence_ends = re.compile(r'([。!?!?]+)')
    punct_map = {
        '。': '.', '!': '!', '?': '?', ',': ','
    }
    translated_parts = []
    for part in sentence_ends.split(paragraph):
        if not part:
            continue
        if sentence_ends.match(part):
            # The pattern matches runs, so ``part`` can be several characters
            # long. Map each character: looking the whole run up as one key
            # left runs such as "。。" untranslated.
            translated_parts.append("".join(punct_map.get(ch, ch) for ch in part))
        else:
            translated_parts.append(self.translate_sentence(part, multi_option, mode=mode))
    return self.clean_punctuation_spacing("".join(translated_parts))
def translate_text_node(self, text, multi_option=False, mode=None):
    """
    Translate one text node taken from the DOM.

    The structure is preserved completely: ``\\n`` line breaks and the
    leading/trailing whitespace of every line are kept. Each line is
    translated independently through ``translate_paragraph``; blank lines
    and lines without Han characters are returned untouched.
    """
    if not text:
        return text

    def render(line):
        body = line.strip()
        # Blank line, or nothing to translate: keep the line verbatim.
        if not body or not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', body):
            return line
        indent = re.match(r'^\s*', line).group(0)
        tail = re.search(r'\s*$', line).group(0)
        return indent + self.translate_paragraph(body, multi_option, mode=mode) + tail

    # Split on \n, translate line by line, then stitch the lines back.
    return '\n'.join(render(line) for line in text.split('\n'))
def translate(self, text, multi_option=False, mode=None):
    """Public entry point: delegate to ``translate_text_node`` with the same arguments."""
    translated = self.translate_text_node(text, multi_option=multi_option, mode=mode)
    return translated