"""Rule-based Chinese -> Vietnamese translation engine (Vietphrase / Han-Viet).

The engine segments Chinese text with jieba, looks tokens up in three
dictionaries (per-character Han-Viet readings, aligned Han-Viet names and
Vietphrase), applies a few reordering rules and finally normalises spacing
and capitalisation.
"""

import csv
import os
import sys
import re
from typing import Optional

import jieba
import jieba.posseg as pseg

from backend.config import Config

NUM_RE = re.compile(r'^[0-9一二三四五六七八九十百千万几数多半两]+$')

# Fullwidth punctuation (U+FF00 block) and its ASCII replacement.  Written as
# escapes on purpose: these characters are easily folded to ASCII by editors
# and normalisers, which silently turns the entries into useless duplicates.
_FULLWIDTH_PUNCT = {
    '\uff0c': ',',  # fullwidth comma
    '\uff01': '!',  # fullwidth exclamation mark
    '\uff1f': '?',  # fullwidth question mark
    '\uff1a': ':',  # fullwidth colon
    '\uff1b': ';',  # fullwidth semicolon
    '\uff08': '(',  # fullwidth left parenthesis
    '\uff09': ')',  # fullwidth right parenthesis
}

PUNCT_SET = {
    ',', '.', '!', '?', ';', ':', '"', '\'', '(', ')', '[', ']', '{', '}',
    '。', '“', '”', '‘', '’', '【', '】', '《', '》', '、', '—', '~',
    '\uff5e',  # fullwidth tilde
    *_FULLWIDTH_PUNCT,
}

# Punctuation normalisation used by the token-based path.
_TOKEN_PUNCT_MAP = {
    ',': ',', '。': '.', '「': '"', '」': '"', '、': ',', '?': '?', '!': '!',
    ':': ':', ';': ';', '“': '"', '”': '"', '(': '(', ')': ')',
    **_FULLWIDTH_PUNCT,
}
# The trie-based path additionally rewrites corner and lenticular brackets.
_TRIE_PUNCT_MAP = {**_TOKEN_PUNCT_MAP, '『': '"', '』': '"', '【': '[', '】': ']'}
# Per-character mapping for runs of sentence terminators.
_TERMINATOR_TABLE = str.maketrans({'。': '.', '\uff01': '!', '\uff1f': '?', '\uff0c': ','})

_ASCII_PUNCT = {',', '.', '!', '?', ';', ':', '"', '(', ')', '[', ']', '{', '}'}

# Tokens that may sit between a sentence-final particle and the clause end.
_LE_TRANSPARENT = {
    '"', '\'', '(', ')', '[', ']', '{', '}', '“', '”', '‘', '’',
    '【', '】', '《', '》', '\uff08', '\uff09',
}
_LE_CLAUSE_ENDERS = {
    ',', '.', '!', '?', ';', ':', '。', '、',
    '\uff0c', '\uff01', '\uff1f', '\uff1b', '\uff1a',
}

_HAN_RE = re.compile(r'[\u3400-\u9fff\U00020000-\U0002a6df]')
_HAN_OR_CJK_SYMBOL_RE = re.compile(
    r'[\u3400-\u9fff\U00020000-\U0002a6df\u3000-\u303f\uff00-\uffef]')
_CHINESE_BLOCK_RE = re.compile(r'([\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]+)')
_SIMPLE_ALNUM_RE = re.compile(r'^\s*[a-zA-Z0-9]+\s*$')
_ENDS_WITH_TERMINATOR_RE = re.compile(r'[.!?]\s*$')
_SENTENCE_SPLIT_RE = re.compile(r'([.!?]\s*)')
_SENTENCE_ENDS_RE = re.compile('([。!?\uff01\uff1f]+)')

_VN_LOWER = 'àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ'
# Uppercase accented letters are included so that an already-capitalised word
# such as "Ánh" is matched as ONE word instead of being split after "Á".
_VN_WORD_RE = re.compile(f'[a-zA-Z{_VN_LOWER}{_VN_LOWER.upper()}]+')

_VERB_TAGS = {'v', 'vd', 'vg', 'vi', 'vn'}
_PROPER_TAGS = {'nr', 'ns', 'nt'}
_ADJ_TAGS = {'a', 'b', 'ad', 'an', 'z'}
_HEAD_TAGS = {'n', 'nr', 'ns', 'nt', 'nz', 'ng'} | _VERB_TAGS
_NOUN_PHRASE_TAGS = {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'a', 'b', 'm', 'q', 'j', 'i'}
_LOOKAHEAD_TAGS = _NOUN_PHRASE_TAGS | _VERB_TAGS
_MODIFIER_BACKSCAN_TAGS = _NOUN_PHRASE_TAGS | {'s', 't'}

_NUM_KEYWORDS = {"重", "阶", "品", "级", "层", "剑", "星", "转", "天", "色", "关", "重天"}
_HANVIET_NUMBERS = {
    '0': 'Không', '1': 'Nhất', '2': 'Nhị', '3': 'Tam', '4': 'Tứ', '5': 'Ngũ',
    '6': 'Lục', '7': 'Thất', '8': 'Bát', '9': 'Cửu', '10': 'Thập',
    '一': 'Nhất', '二': 'Nhị', '三': 'Tam', '四': 'Tứ', '五': 'Ngũ', '六': 'Lục',
    '七': 'Thất', '八': 'Bát', '九': 'Cửu', '十': 'Thập', '百': 'Bách',
    '千': 'Thiên', '万': 'Vạn', '萬': 'Vạn', '几': 'Vài', '数': 'Số', '多': 'Đa',
    '半': 'Bán', '两': 'Lưỡng', '兩': 'Lưỡng',
}

# Words that must never be swapped in front of a preceding adjective.
_NO_SWAP_WORDS = {
    '跟', '和', '与', '與', '同', '在', '从', '從', '自', '由', '向', '往', '朝',
    '对', '對', '给', '給', '比', '是', '叫', '让', '讓', '被', '把', '使', '令',
    '到', '了', '的', '而', '&', '并', '並', '以', '或', '者',
}
_LOCALITY_WORDS = {
    '下', '上', '中', '里', '外', '内', '內', '后', '後', '前', '旁', '侧', '側',
    '底', '间', '間',
}
# Fragments that mark a "copula + pronoun" phrase.  The third-person pronouns
# are listed in Chinese as well; the Latin spellings are kept for
# backward compatibility.
_COPULA_PRONOUN_PARTS = {
    '我', '你', '他', '她', '它', 'he', 'she', 'it', '们', '您', '自己',
}

# Dictionary lookup orders used by ``_lookup_word``.
_PROPER_ONLY = ("proper",)
_PROPER_FIRST = ("proper", "vietphrase")
_VIETPHRASE_FIRST = ("vietphrase", "proper")


class SimpleToken:
    """A segmented word with its POS tag and (once computed) its translation.

    ``flag`` is an alias of ``tag`` so the object can stand in for jieba's
    ``pair`` objects.
    """

    def __init__(self, word, tag):
        self.word = word
        self.tag = tag
        self.translated = None

    def __repr__(self):
        return f"SimpleToken({self.word!r}, {self.tag!r}, translated={self.translated!r})"

    @property
    def flag(self):
        return self.tag

    @flag.setter
    def flag(self, value):
        self.tag = value


class VietphraseEngine:
    """Dictionary-driven Chinese -> Vietnamese translator.

    Supported modes: ``advanced``, ``advanced_hanviet``, ``fast``,
    ``vietphrase`` and ``hanviet``.  The default mode is read from
    ``config["translation"]``.
    """

    def __init__(self, config=None):
        """Load dictionaries, pick the default mode and warm up jieba.

        Args:
            config: Optional mapping; the keys read are ``translation.mode``,
                ``translation.fast_mode`` and ``paths.dictionaries.vietphrase``.
        """
        self.config = config or {}
        self.load_dictionaries()

        # Check translation mode
        self.translation_mode = self.config.get("translation", {}).get("mode", "advanced")
        if self.config.get("translation", {}).get("fast_mode", False):
            self.translation_mode = "fast"

        # Warm-up jieba
        try:
            # Fix segmenter splitting overlapping words like 重生于
            jieba.add_word("重生", tag="v")
            jieba.suggest_freq(("生", "于"), True)
            jieba.suggest_freq(("着", "重"), True)
            jieba.suggest_freq(("醉", "人"), True)
            # Tag grades as nouns instead of proper names (nr)
            jieba.add_word("高一", tag="n")
            jieba.add_word("高二", tag="n")
            jieba.add_word("高三", tag="n")
        except Exception as e:
            print("Error initializing custom word splits in Jieba:", e)

        # Always initialize both tokenizers to support dynamic mode switching
        self.jieba_tokenizer = jieba.dt
        self.pseg_dict = pseg.dt.word_tag_tab
        list(self.jieba_tokenizer.cut("暖洋洋"))
        list(pseg.cut("暖洋洋"))

    @staticmethod
    def _read_dictionary_file(dict_dir, base_name):
        """Return the text of ``base_name`` from ``dict_dir``.

        An XOR-obfuscated ``.bin`` file takes precedence over the plain
        ``.txt`` file; an empty string is returned when neither exists.
        """
        bin_file = os.path.join(dict_dir, base_name + ".bin")
        txt_file = os.path.join(dict_dir, base_name + ".txt")
        if os.path.exists(bin_file):
            # Decrypt XOR
            with open(bin_file, "rb") as f:
                data = f.read()
            key_bytes = "quick_translator_secret_key_2026".encode("utf-8")
            key_len = len(key_bytes)
            repeated_key = (key_bytes * (len(data) // key_len + 1))[:len(data)]
            decrypted = bytes(a ^ b for a, b in zip(data, repeated_key))
            return decrypted.decode("utf-8")
        if os.path.exists(txt_file):
            with open(txt_file, "r", encoding="utf-8") as f:
                return f.read()
        return ""

    def load_dictionaries(self):
        """Populate ``char_dict``, ``proper_names``, ``vietphrase`` and the tries.

        The dictionary directory is derived from
        ``config["paths"]["dictionaries"]["vietphrase"]`` (relative paths are
        resolved against ``Config.ROOT_DIR``).  Every proper name is also
        registered with jieba.
        """
        paths = self.config.get("paths", {}).get("dictionaries", {})
        vp_path = paths.get("vietphrase", "")
        if not vp_path or not os.path.isabs(vp_path):
            vp_path = os.path.join(Config.ROOT_DIR, vp_path or "dictionaries/Vietphrase.txt")
        dict_dir = os.path.dirname(vp_path)

        print("Loading dictionaries in VietphraseEngine...")
        self.char_dict = self.parse_dict_content(
            self._read_dictionary_file(dict_dir, "HanViet_CharDict"))

        # --- Han-Nom fallback: fill in characters missing from the char dict ---
        han_csv_path = os.path.join(dict_dir, "han_all_readings.csv")
        if os.path.exists(han_csv_path):
            try:
                with open(han_csv_path, 'r', encoding='utf-8', newline='') as f:
                    for row in csv.DictReader(f):
                        # A short row yields None for the missing column;
                        # treat it as empty instead of aborting the load.
                        char = (row.get("Ký_tự") or "").strip()
                        hv = (row.get("Hán_Việt") or "").strip()
                        if char and hv and char not in self.char_dict:
                            self.char_dict[char] = hv.replace("~", "")
                print("Loaded han_all_readings.csv as fallback for missing Chinese characters.")
            except Exception as e:
                # Best effort: the fallback table is optional.
                print("Could not load han_all_readings.csv:", e)

        self.proper_names = self.parse_dict_content(
            self._read_dictionary_file(dict_dir, "Aligned_HanViet"),
            convert_to_simplified=True)
        vp_content = self._read_dictionary_file(dict_dir, "Vietphrase")
        self.vietphrase = self.parse_vietphrase_content(vp_content)
        print("Dictionaries loaded successfully.")

        # Build Tries for vietphrase and hanviet modes
        from .trie import Trie
        print("Building Tries for fast translation modes...")
        self.vietphrase_trie = Trie()
        # Insert proper names (priority 1)
        for k, v in self.proper_names.items():
            self.vietphrase_trie.insert(k, v, 1)
        # Insert Vietphrase (priority 2 - higher)
        for k, v in self.vietphrase.items():
            self.vietphrase_trie.insert(k, v, 2)

        self.hanviet_trie = Trie()
        # Insert proper names (priority 2)
        for k, v in self.proper_names.items():
            self.hanviet_trie.insert(k, v, 2)
        print("Tries built successfully.")

        # Register proper names in Jieba dictionary for fast modes
        for name in self.proper_names:
            jieba.add_word(name)

    def parse_dict_content(self, content, convert_to_simplified=False):
        """Parse ``key=value`` lines into a dict.

        Blank lines, lines without ``=`` and lines starting with ``#`` are
        skipped.  Values go through ``clean_annotation``.  When
        ``convert_to_simplified`` is set and ``hanziconv`` is importable, keys
        are converted to Simplified Chinese.
        """
        dictionary = {}
        if not content:
            return dictionary

        to_simplified = lambda s: s
        if convert_to_simplified:
            try:
                from hanziconv import HanziConv
                to_simplified = HanziConv.toSimplified
            except ImportError:
                # Optional dependency: keys are kept as written.
                pass

        for line in content.splitlines():
            line = line.strip()
            if not line or "=" not in line or line.startswith('#'):
                continue
            key, _, raw_val = line.partition("=")
            dictionary[to_simplified(key.strip())] = self.clean_annotation(raw_val.strip())
        return dictionary

    def parse_vietphrase_content(self, content):
        """Parse Vietphrase ``left=right`` lines into a dict.

        A line whose two sides are comma-separated lists of equal length is
        expanded pairwise; any other line is stored as a single entry.
        """
        dictionary = {}
        if not content:
            return dictionary

        for line in content.splitlines():
            line = line.strip()
            if not line or "=" not in line or line.startswith('#'):
                continue
            raw_left, _, raw_right = line.partition("=")
            left = raw_left.strip()
            right = self.clean_annotation(raw_right.strip())
            if "," in left and "," in right:
                keys = [k.strip() for k in left.split(",") if k.strip()]
                vals = [v.strip() for v in right.split(",") if v.strip()]
                # NOTE(review): lists of unequal length fall through to the
                # single-entry case below - confirm against the original layout.
                if len(keys) == len(vals):
                    dictionary.update(zip(keys, vals))
                    continue
            if left:
                dictionary[left] = right
        return dictionary

    def is_number(self, word):
        """Return True when ``word`` consists only of digits / Chinese numerals."""
        return bool(NUM_RE.match(word))

    def capitalize_phrase(self, phrase):
        """Capitalise every Latin/Vietnamese word in ``phrase``.

        Each word is passed through ``str.capitalize`` (first letter upper,
        the rest lower); other characters are left untouched.
        """
        return _VN_WORD_RE.sub(lambda m: m.group(0).capitalize(), phrase)

    def clean_annotation(self, text, mode='vietphrase'):
        """Strip dictionary annotations from a raw value.

        ``{meaning:reading}`` is replaced by ``meaning`` (or by ``reading``
        when ``mode`` is not ``'vietphrase'``); ``(*...)`` notes are removed.
        """
        if not text:
            return ""

        # 1. Parse curly braces {meaning:reading}
        def repl_curly(match):
            content = match.group(1)
            if ':' in content:
                meaning, reading = content.split(':', 1)
                return meaning.strip() if mode == 'vietphrase' else reading.strip()
            return content.strip()

        text = re.sub(r'\{([^{}]+)\}', repl_curly, text)
        # 2. Strip (*...) annotations
        text = re.sub(r'\s*\(\*[^)]*\)', '', text)
        return text.strip()

    def format_translation(self, raw_value, multi_option, word=None, prefer_hanviet=False):
        """Pick (or list) the alternatives of a ``a/b/c`` dictionary value.

        Args:
            raw_value: Slash-separated alternatives.
            multi_option: When true and several alternatives exist, return
                ``first[second/third]``.
            word: The source word; needed for the Han-Viet preference.
            prefer_hanviet: When true (and ``word`` has at least two
                characters), return the alternative whose syllables match the
                most per-character readings from ``char_dict``.

        Returns:
            The chosen text, or ``""`` when there is no usable alternative.
        """
        if not raw_value:
            return ""

        # Deduplicate options while preserving order
        deduped = list(dict.fromkeys(o for o in raw_value.split("/") if o.strip()))
        if not deduped:
            return ""

        if multi_option and len(deduped) > 1:
            return f"{deduped[0]}[{'/'.join(deduped[1:])}]"

        if prefer_hanviet and word and len(word) >= 2 and len(deduped) > 1:
            hv_sets = []
            for char in word:
                readings = {
                    r.strip().lower()
                    for r in self.char_dict.get(char, "").split('/')
                    if r.strip()
                }
                if readings:
                    hv_sets.append(readings)

            best_option = deduped[0]
            best_score = -1
            for opt in deduped:
                opt_syllables = [w.strip().lower() for w in opt.split() if w.strip()]
                score = sum(
                    1 for r_set in hv_sets if any(r in opt_syllables for r in r_set))
                # Strict comparison keeps the earliest option on ties.
                if score > best_score:
                    best_score = score
                    best_option = opt
            if best_score > 0:
                return best_option

        return deduped[0]

    def clean_punctuation_spacing(self, text):
        """Normalise whitespace around punctuation, brackets and dashes."""
        if not text:
            return text

        # 1. Ensure exactly one space after commas, semicolons, colons, periods,
        #    question marks, and exclamation marks, unless the next character is a
        #    closing bracket, closing quote, space, or another punctuation.
        text = re.sub(r'([,;.:!?])(?=[^\s)\]}』】”"’])', r'\1 ', text)
        # 2. Remove any accidental whitespace before these punctuation marks
        text = re.sub(r'\s+([,;.:!?])', r'\1', text)
        # 3. Clean spaces inside parentheses, brackets, and curly/double brackets
        text = re.sub(r'([(\[{『【«])\s+', r'\1', text)
        text = re.sub(r'\s+([)\]}』】»])', r'\1', text)
        #    Ensure a space before opening and after closing brackets that border words
        text = re.sub(r'(?<=[^\s(\[{『【«])([(\[{『【«])', r' \1', text)
        text = re.sub(r'([)\]}』】»])(?=[^\s.,;:!?)\]}』】»])', r'\1 ', text)
        # 4. Standardize dashes/hyphens used as separators to one space on each side
        text = re.sub(r'\s*-\s*', ' - ', text)
        # 5. Clean up any duplicated/trailing whitespaces
        return re.sub(r'\s+', ' ', text).strip()

    def translate_sentence(self, sentence: str, multi_option: bool = False,
                           mode: Optional[str] = None) -> str:
        """Translate one sentence, leaving non-Chinese spans untouched.

        The sentence is split into Chinese and non-Chinese blocks.  A purely
        alphanumeric block that follows a Chinese block is merged into it so
        the segmenter sees it in context.  Returns ``""`` for empty or
        whitespace-only input and the input itself when it has no Chinese.
        """
        if not sentence or sentence.isspace():
            return ""
        if not _HAN_OR_CJK_SYMBOL_RE.search(sentence):
            return sentence

        # re.split with one capture group: even indexes are non-Chinese text,
        # odd indexes are Chinese blocks.
        parts = _CHINESE_BLOCK_RE.split(sentence)

        i = 1
        while i < len(parts) - 1:
            between = parts[i + 1]
            if _SIMPLE_ALNUM_RE.match(between):
                if i + 2 < len(parts):
                    parts[i:i + 3] = [parts[i] + between + parts[i + 2]]
                else:
                    # Trailing alphanumeric run: there is no Chinese block
                    # after it, so only the two pieces are joined.
                    parts[i:i + 2] = [parts[i] + between]
            else:
                i += 2

        translated_parts = []
        capitalize_next = True
        for part in parts:
            if not part:
                continue
            if not _HAN_OR_CJK_SYMBOL_RE.search(part):
                # Non-Chinese segment -> preserve exactly
                translated_parts.append(part)
                if _ENDS_WITH_TERMINATOR_RE.search(part):
                    capitalize_next = True
                elif part.strip():
                    capitalize_next = False
                continue

            trans = self._translate_pure_chinese_sentence(
                part, multi_option, mode, capitalize_first=capitalize_next)
            translated_parts.append(trans)
            capitalize_next = bool(
                _ENDS_WITH_TERMINATOR_RE.search(part)
                or _ENDS_WITH_TERMINATOR_RE.search(trans))

        return "".join(translated_parts)

    # ------------------------------------------------------------------
    # Helpers of _translate_pure_chinese_sentence
    # ------------------------------------------------------------------

    def _tokenize(self, sentence, active_mode):
        """Segment ``sentence`` into ``SimpleToken`` objects for ``active_mode``."""
        if active_mode in ("advanced", "advanced_hanviet"):
            return [SimpleToken(t.word, t.flag) for t in pseg.cut(sentence)]

        # "fast", "vietphrase", "hanviet" modes use the fast tokenizer and a
        # plain dictionary lookup for the tag.
        tokens = []
        for w in self.jieba_tokenizer.cut(sentence):
            if w in PUNCT_SET:
                tag = 'x'
            elif NUM_RE.match(w):
                tag = 'm'
            else:
                tag = self.pseg_dict.get(w, 'n')
            tokens.append(SimpleToken(w, tag))
        return tokens

    def _char_by_char(self, word):
        """Return the first Han-Viet reading of each character, space-joined.

        A word without any Han character is returned unchanged.
        """
        if not _HAN_RE.search(word):
            return word
        return " ".join(self.char_dict.get(ch, ch).split("/")[0] for ch in word)

    def _lookup_word(self, word, multi_option, order):
        """Translate ``word`` from the dictionaries named in ``order``.

        ``order`` is a sequence of ``"proper"`` / ``"vietphrase"``; the first
        dictionary containing the word wins, otherwise the per-character
        reading is used.
        """
        for source in order:
            if source == "vietphrase":
                if word in self.vietphrase:
                    return self.format_translation(
                        self.vietphrase[word], multi_option, word, prefer_hanviet=False)
            elif word in self.proper_names:
                return self.format_translation(
                    self.proper_names[word], multi_option, word, prefer_hanviet=True)
        return self._char_by_char(word)

    def _translate_token(self, idx, tok, tokens, multi_option, active_mode):
        """Fill ``tok.translated`` for the token at ``tokens[idx]``."""
        word = tok.word
        tag = tok.tag

        # Punctuation (unless the "punctuation" has a reading in char_dict)
        if tag == 'x' or word in _ASCII_PUNCT:
            if not any(ch in self.char_dict for ch in word):
                tok.translated = _TOKEN_PUNCT_MAP.get(word, word)
                return

        # Rule for number + 人 (e.g. 几十人, 三人)
        if len(word) > 1 and word.endswith('人') and self.is_number(word[:-1]):
            num_part = word[:-1]
            if num_part in self.vietphrase:
                num_trans = self.format_translation(
                    self.vietphrase[num_part], multi_option, num_part)
            else:
                num_trans = " ".join(
                    self.char_dict.get(c, c).split("/")[0] for c in num_part)
            tok.translated = f"{num_trans} người"
            return

        # Special rule for 了: "rồi" at the end of a clause, "được" otherwise
        if word in ('l', '了'):
            is_at_end = True
            for following in tokens[idx + 1:]:
                if following.word in _LE_TRANSPARENT:
                    continue
                is_at_end = following.word in _LE_CLAUSE_ENDERS
                break
            tok.translated = "rồi" if is_at_end else "được"
            return

        # Cultivation realm terms: capitalised Han-Viet reading per character
        if tag == 'cultivation':
            tok.translated = " ".join(
                _HANVIET_NUMBERS[ch] if ch in _HANVIET_NUMBERS
                else self.char_dict.get(ch, ch).split("/")[0].capitalize()
                for ch in word)
            return

        is_proper = tag in _PROPER_TAGS
        is_noun = bool(tag) and tag.startswith('n')
        is_adj = tag in _ADJ_TAGS

        if is_proper:
            # Final guard for proper names: never routed through Vietphrase.
            tok.translated = self._lookup_word(word, multi_option, _PROPER_ONLY)
            if tok.translated:
                tok.translated = self.capitalize_phrase(tok.translated)
        elif active_mode == 'hanviet':
            # Pure Han-Viet (no Vietphrase)
            tok.translated = self._lookup_word(word, multi_option, _PROPER_ONLY)
        elif active_mode == 'vietphrase':
            tok.translated = self._lookup_word(word, multi_option, _VIETPHRASE_FIRST)
        elif is_noun or is_adj:
            # 'fast' / 'advanced' / 'advanced_hanviet': nouns and adjectives
            # bypass Vietphrase.
            tok.translated = self._lookup_word(word, multi_option, _PROPER_ONLY)
        elif active_mode == 'advanced_hanviet':
            tok.translated = self._lookup_word(word, multi_option, _PROPER_FIRST)
        else:
            tok.translated = self._lookup_word(word, multi_option, _VIETPHRASE_FIRST)

        # Strip trailing "đích" / "Đích" from modifier translations
        if tok.translated and word.endswith('的') and len(word) > 1:
            val = tok.translated
            if val.lower().endswith(' đích'):
                tok.translated = val[:-5]
            elif val.lower().endswith('đích'):
                tok.translated = val[:-4]

    def _group_cultivation_terms(self, raw_tokens):
        """Fuse ``number + keyword (+ noun)`` into one ``'cultivation'`` token."""
        grouped = []
        total = len(raw_tokens)
        i = 0
        while i < total:
            tok = raw_tokens[i]
            if (self.is_number(tok.word) and i + 1 < total
                    and raw_tokens[i + 1].word in _NUM_KEYWORDS):
                grouped_word = tok.word + raw_tokens[i + 1].word
                i += 2
                if i < total and raw_tokens[i].tag in {'n', 'nr', 'ns', 'nt', 'nz'}:
                    grouped_word += raw_tokens[i].word
                    i += 1
                grouped.append(SimpleToken(grouped_word, 'cultivation'))
            else:
                grouped.append(SimpleToken(tok.word, tok.tag))
                i += 1
        return grouped

    @staticmethod
    def _merge_blocked(combined_word, last_tok, follower):
        """Return True when tokens must not be merged into ``combined_word``."""
        # Never merge across a '的' particle (it must stay available for reordering).
        if 'đích' in combined_word or combined_word.find('的') > 0:
            return True
        if follower is not None and follower.word == '的':
            # Do not swallow the pronoun/noun/verb that the following '的' attaches to.
            return last_tok.flag in {'r', 'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v'}
        # Copula + pronoun phrases (e.g. 这是他, 那是我) are kept apart for reordering.
        return '是' in combined_word and any(
            p in combined_word for p in _COPULA_PRONOUN_PARTS)

    def _merge_dictionary_phrases(self, grouped, multi_option, active_mode):
        """Greedily merge up to four adjacent tokens that form a dictionary entry."""
        merged = []
        total = len(grouped)
        i = 0
        while i < total:
            for length in range(min(4, total - i), 1, -1):
                window = grouped[i:i + length]
                combined_word = "".join(t.word for t in window)

                if active_mode == 'hanviet':
                    in_dicts = combined_word in self.proper_names
                else:
                    in_dicts = (combined_word in self.vietphrase
                                or combined_word in self.proper_names)
                if not in_dicts:
                    continue
                follower = grouped[i + length] if i + length < total else None
                if self._merge_blocked(combined_word, window[-1], follower):
                    continue

                combined_tag = None
                try:
                    cut_res = list(pseg.cut(combined_word))
                    if cut_res:
                        combined_tag = cut_res[0].flag
                except Exception:
                    # Best effort: fall back to the first token's tag below.
                    combined_tag = None
                if not combined_tag:
                    combined_tag = window[0].flag
                # NOTE(review): a proper-noun constituent overrides the tag even
                # when pseg produced one - confirm against the original nesting.
                for part in window:
                    if part.flag in {'nr', 'ns', 'nt', 'nz'}:
                        combined_tag = part.flag
                        break

                new_tok = SimpleToken(combined_word, combined_tag)
                self._translate_token(0, new_tok, [new_tok], multi_option, active_mode)
                merged.append(new_tok)
                i += length
                break
            else:
                merged.append(grouped[i])
                i += 1
        return merged

    @staticmethod
    def _reorder_adjective_noun(merged):
        """Swap ``adjective + noun/verb`` pairs in place (Vietnamese word order)."""
        changed = True
        while changed:
            changed = False
            i = 0
            while i < len(merged) - 1:
                t_a = merged[i]
                t_n = merged[i + 1]
                # Do not swap with prepositions/conjunctions/copulas/particles
                if t_n.word in _NO_SWAP_WORDS:
                    i += 1
                    continue
                is_modifier = (t_a.tag in {'a', 'b'}
                               or (t_a.word.endswith('的') and t_a.word != '的'))
                if is_modifier and t_n.tag in _HEAD_TAGS:
                    new_tok = SimpleToken(t_a.word + t_n.word, t_n.tag)
                    new_tok.translated = t_n.translated + " " + t_a.translated
                    merged[i:i + 2] = [new_tok]
                    changed = True
                    break
                i += 1

    @staticmethod
    def _scan_head_phrase(merged, start):
        """Return the end index (exclusive) of the phrase starting at ``start``."""
        k = start
        has_head = False
        while k < len(merged):
            cand = merged[k]
            # Stop at a locality word / orientation noun.
            if cand.word in _LOCALITY_WORDS:
                break
            # Once a noun/verb head was seen, neither an adjective nor another
            # verb may follow inside the same phrase.
            if has_head and (cand.tag in {'a', 'b'} or cand.tag in _VERB_TAGS):
                break
            if cand.tag not in _LOOKAHEAD_TAGS and cand.word != '色':
                break
            if cand.tag in _HEAD_TAGS:
                has_head = True
            k += 1
        return k

    def _reorder_de_phrases(self, merged):
        """Rewrite ``modifier 的 head`` as ``head (của) modifier`` in place."""
        i = 1
        while i < len(merged) - 1:
            tok = merged[i]
            if tok.word not in ('de', '的'):
                i += 1
                continue

            t_x = merged[i - 1]
            k = self._scan_head_phrase(merged, i + 1)

            if k == i + 1 or t_x.tag in _VERB_TAGS:
                # No reordering: drop the particle so it is not rendered.
                # NOTE(review): reconstructed nesting - the particle is kept
                # when the modifier is punctuation (tag 'x'); confirm.
                tok.translated = ""
                i += 1
                continue
            if t_x.tag == 'x':
                i += 1
                continue

            head = merged[i + 1:k]
            head_translated = " ".join(t.translated for t in head if t.translated)
            head_word = "".join(t.word for t in head)

            # Extend the modifier leftwards over noun-phrase-like tokens.
            start_idx = i - 1
            j_back = i - 2
            while j_back >= 0 and merged[j_back].tag in _MODIFIER_BACKSCAN_TAGS:
                start_idx = j_back
                j_back -= 1
            modifier = merged[start_idx:i]
            modifier_translated = " ".join(t.translated for t in modifier if t.translated)
            modifier_word = "".join(t.word for t in modifier)

            # A single proper name / pronoun modifier is possessive ("của").
            is_possessive = (t_x.tag in {'nr', 'r'}
                             and not t_x.word.endswith('色')
                             and start_idx == i - 1)
            joiner = " của " if is_possessive else " "

            new_tok = SimpleToken(modifier_word + tok.word + head_word, 'n')
            new_tok.translated = head_translated + joiner + modifier_translated
            merged[start_idx:k] = [new_tok]
            # Resume right after the new token; keeping the old index would
            # skip tokens whenever the modifier spanned more than one token.
            i = start_idx + 1

    @staticmethod
    def _capitalize_sentences(text, skip_first=False):
        """Upper-case the first letter after each ``.``, ``!`` or ``?``.

        With ``skip_first`` the very first piece is left as it is.
        The result is stripped.
        """
        pieces = _SENTENCE_SPLIT_RE.split(text)
        for idx in range(1 if skip_first else 0, len(pieces)):
            piece = pieces[idx]
            if not piece or piece.isspace() or piece[0] in '.!?':
                continue
            for pos, ch in enumerate(piece):
                if ch.isalpha():
                    pieces[idx] = piece[:pos] + ch.upper() + piece[pos + 1:]
                    break
        return "".join(pieces).strip()

    def _translate_pure_chinese_sentence(self, sentence, multi_option=False, mode=None,
                                         capitalize_first=True):
        """Translate a block that consists of Chinese text and CJK symbols.

        Pipeline: tokenize -> group cultivation terms -> translate tokens ->
        merge dictionary phrases -> reorder (skipped in ``hanviet`` mode) ->
        join, clean spacing, capitalise.
        """
        if not sentence or sentence.isspace():
            return ""

        active_mode = mode or self.translation_mode
        raw_tokens = self._tokenize(sentence, active_mode)
        if not raw_tokens:
            return ""

        # Step 1: Group numeral phrases and cultivation terms FIRST
        grouped = self._group_cultivation_terms(raw_tokens)
        # Step 2: Translate individual tokens
        for idx, tok in enumerate(grouped):
            self._translate_token(idx, tok, grouped, multi_option, active_mode)
        # Step 3: Greedy merge of adjacent tokens found in the dictionaries
        merged = self._merge_dictionary_phrases(grouped, multi_option, active_mode)
        # Step 4: Reordering grammar rules
        if active_mode != 'hanviet':
            self._reorder_adjective_noun(merged)
            self._reorder_de_phrases(merged)

        translated_text = " ".join(t.translated for t in merged if t.translated)
        translated_text = self.clean_punctuation_spacing(translated_text)
        return self._capitalize_sentences(translated_text, skip_first=not capitalize_first)

    def translate_paragraph(self, paragraph: str, multi_option: bool = False,
                            mode: Optional[str] = None) -> str:
        """Translate a paragraph (a single line of text).

        ``vietphrase`` and ``hanviet`` modes use longest-match lookups in a
        trie; every other mode splits on sentence terminators and delegates
        to ``translate_sentence``.  Empty or whitespace-only input is returned
        unchanged.
        """
        if not paragraph or paragraph.isspace():
            return paragraph

        active_mode = mode or self.translation_mode

        if active_mode in ('vietphrase', 'hanviet'):
            trie = self.vietphrase_trie if active_mode == 'vietphrase' else self.hanviet_trie
            prefer_hanviet = (active_mode == 'hanviet')

            words = []
            pending = []  # consecutive non-CJK characters (Latin words, numbers, ...)

            def flush_pending():
                # Emit the buffered run as ONE word; joining its characters
                # with spaces would turn "hello" into "h e l l o".
                if pending:
                    words.append("".join(pending))
                    pending.clear()

            i = 0
            text_length = len(paragraph)
            while i < text_length:
                length, translation, priority = trie.search_longest_match(paragraph, i)
                if length > 0:
                    flush_pending()
                    word = paragraph[i:i + length]
                    formatted = self.format_translation(
                        translation, multi_option, word, prefer_hanviet=prefer_hanviet)
                    # Capitalize if it is a proper name
                    if priority == 1 or word in self.proper_names:
                        formatted = self.capitalize_phrase(formatted)
                    words.append(formatted)
                    i += length
                    continue

                char = paragraph[i]
                i += 1
                if _HAN_RE.search(char):
                    flush_pending()
                    words.append(self.char_dict.get(char, char).split("/")[0])
                elif (char.isspace() or char in _TRIE_PUNCT_MAP
                      or _HAN_OR_CJK_SYMBOL_RE.search(char)):
                    flush_pending()
                    words.append(_TRIE_PUNCT_MAP.get(char, char))
                else:
                    pending.append(char)
            flush_pending()

            translated_text = self.clean_punctuation_spacing(" ".join(words))
            return self._capitalize_sentences(translated_text)

        # For advanced & fast modes, use normal sentence splitting & tokenization
        translated_parts = []
        for part in _SENTENCE_ENDS_RE.split(paragraph):
            if not part:
                continue
            if _SENTENCE_ENDS_RE.match(part):
                # Map every terminator of the run, not only one-character runs.
                translated_parts.append(part.translate(_TERMINATOR_TABLE))
            else:
                translated_parts.append(
                    self.translate_sentence(part, multi_option, mode=mode))
        return self.clean_punctuation_spacing("".join(translated_parts))

    def translate_text_node(self, text: str, multi_option: bool = False,
                            mode: Optional[str] = None) -> str:
        """Translate one text node taken from a DOM.

        The structure is preserved: the text is split on line breaks, every
        line is translated on its own and keeps its leading and trailing
        whitespace.  Blank lines and lines without Han characters are
        returned untouched.
        """
        if not text:
            return text

        translated_lines = []
        for line in text.split('\n'):
            body = line.strip()
            if not body or not _HAN_RE.search(body):
                # Blank line or no Han characters -> keep as is
                translated_lines.append(line)
                continue
            leading = re.match(r'^\s*', line).group(0)
            trailing = re.search(r'\s*$', line).group(0)
            translated_body = self.translate_paragraph(body, multi_option, mode=mode)
            translated_lines.append(leading + translated_body + trailing)
        return '\n'.join(translated_lines)

    def translate(self, text: str, multi_option: bool = False,
                  mode: Optional[str] = None) -> str:
        """Public entry point; see ``translate_text_node``."""
        return self.translate_text_node(text, multi_option=multi_option, mode=mode)