Spaces:
Running
Running
| import os | |
| import sys | |
| import re | |
| import jieba.posseg as pseg | |
| from backend.config import Config | |
# Matches a token that consists solely of ASCII digits and/or the Chinese
# numeral / quantity characters listed in the character class.
NUM_RE = re.compile(r'^[0-9一二三四五六七八九十百千万几数多半两]+$')

# Tokens treated as punctuation (tag 'x') by the fast tokenizer path.
# Repeated entries are harmless in a set literal and are kept as listed.
PUNCT_SET = {
    ',', '.', '!', '?', ';', ':',
    '"', '\'',
    '(', ')', '[', ']', '{', '}',
    ',', '。', '!', '?', ';', ':',
    '“', '”', '‘', '’',
    '(', ')', '【', '】', '《', '》',
    '、', '—', '~',
}
class SimpleToken:
    """Mutable token holding a surface word, its POS tag and its translation.

    ``flag`` is a read/write alias of ``tag`` so that these tokens can be used
    interchangeably with objects that expose the tag as ``.flag``.
    """

    def __init__(self, word, tag):
        self.word = word
        self.tag = tag
        # Filled in later by the translation step; None means "not translated yet".
        self.translated = None

    def __repr__(self):
        return f"{type(self).__name__}({self.word!r}, {self.tag!r})"

    @property
    def flag(self):
        """Return the POS tag (alias of ``tag``)."""
        return self.tag

    @flag.setter
    def flag(self, value):
        """Set the POS tag through the ``flag`` alias."""
        self.tag = value
| class VietphraseEngine: | |
def __init__(self, config=None):
    """Load dictionaries, pick the translation mode and warm up jieba.

    Args:
        config: Optional nested mapping. The keys read here are
            ``translation.mode`` (default "advanced") and
            ``translation.fast_mode``; a truthy ``fast_mode`` forces mode "fast".
    """
    self.config = config if config else {}
    self.load_dictionaries()

    # Resolve the translation mode; "fast_mode" overrides "mode".
    chosen_mode = self.config.get("translation", {}).get("mode", "advanced")
    if self.config.get("translation", {}).get("fast_mode", False):
        chosen_mode = "fast"
    self.translation_mode = chosen_mode

    # Segmenter tweaks and warm-up.
    import jieba
    try:
        # Keep overlapping words such as 重生于 from being split wrongly.
        jieba.add_word("重生", tag="v")
        for pair in (("生", "于"), ("着", "重"), ("醉", "人")):
            jieba.suggest_freq(pair, True)
        # School grades are tagged as plain nouns rather than person names (nr).
        for grade in ("高一", "高二", "高三"):
            jieba.add_word(grade, tag="n")
    except Exception as e:
        # Best effort: a failed tweak is reported but does not abort start-up.
        print("Error initializing custom word splits in Jieba:", e)

    # Both tokenizers are always prepared so the mode can be switched per call.
    self.jieba_tokenizer = jieba.dt
    self.pseg_dict = pseg.dt.word_tag_tab
    for _ in self.jieba_tokenizer.cut("暖洋洋"):
        pass
    for _ in pseg.cut("暖洋洋"):
        pass
def load_dictionaries(self):
    """Load all dictionaries, build the lookup tries and register names in jieba.

    The dictionary directory is the directory of the configured
    ``paths.dictionaries.vietphrase`` file; a missing or relative path is
    resolved against ``Config.ROOT_DIR``. Sets ``self.char_dict``,
    ``self.proper_names``, ``self.vietphrase``, ``self.vietphrase_trie`` and
    ``self.hanviet_trie``.
    """
    dictionary_paths = self.config.get("paths", {}).get("dictionaries", {})
    vp_path = dictionary_paths.get("vietphrase", "")
    if not (vp_path and os.path.isabs(vp_path)):
        vp_path = os.path.join(Config.ROOT_DIR, vp_path or "dictionaries/Vietphrase.txt")
    dict_dir = os.path.dirname(vp_path)

    def load_file_content(base_name):
        """Return the text of <base_name>.bin (XOR-decoded) or <base_name>.txt, else ""."""
        encrypted_path = os.path.join(dict_dir, base_name + ".bin")
        plain_path = os.path.join(dict_dir, base_name + ".txt")
        if os.path.exists(encrypted_path):
            with open(encrypted_path, "rb") as handle:
                payload = handle.read()
            # The .bin payload is XOR-ed with this key repeated over its length.
            key = "quick_translator_secret_key_2026".encode("utf-8")
            key_size = len(key)
            plain = bytes(b ^ key[pos % key_size] for pos, b in enumerate(payload))
            return plain.decode("utf-8")
        if os.path.exists(plain_path):
            with open(plain_path, "r", encoding="utf-8") as handle:
                return handle.read()
        return ""

    print("Loading dictionaries in VietphraseEngine...")
    self.char_dict = self.parse_dict_content(load_file_content("HanViet_CharDict"))

    # Optional CSV of extra readings: only fills characters that are still missing.
    import csv
    han_csv_path = os.path.join(dict_dir, "han_all_readings.csv")
    if os.path.exists(han_csv_path):
        try:
            with open(han_csv_path, 'r', encoding='utf-8') as f:
                for row in csv.DictReader(f):
                    char = row.get("Ký_tự", "").strip()
                    hv = row.get("Hán_Việt", "").strip()
                    if not (char and hv):
                        continue
                    if char not in self.char_dict:
                        self.char_dict[char] = hv.replace("~", "")
            print("Loaded han_all_readings.csv as fallback for missing Chinese characters.")
        except Exception as e:
            # Best effort: the fallback file is optional.
            print("Could not load han_all_readings.csv:", e)

    self.proper_names = self.parse_dict_content(
        load_file_content("Aligned_HanViet"), convert_to_simplified=True
    )
    self.vietphrase = self.parse_vietphrase_content(load_file_content("Vietphrase"))
    print("Dictionaries loaded successfully.")

    # Tries used by the 'vietphrase' and 'hanviet' paragraph modes.
    from .trie import Trie
    print("Building Tries for fast translation modes...")
    self.vietphrase_trie = Trie()
    # Proper names go in with priority 1, Vietphrase entries with priority 2.
    for source, priority in ((self.proper_names, 1), (self.vietphrase, 2)):
        for key, value in source.items():
            self.vietphrase_trie.insert(key, value, priority)
    self.hanviet_trie = Trie()
    for key, value in self.proper_names.items():
        self.hanviet_trie.insert(key, value, 2)
    print("Tries built successfully.")

    # Make every proper name known to the jieba segmenter.
    import jieba
    for name in self.proper_names:
        jieba.add_word(name)
def parse_dict_content(self, content, convert_to_simplified=False):
    """Parse ``key=value`` lines into a dict.

    Blank lines, lines without "=" and lines starting with "#" are skipped.
    Values are passed through ``clean_annotation``. When
    ``convert_to_simplified`` is true and ``hanziconv`` is importable, keys are
    converted with ``HanziConv.toSimplified``; otherwise keys are kept as-is.
    """
    entries = {}
    if not content:
        return entries

    def normalize_key(s):
        return s

    if convert_to_simplified:
        try:
            from hanziconv import HanziConv
        except ImportError:
            # Optional dependency: fall back to unconverted keys.
            pass
        else:
            normalize_key = HanziConv.toSimplified

    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith('#') or "=" not in entry:
            continue
        key_text, _, value_text = entry.partition("=")
        cleaned = self.clean_annotation(value_text.strip())
        entries[normalize_key(key_text.strip())] = cleaned
    return entries
def parse_vietphrase_content(self, content):
    """Parse Vietphrase ``left=right`` lines into a dict.

    Blank lines, lines without "=" and lines starting with "#" are skipped;
    the right side is passed through ``clean_annotation``. When both sides
    contain commas and split into the same number of non-empty items, each
    item pair becomes its own entry; otherwise the whole line is stored as
    one entry (unless the left side is empty).
    """
    entries = {}
    for raw_line in (content.splitlines() if content else ()):
        entry = raw_line.strip()
        if not entry or entry.startswith('#') or "=" not in entry:
            continue
        left_text, _, right_text = entry.partition("=")
        left = left_text.strip()
        right = self.clean_annotation(right_text.strip())

        if "," in left and "," in right:
            keys = [piece.strip() for piece in left.split(",") if piece.strip()]
            vals = [piece.strip() for piece in right.split(",") if piece.strip()]
            if len(keys) == len(vals):
                entries.update(zip(keys, vals))
                continue

        if left:
            entries[left] = right
    return entries
def is_number(self, word):
    """Return True when ``word`` is made only of digits / Chinese numeral characters."""
    numeral_pattern = r'^[0-9一二三四五六七八九十百千万几数多半两]+$'
    return re.match(numeral_pattern, word) is not None
def capitalize_phrase(self, phrase):
    """Capitalize every run of Latin/Vietnamese letters in ``phrase``.

    Each run is passed through ``str.capitalize``, so the first letter is
    upper-cased and the remaining letters of the run are lower-cased.
    Characters outside the letter class are left untouched.
    """
    chars = 'a-zA-ZàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđĐ'
    letter_run = re.compile('[' + chars + ']+')

    def capitalize_run(match):
        return match.group(0).capitalize()

    return letter_run.sub(capitalize_run, phrase)
def clean_annotation(self, text, mode='vietphrase'):
    """Strip dictionary annotations from a translation value.

    ``{a:b}`` groups are replaced by ``a`` when ``mode`` is 'vietphrase' and by
    ``b`` for any other mode; ``{a}`` becomes ``a``. ``(*...)`` notes are
    removed together with the whitespace in front of them. Falsy input
    yields "".
    """
    if not text:
        return ""

    keep_first = (mode == 'vietphrase')

    def resolve_braces(match):
        first, colon, second = match.group(1).partition(':')
        if not colon:
            return first.strip()
        chosen = first if keep_first else second
        return chosen.strip()

    without_braces = re.sub(r'\{([^{}]+)\}', resolve_braces, text)
    without_notes = re.sub(r'\s*\(\*[^)]*\)', '', without_braces)
    return without_notes.strip()
def format_translation(self, raw_value, multi_option, word=None, prefer_hanviet=False):
    """Pick the translation to show from a "/"-separated list of options.

    Options that are empty after stripping are dropped and duplicates are
    removed, keeping first occurrences. With ``multi_option`` and more than
    one option the result is ``first[second/third...]``. Otherwise, when
    ``prefer_hanviet`` is set and ``word`` has at least two characters, the
    option whose syllables hit the ``self.char_dict`` readings of the most
    characters wins (first one on ties), provided it hits at least one;
    in every other case the first option is returned.
    """
    if not raw_value:
        return ""

    # dict.fromkeys de-duplicates while keeping insertion order.
    candidates = list(dict.fromkeys(o for o in raw_value.split("/") if o.strip()))
    if not candidates:
        return ""

    primary = candidates[0]
    several = len(candidates) > 1
    if multi_option and several:
        return f"{primary}[{'/'.join(candidates[1:])}]"

    if not (prefer_hanviet and word and len(word) >= 2 and several):
        return primary

    # One set of lower-cased readings per character of the source word
    # (characters without readings contribute nothing).
    reading_sets = []
    for ch in word:
        if ch not in self.char_dict:
            continue
        readings = {r.strip().lower() for r in self.char_dict[ch].split('/') if r.strip()}
        if readings:
            reading_sets.append(readings)

    def alignment(option):
        syllables = {s.lower() for s in option.split()}
        return sum(1 for readings in reading_sets if not readings.isdisjoint(syllables))

    scores = [alignment(option) for option in candidates]
    top = max(scores)
    if top > 0:
        return candidates[scores.index(top)]
    return primary
def clean_punctuation_spacing(self, text):
    """Normalize whitespace around punctuation, brackets and hyphens.

    The substitutions below are applied in order; falsy input is returned
    unchanged and the result is stripped.
    """
    if not text:
        return text

    substitutions = (
        # One space after , ; . : ! ? unless a closer, quote or whitespace follows.
        (r'([,;.:!?])(?=[^\s)\]}』】”"’])', r'\1 '),
        # No whitespace in front of those marks.
        (r'\s+([,;.:!?])', r'\1'),
        # No whitespace just inside opening / closing brackets.
        (r'([(\[{『【«])\s+', r'\1'),
        (r'\s+([)\]}』】»])', r'\1'),
        # A space before an opener and after a closer when they touch other text.
        (r'(?<=[^\s(\[{『【«])([(\[{『【«])', r' \1'),
        (r'([)\]}』】»])(?=[^\s.,;:!?)\]}』】»])', r'\1 '),
        # Hyphens get exactly one space on each side.
        (r'\s*-\s*', ' - '),
        # Collapse any remaining whitespace runs.
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def translate_sentence(self, sentence, multi_option=False, mode=None):
    """Translate one sentence, leaving non-Chinese stretches untouched.

    The sentence is split into alternating non-Chinese / Chinese segments.
    A purely ASCII-alphanumeric segment that sits between two Chinese
    segments is merged into them so it is translated in context. Chinese
    segments go through ``_translate_pure_chinese_sentence``; everything
    else is copied verbatim.

    Args:
        sentence: Text to translate.
        multi_option: Forwarded to ``_translate_pure_chinese_sentence``.
        mode: Translation mode forwarded to ``_translate_pure_chinese_sentence``.

    Returns:
        The translated sentence; "" for empty or whitespace-only input; the
        input itself when it contains no Chinese characters or CJK symbols.
    """
    if not sentence or sentence.isspace():
        return ""

    # Han characters plus CJK punctuation and full-width forms.
    has_chinese = re.compile(r'[\u3400-\u9fff\U00020000-\U0002a6df\u3000-\u303f\uff00-\uffef]')
    if not has_chinese.search(sentence):
        return sentence

    # Splitting on a capturing group yields
    # [non-Chinese, Chinese, non-Chinese, ..., non-Chinese]: Chinese segments
    # sit at the odd indices and the list length is always odd.
    chinese_pattern = re.compile(r'([\u4e00-\u9fff\u3000-\u303f\uff00-\uffef]+)')
    parts = chinese_pattern.split(sentence)

    # Merge "Chinese + alnum + Chinese" into a single Chinese segment.
    # The bound requires parts[i+2] to exist: a trailing alphanumeric segment
    # has no following Chinese segment to merge with and is left as it is.
    i = 1
    while i + 2 < len(parts):
        non_chinese = parts[i + 1]
        if re.match(r'^\s*[a-zA-Z0-9]+\s*$', non_chinese):
            parts[i] = parts[i] + non_chinese + parts[i + 2]
            del parts[i + 1:i + 3]
            # Stay on the same index: the grown segment may absorb more.
        else:
            i += 2

    ends_sentence = re.compile(r'[.!?]\s*$')
    translated_parts = []
    capitalize_next = True
    for part in parts:
        if not part:
            continue
        if not has_chinese.search(part):
            # Non-Chinese segment: keep it exactly as written.
            translated_parts.append(part)
            if ends_sentence.search(part):
                capitalize_next = True
            elif part.strip():
                capitalize_next = False
            # A whitespace-only segment leaves the flag unchanged.
        else:
            trans = self._translate_pure_chinese_sentence(
                part, multi_option, mode, capitalize_first=capitalize_next
            )
            translated_parts.append(trans)
            capitalize_next = bool(ends_sentence.search(part) or ends_sentence.search(trans))
    return "".join(translated_parts)
def _translate_pure_chinese_sentence(self, sentence, multi_option=False, mode=None, capitalize_first=True):
    """Translate one Chinese text segment into Vietnamese.

    Steps: tokenize and tag the segment, fuse "number + rank keyword" runs
    into 'cultivation' tokens, translate every token, greedily merge adjacent
    tokens whose concatenation is a dictionary entry, reorder modifier/head
    pairs (skipped in 'hanviet' mode), then join, tidy punctuation spacing
    and capitalize sentence starts.

    Args:
        sentence: Text segment to translate.
        multi_option: Forwarded to ``format_translation``.
        mode: Translation mode; falls back to ``self.translation_mode``.
        capitalize_first: Whether the first sentence of the result is
            capitalized (later sentences always are).

    Returns:
        The translated text, or "" for empty/whitespace-only input or when
        tokenization yields nothing.
    """
    if not sentence or sentence.isspace():
        return ""
    active_mode = mode or self.translation_mode
    han_re = re.compile(r'[\u3400-\u9fff\U00020000-\U0002a6df]')

    # Tokenization & tagging depending on mode.
    if active_mode in ("advanced", "advanced_hanviet"):
        raw_tokens = [SimpleToken(t.word, t.flag) for t in pseg.cut(sentence)]
    else:
        # "fast", "vietphrase", "hanviet": plain segmentation, tags looked up
        # per word with 'n' as the default.
        raw_tokens = []
        for w in self.jieba_tokenizer.cut(sentence):
            if w in PUNCT_SET:
                tag = 'x'
            elif NUM_RE.match(w):
                tag = 'm'
            else:
                tag = self.pseg_dict.get(w, 'n')
            raw_tokens.append(SimpleToken(w, tag))
    if not raw_tokens:
        return ""

    NUM_KEYWORDS = {"重", "阶", "品", "级", "层", "剑", "星", "转", "天", "色", "关", "重天"}
    HANVIET_NUMBERS = {
        '0': 'Không', '1': 'Nhất', '2': 'Nhị', '3': 'Tam', '4': 'Tứ', '5': 'Ngũ', '6': 'Lục', '7': 'Thất', '8': 'Bát', '9': 'Cửu', '10': 'Thập',
        '一': 'Nhất', '二': 'Nhị', '三': 'Tam', '四': 'Tứ', '五': 'Ngũ', '六': 'Lục', '七': 'Thất', '八': 'Bát', '九': 'Cửu', '十': 'Thập',
        '百': 'Bách', '千': 'Thiên', '万': 'Vạn', '萬': 'Vạn', '几': 'Vài', '数': 'Số', '多': 'Đa', '半': 'Bán', '两': 'Lưỡng', '兩': 'Lưỡng'
    }
    VERB_TAGS = {'v', 'vd', 'vg', 'vi', 'vn'}
    HEAD_TAGS = {'n', 'nr', 'ns', 'nt', 'nz', 'ng'} | VERB_TAGS
    # (dictionary, prefer_hanviet) pairs used to describe lookup orders.
    proper_source = (self.proper_names, True)
    vietphrase_source = (self.vietphrase, False)

    def char_fallback(word):
        """Return ``word`` itself if it has no Han character, else its per-character readings."""
        if not han_re.search(word):
            return word
        return " ".join(self.char_dict.get(c, c).split("/")[0] for c in word)

    def lookup(word, sources):
        """Translate ``word`` from the first source containing it, else per character."""
        for table, prefer_hanviet in sources:
            if word in table:
                return self.format_translation(
                    table[word], multi_option, word, prefer_hanviet=prefer_hanviet
                )
        return char_fallback(word)

    def translate_single_token(idx, tok, list_of_tokens):
        """Fill ``tok.translated``; ``idx`` is the token's position in ``list_of_tokens``."""
        word = tok.word
        tag = tok.tag

        # Punctuation: map CJK marks to ASCII unless the token holds a known character.
        is_punct = (tag == 'x' or word in {',', '.', '!', '?', ';', ':', '"', '(', ')', '[', ']', '{', '}'})
        if is_punct and not any(char in self.char_dict for char in word):
            punct_map = {
                ',': ',', '。': '.', '「': '"', '」': '"', '、': ',', '?': '?', '!': '!',
                ':': ':', ';': ';', '“': '"', '”': '"', '(': '(', ')': ')'
            }
            tok.translated = punct_map.get(word, word)
            return

        # Number + 人 (e.g. 几十人, 三人).
        if len(word) > 1 and word.endswith('人') and self.is_number(word[:-1]):
            num_part = word[:-1]
            if num_part in self.vietphrase:
                num_trans = self.format_translation(self.vietphrase[num_part], multi_option, num_part)
            else:
                num_trans = " ".join([self.char_dict.get(c, c).split("/")[0] for c in num_part])
            tok.translated = f"{num_trans} người"
            return

        # 了: "rồi" when only quotes/brackets separate it from a clause-ending
        # mark or the end of the token list, "được" otherwise.
        # NOTE(review): the literal 'l' is kept from the original condition;
        # confirm whether it was meant to be another character.
        if word == 'l' or word == '了':
            is_at_end = True
            for next_tok in list_of_tokens[idx+1:]:
                if next_tok.word in {'"', '\'', '(', ')', '[', ']', '{', '}', '“', '”', '‘', '’', '(', ')', '【', '】', '《', '》'}:
                    continue
                is_at_end = next_tok.word in {',', '.', '!', '?', ';', ':', ',', '。', '!', '?', ';', ':', '、'}
                break
            tok.translated = "rồi" if is_at_end else "được"
            return

        # Cultivation realm: capitalized per-character reading, numerals via the table.
        if tag == 'cultivation':
            result = []
            for char in word:
                if char in HANVIET_NUMBERS:
                    result.append(HANVIET_NUMBERS[char])
                else:
                    result.append(self.char_dict.get(char, char).split("/")[0].capitalize())
            tok.translated = " ".join(result)
            return

        is_proper = tag in {'nr', 'ns', 'nt'}
        # Every tag starting with 'n' counts as a noun.
        is_noun = bool(tag) and tag.startswith('n')
        is_adj = tag in {'a', 'b', 'ad', 'an', 'z'}
        is_noun_or_adj = is_proper or is_noun or is_adj

        if is_proper:
            # Proper names never go through Vietphrase and are capitalized.
            tok.translated = lookup(word, (proper_source,))
            if tok.translated:
                tok.translated = self.capitalize_phrase(tok.translated)
        else:
            if active_mode == 'hanviet':
                order = (proper_source,)
            elif active_mode == 'vietphrase':
                order = (vietphrase_source, proper_source)
            elif is_noun_or_adj:
                # 'fast' / 'advanced' / 'advanced_hanviet': nouns and adjectives bypass Vietphrase.
                order = (proper_source,)
            elif active_mode == 'advanced_hanviet':
                order = (proper_source, vietphrase_source)
            else:
                order = (vietphrase_source, proper_source)
            tok.translated = lookup(word, order)

        # Drop a trailing "đích" / "Đích" from translations of words ending in 的.
        if tok.translated and word.endswith('的') and len(word) > 1:
            val = tok.translated
            if val.lower().endswith(' đích'):
                tok.translated = val[:-5]
            elif val.lower().endswith('đích'):
                tok.translated = val[:-4]

    # Step 1: group "number + keyword (+ following noun)" into one cultivation token.
    grouped = []
    i = 0
    while i < len(raw_tokens):
        tok = raw_tokens[i]
        if self.is_number(tok.word) and i + 1 < len(raw_tokens) and raw_tokens[i+1].word in NUM_KEYWORDS:
            grouped_word = tok.word + raw_tokens[i+1].word
            i_next = i + 2
            if i_next < len(raw_tokens) and raw_tokens[i_next].tag in {'n', 'nr', 'ns', 'nt', 'nz'}:
                grouped_word += raw_tokens[i_next].word
                i_next += 1
            grouped.append(SimpleToken(grouped_word, 'cultivation'))
            i = i_next
        else:
            grouped.append(SimpleToken(tok.word, tok.tag))
            i += 1

    # Step 2: translate each grouped token.
    for idx, tok in enumerate(grouped):
        translate_single_token(idx, tok, grouped)

    # Step 3: greedily merge 4, 3 or 2 adjacent tokens whose concatenation is a dictionary entry.
    i = 0
    merged = []
    while i < len(grouped):
        matched = False
        for length in range(min(4, len(grouped) - i), 1, -1):
            combined_word = "".join([grouped[i+k].word for k in range(length)])
            # Do not merge across a non-leading 的, so the parts stay reorderable.
            should_skip = False
            if 'đích' in combined_word or combined_word.find('的') > 0:
                should_skip = True
            elif i + length < len(grouped) and grouped[i+length].word == '的':
                # A following 的 must not lose its pronoun/noun/verb neighbour to a merge.
                if grouped[i+length-1].tag in {'r', 'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'v'}:
                    should_skip = True
            elif '是' in combined_word and any(
                p in combined_word
                for p in {'我', '你', '他', '她', '它', 'he', 'she', 'it', '们', '您', '自己'}
            ):
                # Keep copula + pronoun phrases (like '这是他', '那是我') unmerged for clause reordering.
                should_skip = True

            if active_mode == 'hanviet':
                in_dicts = (combined_word in self.proper_names)
            else:
                in_dicts = (combined_word in self.vietphrase or combined_word in self.proper_names)
            if should_skip or not in_dicts:
                continue

            combined_tag = None
            try:
                cut_res = list(pseg.cut(combined_word))
                if cut_res:
                    combined_tag = cut_res[0].flag
            except Exception:
                # Best effort: without a tagger result the constituent tags are used.
                pass
            if not combined_tag:
                # NOTE(review): the source listing lost its indentation; the
                # proper-noun scan is assumed to belong to this fallback branch.
                combined_tag = grouped[i].tag
                for k in range(length):
                    if grouped[i+k].tag in {'nr', 'ns', 'nt', 'nz'}:
                        combined_tag = grouped[i+k].tag
                        break
            new_tok = SimpleToken(combined_word, combined_tag)
            # The merged token is translated on its own, without neighbours.
            translate_single_token(0, new_tok, [new_tok])
            merged.append(new_tok)
            i += length
            matched = True
            break
        if not matched:
            merged.append(grouped[i])
            i += 1

    # Step 4: reordering grammar rules.
    if active_mode != 'hanviet':
        # Pass 1: modifier + head -> head + modifier; restart after every swap.
        changed = True
        while changed:
            changed = False
            i = 0
            while i < len(merged) - 1:
                t_a = merged[i]
                t_n = merged[i+1]
                # Never swap with prepositions/conjunctions/copulas/particles.
                if t_n.word in {'跟', '和', '与', '與', '同', '在', '从', '從', '自', '由', '向', '往', '朝', '对', '對', '给', '給', '比', '是', '叫', '让', '讓', '被', '把', '使', '令', '到', '了', '的', '而', '&', '并', '並', '以', '或', '者'}:
                    i += 1
                    continue
                is_modifier = t_a.tag in {'a', 'b'} or (t_a.word.endswith('的') and t_a.word != '的')
                if is_modifier and t_n.tag in HEAD_TAGS:
                    new_tok = SimpleToken(t_a.word + t_n.word, t_n.tag)
                    new_tok.translated = t_n.translated + " " + t_a.translated
                    merged[i:i+2] = [new_tok]
                    changed = True
                    break
                i += 1

        # Pass 2: "X 的 Y" -> "Y [của] X", with a multi-token lookahead for Y.
        NOUN_PHRASE_TAGS = {'n', 'nr', 'ns', 'nt', 'nz', 'ng', 'a', 'b', 'm', 'q', 'j', 'i'}
        LOOKAHEAD_TAGS = NOUN_PHRASE_TAGS | VERB_TAGS
        i = 1
        while i < len(merged) - 1:
            tok = merged[i]
            if tok.word in {'de', '的'}:
                t_x = merged[i-1]
                # Scan forward over the tokens that make up the phrase after 的.
                k = i + 1
                has_noun = False
                while k < len(merged):
                    tok_k = merged[k]
                    # Locality / orientation words end the phrase.
                    if tok_k.word in {'下', '上', '中', '里', '外', '内', '內', '后', '後', '前', '旁', '侧', '側', '底', '间', '間'}:
                        break
                    # Once a head was seen, a following adjective or verb is not part of it.
                    if has_noun and (tok_k.tag in {'a', 'b'} or tok_k.tag in VERB_TAGS):
                        break
                    if tok_k.tag in LOOKAHEAD_TAGS or tok_k.word == '色':
                        if tok_k.tag in HEAD_TAGS:
                            has_noun = True
                        k += 1
                    else:
                        break

                is_verb_modifier = t_x.tag in VERB_TAGS
                if k > i + 1 and not is_verb_modifier:
                    y_tokens = merged[i+1:k]
                    y_translated = " ".join([t.translated for t in y_tokens if t.translated])
                    y_word = "".join([t.word for t in y_tokens])
                    if t_x.tag != 'x':
                        # Extend the modifier backwards over nominal/adjectival tokens.
                        start_idx = i - 1
                        j_back = i - 2
                        while j_back >= 0 and merged[j_back].tag in (NOUN_PHRASE_TAGS | {'s', 't'}):
                            start_idx = j_back
                            j_back -= 1
                        modifier_tokens = merged[start_idx:i]
                        modifier_translated = " ".join([t.translated for t in modifier_tokens if t.translated])
                        modifier_word = "".join([t.word for t in modifier_tokens])
                        # A single name/pronoun modifier (not ending in 色) is joined with "của".
                        is_noun_modifier = t_x.tag in {'nr', 'r'} and not t_x.word.endswith('色')
                        if is_noun_modifier and start_idx == i - 1:
                            combined = y_translated + " của " + modifier_translated
                        else:
                            combined = y_translated + " " + modifier_translated
                        new_tok = SimpleToken(modifier_word + tok.word + y_word, 'n')
                        new_tok.translated = combined
                        merged[start_idx:k] = [new_tok]
                        # Re-check the same index: the list shrank around it.
                        continue
                else:
                    # NOTE(review): this else is assumed to pair with the
                    # "collected tokens and modifier is not a verb" test.
                    # No reordering: 的 contributes nothing to the output.
                    tok.translated = ""
            i += 1

    # Join words, then clean spacing and punctuation.
    translated_text = " ".join([t.translated for t in merged if t.translated])
    translated_text = self.clean_punctuation_spacing(translated_text)

    # Capitalize the first letter of each sentence; the first sentence only
    # when capitalize_first is set.
    sentences = re.split(r'([.!?]\s*)', translated_text)
    start_idx = 0 if capitalize_first else 1
    for idx in range(start_idx, len(sentences)):
        s = sentences[idx]
        if s and not s.isspace() and s[0] not in {'.', '!', '?'}:
            for c_idx, char in enumerate(s):
                if char.isalpha():
                    sentences[idx] = s[:c_idx] + char.upper() + s[c_idx+1:]
                    break
    return "".join(sentences).strip()
| def translate_paragraph(self, paragraph, multi_option=False, mode=None): | |
| if not paragraph or paragraph.isspace(): | |
| return paragraph | |
| active_mode = mode or self.translation_mode | |
| if active_mode in ('vietphrase', 'hanviet'): | |
| # Ultra-fast Trie-based translation path (50M+ characters/minute) | |
| trie = self.vietphrase_trie if active_mode == 'vietphrase' else self.hanviet_trie | |
| prefer_hanviet = (active_mode == 'hanviet') | |
| i = 0 | |
| text_length = len(paragraph) | |
| result_words = [] | |
| while i < text_length: | |
| length, translation, priority = trie.search_longest_match(paragraph, i) | |
| if length > 0: | |
| word = paragraph[i:i+length] | |
| formatted = self.format_translation(translation, multi_option, word, prefer_hanviet=prefer_hanviet) | |
| # Capitalize if it is a proper name | |
| if priority == 1 or word in self.proper_names: | |
| formatted = self.capitalize_phrase(formatted) | |
| result_words.append(formatted) | |
| i += length | |
| else: | |
| char = paragraph[i] | |
| if not re.search(r'[\u3400-\u9fff\U00020000-\U0002a6df]', char): | |
| punct_map = { | |
| ',': ',', '。': '.', '「': '"', '」': '"', '、': ',', '?': '?', '!': '!', | |
| ':': ':', ';': ';', '“': '"', '”': '"', '(': '(', ')': ')', | |
| '『': '"', '』': '"', '【': '[', '】': ']' | |
| } | |
| result_words.append(punct_map.get(char, char)) | |
| else: | |
| val = self.char_dict.get(char, char).split("/")[0] | |
| result_words.append(val) | |
| i += 1 | |
| translated_text = " ".join(result_words) | |
| translated_text = self.clean_punctuation_spacing(translated_text) | |
| # Sentence Capitalization | |
| sentences = re.split(r'([.!?]\s*)', translated_text) | |
| for idx in range(len(sentences)): | |
| s = sentences[idx] | |
| if s and not s.isspace() and not s[0] in {'.', '!', '?'}: | |
| for c_idx, char in enumerate(s): | |
| if char.isalpha(): | |
| sentences[idx] = s[:c_idx] + char.upper() + s[c_idx+1:] | |
| break | |
| return "".join(sentences).strip() | |
| # For advanced & fast modes, use normal sentence splitting & tokenization | |
| sentence_ends = re.compile(r'([。!?!?]+)') | |
| parts = sentence_ends.split(paragraph) | |
| translated_parts = [] | |
| for part in parts: | |
| if not part: | |
| continue | |
| if sentence_ends.match(part): | |
| punct_map = { | |
| '。': '.', '!': '!', '?': '?', ',': ',' | |
| } | |
| translated_parts.append(punct_map.get(part, part)) | |
| else: | |
| translated_parts.append(self.translate_sentence(part, multi_option, mode=mode)) | |
| return self.clean_punctuation_spacing("".join(translated_parts)) | |
def translate_text_node(self, text, multi_option=False, mode=None):
    """Translate a text node taken from the DOM.

    The structure is fully preserved: the text is split on line breaks, each
    line is translated on its own with its leading and trailing whitespace
    kept, and the lines are joined back with "\\n". Blank lines and lines
    without Han characters are passed through unchanged.
    """
    if not text:
        return text

    han = re.compile(r'[\u3400-\u9fff\U00020000-\U0002a6df]')
    output = []
    for raw_line in text.split('\n'):
        core = raw_line.strip()
        if not core or not han.search(core):
            # Blank line or nothing to translate: keep the line as it is.
            output.append(raw_line)
            continue
        prefix = re.match(r'^\s*', raw_line).group(0)
        suffix = re.search(r'\s*$', raw_line).group(0)
        translated_core = self.translate_paragraph(core, multi_option, mode=mode)
        output.append(prefix + translated_core + suffix)
    return '\n'.join(output)
def translate(self, text, multi_option=False, mode=None):
    """Translate ``text``; thin public wrapper around ``translate_text_node``."""
    forwarded = {"multi_option": multi_option, "mode": mode}
    return self.translate_text_node(text, **forwarded)