"""Vietnamese text frontend: normalization and grapheme-to-phoneme conversion.

Provides two G2P paths that both return ``(phones, tones, word2ph)``:

* ``g2p_with_phonemes`` parses pre-phonemized IPA text (dataset field).
* ``g2p`` / ``g2p_char_based`` map Vietnamese spelling to IPA-like symbols.

``word2ph`` holds the number of phones assigned to each token (including one
slot for each of the two ``"_"`` boundary tokens).
"""
import re
import unicodedata

from transformers import AutoTokenizer

from . import punctuation, symbols

# Vietnamese BERT model
model_id = 'vinai/phobert-base-v2'
tokenizer = None


def get_tokenizer():
    """Return the shared tokenizer for ``model_id``, loading it on first use."""
    global tokenizer
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer


# Vietnamese IPA phoneme set based on VieNeu-TTS-140h dataset
# These are extracted from the phonemized_text field in the dataset
VI_IPA_CONSONANTS = [
    'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
    'n', 'p', 'r', 's', 't', 'v', 'w', 'x', 'z',
    'ŋ',   # ng
    'ɲ',   # nh
    'ʈ',   # tr
    'ɖ',   # đ
    'tʰ',  # th
    'kʰ',  # kh
    'ʂ',   # s (southern)
    'ɣ',   # g (southern)
    'χ',   # x (some dialects)
]

VI_IPA_VOWELS = [
    'a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y',
    'ə',  # ơ
    'ɛ',  # e
    'ɔ',  # o
    'ɯ',  # ư
    'ɤ',  # ơ variant
    'ɐ',  # a short
    'ʊ',  # u short
    'ɪ',  # i short
    'ʌ',  # â
    'æ',  # a variant
]

# Vietnamese tone markers (numbers 1-6 or ˈ ˌ for stress)
VI_TONE_MARKERS = ['1', '2', '3', '4', '5', '6', 'ˈ', 'ˌ', 'ː']

# Combined IPA symbols used in VieNeu-TTS dataset
VI_IPA_SYMBOLS = [
    # Consonants
    'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
    'n', 'p', 'r', 's', 't', 'v', 'w', 'x', 'z',
    'ŋ', 'ɲ', 'ʈ', 'ɖ', 'ʂ', 'ɣ', 'χ', 'ʔ',
    # Vowels
    'a', 'ă', 'e', 'i', 'o', 'u', 'y',
    'ə', 'ɛ', 'ɔ', 'ɯ', 'ɤ', 'ɐ', 'ʊ', 'ɪ', 'ʌ', 'æ', 'ɑ',
    # Special markers
    'ˈ', 'ˌ', 'ː',
    # Tone numbers
    '1', '2', '3', '4', '5', '6',
]

# Combining tone marks (as produced by NFD decomposition) -> tone number.
_TONE_MARK_TO_TONE = {
    '\u0300': 2,  # à - huyền
    '\u0301': 1,  # á - sắc
    '\u0303': 3,  # ã - ngã
    '\u0309': 4,  # ả - hỏi
    '\u0323': 5,  # ạ - nặng
}

# Longest spelling unit in _VI_TO_IPA ('ngh'); lookups try longest match first.
_MAX_GRAPHEME_LEN = 3

# Vietnamese spelling -> IPA mapping (intended to match the training data).
# A list value means the grapheme expands to several separate phone symbols.
_VI_TO_IPA = {
    # Multi-char consonants (matched before single letters by the lookup code)
    'ngh': 'ŋ',
    'ng': 'ŋ',
    'nh': 'ɲ',
    'ch': ['t', 'ʃ'],  # Vietnamese ch = IPA t + ʃ (separated in training data)
    'tr': 'ʈ',         # retroflex
    'th': ['t', 'h'],  # aspirated th
    'ph': 'f',
    'kh': 'x',         # Vietnamese 'kh' = IPA 'x' (matches training data)
    'gh': 'ɣ',
    'gi': 'z',
    'qu': 'kw',        # qu -> kw (single symbol in training data)
    # Special Vietnamese consonants
    'đ': 'ɗ',          # implosive d
    # Basic consonants that need IPA mapping
    'x': 's',          # Vietnamese 'x' = IPA 's'
    'c': 'k',          # Vietnamese 'c' = IPA 'k'
    'd': 'z',          # Vietnamese 'd' (northern) = 'z'
    'r': 'ɹ',          # Vietnamese 'r' = IPA 'ɹ' (matches training data)
    's': 's',
    'b': 'b',
    'g': 'ɣ',
    'h': 'h',
    'k': 'k',
    'l': 'l',
    'm': 'm',
    'n': 'n',
    'p': 'p',
    't': 't',
    'v': 'v',
    'f': 'f',
    'j': 'j',
    'w': 'w',
    'y': 'j',          # Vietnamese 'y' = IPA 'j' (matches training data)
    # Vowels - MUST match training data phonemes exactly!
    'a': 'aː',         # Long 'a' (matches training: aː)
    'ă': 'a',          # Short 'a'
    'â': 'ə',          # schwa
    'e': 'ɛ',          # open-mid (matches training: ɛ)
    'ê': 'e',          # close-mid
    'i': 'i',
    'o': 'ɔ',          # open-mid back (matches training: ɔ)
    'ô': 'o',          # close-mid back
    'ơ': 'əː',         # long schwa
    'u': 'u',
    'ư': 'ɯ',          # close back unrounded
}


def normalize_vietnamese_text(text):
    """Normalize Vietnamese text.

    Applies NFC normalization, collapses whitespace runs to single spaces,
    strips the ends, and spells out standalone single digits.
    """
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert numbers to words (basic)
    return convert_numbers_to_vietnamese(text)


def convert_numbers_to_vietnamese(text):
    """Convert numbers to Vietnamese words (basic implementation).

    Only standalone single digits (a digit with word boundaries on both
    sides) are replaced; multi-digit numbers are left untouched.
    """
    num_map = {
        '0': 'không', '1': 'một', '2': 'hai', '3': 'ba', '4': 'bốn',
        '5': 'năm', '6': 'sáu', '7': 'bảy', '8': 'tám', '9': 'chín',
        '10': 'mười', '100': 'trăm', '1000': 'nghìn'
    }

    def replace_num(match):
        # Digits without a mapping (e.g. non-ASCII digits) pass through.
        num = match.group(0)
        return num_map.get(num, num)

    # Only replace standalone numbers
    return re.sub(r'\b\d\b', replace_num, text)


def text_normalize(text):
    """Normalize text for Vietnamese TTS."""
    return normalize_vietnamese_text(text)


def _flush_word(word_phones, tone, phones, tones, word2ph):
    """Append one word's phones, all carrying ``tone``, to the output lists."""
    if not word_phones:
        return
    phones.extend(word_phones)
    tones.extend([tone] * len(word_phones))
    word2ph.append(len(word_phones))


def parse_ipa_phonemes(phonemized_text):
    """
    Parse IPA phonemized text from VieNeu-TTS dataset.

    Example: "ŋˈyə2j ŋˈyə2j bˈan xwˈan vˈe2"

    Each whitespace-separated word is split into single-character phones.
    Stress marks are dropped, a length mark 'ː' is attached to the preceding
    phone, and a decimal digit sets the tone for the phones of the current
    segment. Punctuation becomes its own one-phone entry with tone 0.

    Returns: phones, tones, word2ph (no boundary tokens are added here)
    """
    phones = []
    tones = []
    word2ph = []

    for word in phonemized_text.strip().split():
        word_phones = []
        current_tone = 0  # Default tone when the word carries no tone digit

        for char in word:
            # isdecimal() (not isdigit()) so that int() cannot fail on
            # characters such as superscript digits.
            if char.isdecimal():
                current_tone = int(char)
            elif char in ('ˈ', 'ˌ'):
                # Primary or secondary stress - not represented in the output
                continue
            elif char == 'ː':
                # Long vowel marker - append to previous phone if exists
                if word_phones:
                    word_phones[-1] += 'ː'
            elif char in punctuation:
                _flush_word(word_phones, current_tone, phones, tones, word2ph)
                word_phones = []
                # Reset so the tone does not leak into the text that follows
                # the punctuation (same behaviour as g2p_char_based).
                current_tone = 0
                phones.append(char)
                tones.append(0)
                word2ph.append(1)
            else:
                # Regular phoneme
                word_phones.append(char)

        # Apply collected tone to all remaining phones in this word
        _flush_word(word_phones, current_tone, phones, tones, word2ph)

    return phones, tones, word2ph


def g2p_ipa(text):
    """
    Convert text to phonemes using external IPA converter.

    This is a fallback for when phonemized_text is not available.
    For training, we use the pre-phonemized text from the dataset.

    If ``viphoneme`` cannot be imported, the character-based converter is
    used instead. Either way the result carries exactly one "_" boundary
    token at each end.
    """
    try:
        from viphoneme import vi2ipa
        phonemized = vi2ipa(text)
        phones, tones, word2ph = parse_ipa_phonemes(phonemized)
    except ImportError:
        # g2p_char_based already adds the boundary tokens; returning here
        # avoids wrapping the sequence in a second pair of "_" tokens.
        return g2p_char_based(text)

    # Add start and end tokens
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph


def _split_tone_marks(word):
    """Split ``word`` into letters and tone marks.

    The word is NFD-decomposed; tone marks stay as separate items, while any
    other combining mark (circumflex, breve, horn) is recomposed onto its
    base letter so that letters like 'â' or 'ơ' survive as single characters.
    Marks that cannot be composed onto the base letter are dropped.
    """
    units = []
    base_index = None  # position in ``units`` of the most recent base letter
    for char in unicodedata.normalize('NFD', word):
        if char in _TONE_MARK_TO_TONE:
            units.append(char)
        elif unicodedata.combining(char):
            # NFD may order a tone mark before a vowel-quality mark (e.g. the
            # dot below precedes the circumflex), so compose onto the base
            # letter by index rather than onto the previous item.
            if base_index is not None:
                composed = unicodedata.normalize(
                    'NFC', units[base_index] + char)
                if len(composed) == 1:
                    units[base_index] = composed
        else:
            base_index = len(units)
            units.append(char)
    return units


def g2p_char_based(text):
    """
    Character-based G2P with Vietnamese to IPA mapping.

    Words are split on whitespace. Within a word, tone marks set the tone of
    the current segment, punctuation is emitted as its own phone with tone 0,
    and spelling units are mapped through ``_VI_TO_IPA`` trying trigraphs,
    then digraphs, then single letters. Unmapped characters are emitted
    lower-cased as-is.

    Returns: phones, tones, word2ph, each including the "_" boundary tokens.
    """
    phones = []
    tones = []
    word2ph = []

    for word in text.split():
        chars = _split_tone_marks(word)
        word_phones = []
        current_tone = 0
        i = 0

        while i < len(chars):
            char = chars[i]

            if char in _TONE_MARK_TO_TONE:
                current_tone = _TONE_MARK_TO_TONE[char]
                i += 1
                continue

            if char in punctuation:
                _flush_word(word_phones, current_tone, phones, tones, word2ph)
                word_phones = []
                phones.append(char)
                tones.append(0)
                word2ph.append(1)
                current_tone = 0
                i += 1
                continue

            # Longest-match lookup: trigraph, then digraph, then single char
            result = None
            consumed = 1
            for size in range(_MAX_GRAPHEME_LEN, 0, -1):
                if i + size > len(chars):
                    continue
                key = ''.join(c.lower() for c in chars[i:i + size])
                if key in _VI_TO_IPA:
                    result = _VI_TO_IPA[key]
                    consumed = size
                    break

            if result is None:
                # Unknown character: keep it (lower-cased) as its own phone
                word_phones.append(char.lower())
            elif isinstance(result, list):
                word_phones.extend(result)
            else:
                word_phones.append(result)
            i += consumed

        _flush_word(word_phones, current_tone, phones, tones, word2ph)

    # Add boundary tokens
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph


def g2p(text):
    """
    Main G2P function for Vietnamese.

    Uses character-to-IPA mapping with BERT alignment: when the per-word
    phone counts do not line up with the tokenizer output, the phones are
    redistributed evenly across the tokens.

    Returns: phones, tones, word2ph (all including boundary tokens)
    """
    tok = get_tokenizer()
    norm_text = text_normalize(text)

    # Tokenize for BERT alignment
    tokenized = tok.tokenize(norm_text)

    # Use character-based G2P with IPA mapping
    phones, tones, word2ph = g2p_char_based(norm_text)

    # Ensure word2ph aligns with tokenized output
    # PhoBERT uses subword tokenization, so we need to distribute phones
    if len(word2ph) != len(tokenized) + 2:  # +2 for start/end tokens
        # Exclude the two boundary phones: they get their own [1] slots
        # below, so counting them here would make sum(word2ph) exceed
        # len(phones) by two.
        total_phones = len(phones) - 2
        new_word2ph = distribute_phones(total_phones, len(tokenized))
        word2ph = [1] + new_word2ph + [1]

    return phones, tones, word2ph


def g2p_with_phonemes(text, phonemized_text):
    """
    G2P using pre-phonemized text from dataset.

    This is the recommended method for training. Phones and tones come from
    ``phonemized_text``; ``text`` is only tokenized to decide how the phones
    are spread over tokens.

    Returns: phones, tones, word2ph (all including boundary tokens)
    """
    tok = get_tokenizer()

    # Parse IPA phonemes
    phones, tones, word2ph = parse_ipa_phonemes(phonemized_text)

    # Add boundary tokens
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]

    # Get tokenized text for BERT alignment
    tokenized = tok.tokenize(text)

    # Distribute word2ph to match tokenized output + boundaries
    if word2ph:
        total_phones = sum(word2ph)
        new_word2ph = distribute_phones(total_phones, len(tokenized))
        word2ph = [1] + new_word2ph + [1]
    else:
        # No phones were parsed: fall back to one slot per token
        word2ph = [1] + [1] * len(tokenized) + [1]

    return phones, tones, word2ph


def distribute_phones(n_phone, n_word):
    """Distribute phones across words as evenly as possible.

    Returns a list of ``n_word`` counts summing to ``n_phone``; the first
    ``n_phone % n_word`` entries receive one extra phone. Returns an empty
    list when ``n_word`` is 0.
    """
    if n_word == 0:
        return []
    base, remainder = divmod(n_phone, n_word)
    return [base + 1] * remainder + [base] * (n_word - remainder)


def get_bert_feature(text, word2ph, device='cuda'):
    """Get BERT features for Vietnamese text.

    Delegates to ``vietnamese_bert.get_bert_feature`` using this module's
    ``model_id``; the import is deferred until the function is called.
    """
    from . import vietnamese_bert
    return vietnamese_bert.get_bert_feature(
        text, word2ph, device=device, model_id=model_id)


if __name__ == "__main__":
    # Test
    test_text = "Xin chào, tôi là một trợ lý AI."
    test_phonemes = "sˈin tʂˈaːw, tˈoj lˈaː2 mˈo6t tʂˈɤ4 lˈi4 ˌaːˈi."

    print("Test text:", test_text)
    print("Normalized:", text_normalize(test_text))

    # Test with phonemes
    phones, tones, word2ph = g2p_with_phonemes(test_text, test_phonemes)
    print("Phones:", phones)
    print("Tones:", tones)
    print("Word2Ph:", word2ph)

    # Test without phonemes
    phones2, tones2, word2ph2 = g2p(test_text)
    print("\nChar-based phones:", phones2)
    print("Char-based tones:", tones2)