| | import re |
| | import unicodedata |
| | from transformers import AutoTokenizer |
| | from . import punctuation, symbols |
| |
|
| | |
# Hugging Face model id for the Vietnamese PhoBERT tokenizer.
model_id = 'vinai/phobert-base-v2'
# Lazily-initialized module-level singleton; populated by get_tokenizer().
tokenizer = None
| |
|
def get_tokenizer():
    """Return the PhoBERT tokenizer, loading and caching it on first use."""
    global tokenizer
    if tokenizer is not None:
        return tokenizer
    # First call: download/load once, then reuse the cached instance.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer
| |
|
| | |
| | |
# Consonant symbols: plain Latin letters plus IPA-specific consonants.
VI_IPA_CONSONANTS = [
    'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'w', 'x', 'z',
    'ŋ',   # velar nasal
    'ɲ',   # palatal nasal
    'ʈ',   # voiceless retroflex stop
    'ɖ',   # voiced retroflex stop
    'tʰ',  # aspirated t
    'kʰ',  # aspirated k
    'ʂ',   # voiceless retroflex fricative
    'ɣ',   # voiced velar fricative
    'χ',   # voiceless uvular fricative
]

# Vowel symbols: Vietnamese orthographic vowels plus IPA vowels.
VI_IPA_VOWELS = [
    'a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y',
    'ə',  # schwa
    'ɛ',  # open-mid front unrounded
    'ɔ',  # open-mid back rounded
    'ɯ',  # close back unrounded
    'ɤ',  # close-mid back unrounded
    'ɐ',  # near-open central
    'ʊ',  # near-close back rounded
    'ɪ',  # near-close front unrounded
    'ʌ',  # open-mid back unrounded
    'æ',  # near-open front unrounded
]

# Tone digits 1-6 plus IPA stress ('ˈ', 'ˌ') and length ('ː') marks,
# as consumed by parse_ipa_phonemes().
VI_TONE_MARKERS = ['1', '2', '3', '4', '5', '6', 'ˈ', 'ˌ', 'ː']

# Full symbol inventory for the IPA front-end.
# NOTE(review): this is not the union of the lists above — it adds 'ʔ' and
# 'ɑ' and omits the aspirated digraphs and most diacritic vowels; confirm
# against the model's symbol table.
VI_IPA_SYMBOLS = [
    # consonants
    'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'w', 'x', 'z',
    'ŋ', 'ɲ', 'ʈ', 'ɖ', 'ʂ', 'ɣ', 'χ', 'ʔ',
    # vowels
    'a', 'ă', 'e', 'i', 'o', 'u', 'y',
    'ə', 'ɛ', 'ɔ', 'ɯ', 'ɤ', 'ɐ', 'ʊ', 'ɪ', 'ʌ', 'æ', 'ɑ',
    # stress / length marks
    'ˈ', 'ˌ', 'ː',
    # tone digits
    '1', '2', '3', '4', '5', '6',
]
| |
|
def normalize_vietnamese_text(text):
    """Normalize Vietnamese text: NFC form, collapsed whitespace, digits to words."""
    # Canonical composition so diacritics arrive as single codepoints.
    normalized = unicodedata.normalize('NFC', text)
    # Collapse any whitespace runs to single spaces and trim both ends.
    normalized = ' '.join(normalized.split())
    # Spell out the digit sequences we have a mapping for.
    return convert_numbers_to_vietnamese(normalized)
| |
|
def convert_numbers_to_vietnamese(text):
    """Convert digit sequences to Vietnamese words (basic implementation).

    Only numbers with an explicit entry in the mapping table are converted
    (single digits plus 10 / 100 / 1000); any other digit sequence is
    returned unchanged.
    """
    num_map = {
        '0': 'không', '1': 'một', '2': 'hai', '3': 'ba', '4': 'bốn',
        '5': 'năm', '6': 'sáu', '7': 'bảy', '8': 'tám', '9': 'chín',
        '10': 'mười', '100': 'trăm', '1000': 'nghìn'
    }

    def replace_num(match):
        # Unmapped numbers fall back to their original text.
        num = match.group(0)
        return num_map.get(num, num)

    # Match whole digit runs. The previous pattern r'\b\d\b' matched only
    # single digits, which made the '10'/'100'/'1000' entries unreachable.
    text = re.sub(r'\b\d+\b', replace_num, text)
    return text
| |
|
def text_normalize(text):
    """Normalize text for Vietnamese TTS (delegates to normalize_vietnamese_text)."""
    return normalize_vietnamese_text(text)
| |
|
def parse_ipa_phonemes(phonemized_text):
    """
    Parse IPA phonemized text from the VieNeu-TTS dataset.

    Example: "ŋˈyə2j ŋˈyə2j bˈan xwˈan vˈe2"

    Tone digits inside a word set the tone for every phone of that word;
    stress marks ('ˈ', 'ˌ') are dropped; the length mark 'ː' is merged into
    the preceding phone; punctuation becomes its own single-phone entry with
    tone 0.

    Returns:
        phones: list of phone symbols.
        tones: per-phone tone ids (0 = untoned / punctuation).
        word2ph: phones contributed by each word / punctuation mark.
    """
    phones = []
    tones = []
    word2ph = []

    words = phonemized_text.strip().split()

    for word in words:
        word_phones = []

        i = 0
        current_tone = 0

        while i < len(word):
            char = word[i]

            # Tone digit: applies to all phones of the word.
            if char.isdigit():
                current_tone = int(char)
                i += 1
                continue

            # Stress markers are not modeled; skip them.
            if char in ['ˈ', 'ˌ']:
                i += 1
                continue

            # Length mark attaches to the previous phone ('a' + 'ː' -> 'aː').
            if char == 'ː':
                if word_phones:
                    word_phones[-1] = word_phones[-1] + 'ː'
                i += 1
                continue

            # Punctuation: flush pending phones, then emit the mark itself.
            if char in punctuation:
                if word_phones:
                    phones.extend(word_phones)
                    tones.extend([current_tone] * len(word_phones))
                    word2ph.append(len(word_phones))
                    word_phones = []
                phones.append(char)
                tones.append(0)
                word2ph.append(1)
                i += 1
                continue

            word_phones.append(char)
            i += 1

        # Flush whatever remains of the word.
        if word_phones:
            phones.extend(word_phones)
            tones.extend([current_tone] * len(word_phones))
            word2ph.append(len(word_phones))

    return phones, tones, word2ph
| |
|
def g2p_ipa(text):
    """
    Convert text to phonemes using the external `viphoneme` IPA converter.

    Fallback path for when pre-phonemized text is unavailable; training
    normally uses phonemes straight from the dataset. Falls back to the
    character-based G2P when `viphoneme` is not installed.
    """
    try:
        from viphoneme import vi2ipa
        phones, tones, word2ph = parse_ipa_phonemes(vi2ipa(text))
    except ImportError:
        phones, tones, word2ph = g2p_char_based(text)

    # Pad both ends with the silence phone expected downstream.
    return (
        ["_"] + phones + ["_"],
        [0] + tones + [0],
        [1] + word2ph + [1],
    )
| |
|
def g2p_char_based(text):
    """
    Character-based G2P with a Vietnamese-to-IPA mapping.

    Each word is NFD-decomposed so the five tone diacritics can be read off
    as combining marks; the remaining characters are then NFC-recomposed so
    quality diacritics ('â', 'ê', 'ô', 'ơ', 'ư', 'ă', 'đ') survive as single
    characters and hit their vi_to_ipa entries.

    Returns:
        phones, tones, word2ph — each padded with a leading/trailing "_".
    """
    phones = []
    tones = []
    word2ph = []

    # Combining tone diacritics -> tone id.
    tone_marks = {
        '\u0300': 2,  # grave (huyền)
        '\u0301': 1,  # acute (sắc)
        '\u0303': 3,  # tilde (ngã)
        '\u0309': 4,  # hook above (hỏi)
        '\u0323': 5,  # dot below (nặng)
    }

    vi_to_ipa = {
        # Multi-letter onsets, matched longest-first below.
        'ngh': 'ŋ',
        'ng': 'ŋ',
        'nh': 'ɲ',
        'ch': ['t', 'ʃ'],
        'tr': 'ʈ',
        'th': ['t', 'h'],
        'ph': 'f',
        'kh': 'x',
        'gh': 'ɣ',
        'gi': 'z',
        'qu': 'kw',

        'đ': 'ɗ',

        # Single consonants.
        'x': 's',
        'c': 'k',
        'd': 'z',
        'r': 'ɹ',
        's': 's',
        'b': 'b',
        'g': 'ɣ',
        'h': 'h',
        'k': 'k',
        'l': 'l',
        'm': 'm',
        'n': 'n',
        'p': 'p',
        't': 't',
        'v': 'v',
        'f': 'f',
        'j': 'j',
        'w': 'w',
        'y': 'j',

        # Vowels.
        'a': 'aː',
        'ă': 'a',
        'â': 'ə',
        'e': 'ɛ',
        'ê': 'e',
        'i': 'i',
        'o': 'ɔ',
        'ô': 'o',
        'ơ': 'əː',
        'u': 'u',
        'ư': 'ɯ',
    }

    def emit(result, word_phones):
        # A mapping value may be a single phone or a list of phones.
        if isinstance(result, list):
            word_phones.extend(result)
        else:
            word_phones.append(result)

    words = text.split()
    for word in words:
        # Strip tone diacritics, then re-compose the rest. The previous
        # version iterated raw NFD output and skipped every non-tone
        # combining mark, so e.g. 'ô' arrived as 'o' + U+0302 and mapped
        # to 'ɔ' while the circumflex was dropped — the 'â'/'ê'/'ô'/'ơ'/
        # 'ư'/'ă' entries in vi_to_ipa were unreachable.
        decomposed = unicodedata.normalize('NFD', word)
        current_tone = 0
        base_chars = []
        for ch in decomposed:
            if ch in tone_marks:
                current_tone = tone_marks[ch]
            else:
                base_chars.append(ch)
        recomposed = unicodedata.normalize('NFC', ''.join(base_chars))

        word_phones = []
        chars = list(recomposed)
        i = 0
        while i < len(chars):
            char = chars[i]

            if char in punctuation:
                # Flush pending phones, then emit the punctuation mark
                # as its own single-phone entry with tone 0.
                if word_phones:
                    phones.extend(word_phones)
                    tones.extend([current_tone] * len(word_phones))
                    word2ph.append(len(word_phones))
                    word_phones = []
                phones.append(char)
                tones.append(0)
                word2ph.append(1)
                i += 1
                continue

            if unicodedata.combining(char):
                # Stray combining mark that survived recomposition.
                i += 1
                continue

            lower_char = char.lower()
            matched = False

            # Trigraph ('ngh') first, then digraph, then single character.
            if i + 2 < len(chars):
                trigraph = lower_char + chars[i + 1].lower() + chars[i + 2].lower()
                if trigraph in vi_to_ipa:
                    emit(vi_to_ipa[trigraph], word_phones)
                    i += 3
                    matched = True

            if not matched and i + 1 < len(chars):
                digraph = lower_char + chars[i + 1].lower()
                if digraph in vi_to_ipa:
                    emit(vi_to_ipa[digraph], word_phones)
                    i += 2
                    matched = True

            if not matched:
                # Unknown characters pass through unchanged.
                emit(vi_to_ipa.get(lower_char, lower_char), word_phones)
                i += 1

        if word_phones:
            phones.extend(word_phones)
            tones.extend([current_tone] * len(word_phones))
            word2ph.append(len(word_phones))

    # Sentence padding expected by downstream alignment.
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]

    return phones, tones, word2ph
| |
|
def g2p(text):
    """
    Main G2P function for Vietnamese.
    Uses character-to-IPA mapping with BERT alignment.
    """
    norm_text = text_normalize(text)
    bert_tokens = get_tokenizer().tokenize(norm_text)

    phones, tones, word2ph = g2p_char_based(norm_text)

    # Re-align when word segmentation disagrees with the BERT tokenization;
    # the +2 accounts for the leading/trailing "_" padding entries.
    if len(word2ph) != len(bert_tokens) + 2:
        redistributed = distribute_phones(sum(word2ph), len(bert_tokens))
        word2ph = [1] + redistributed + [1]

    return phones, tones, word2ph
| |
|
def g2p_with_phonemes(text, phonemized_text):
    """
    G2P using pre-phonemized text from the dataset (recommended for training).

    Phones and tones come from `phonemized_text`; word2ph is redistributed
    so it lines up with the PhoBERT tokenization of `text`.
    """
    tok = get_tokenizer()

    phones, tones, word2ph = parse_ipa_phonemes(phonemized_text)

    # Silence padding on both ends.
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]

    bert_tokens = tok.tokenize(text)

    if not word2ph:
        # No parsed words at all: give every token one phone slot.
        word2ph = [1] + [1] * len(bert_tokens) + [1]
    else:
        redistributed = distribute_phones(sum(word2ph), len(bert_tokens))
        word2ph = [1] + redistributed + [1]

    return phones, tones, word2ph
| |
|
def distribute_phones(n_phone, n_word):
    """Split n_phone across n_word slots as evenly as possible.

    The first (n_phone % n_word) slots receive one extra phone, so the
    result always sums to n_phone. Returns [] when n_word is 0.
    """
    if n_word == 0:
        return []
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
| |
|
def get_bert_feature(text, word2ph, device='cuda'):
    """Get PhoBERT features for Vietnamese text (delegates to vietnamese_bert)."""
    # Imported lazily so merely importing this module never loads the model.
    from . import vietnamese_bert
    return vietnamese_bert.get_bert_feature(
        text, word2ph, device=device, model_id=model_id
    )
| |
|
| |
|
if __name__ == "__main__":
    # Quick smoke test for both G2P paths.
    sample_text = "Xin chào, tôi là một trợ lý AI."
    sample_phonemes = "sˈin tʂˈaːw, tˈoj lˈaː2 mˈo6t tʂˈɤ4 lˈi4 ˌaːˈi."

    print("Test text:", sample_text)
    print("Normalized:", text_normalize(sample_text))

    # Dataset-phoneme path (recommended for training).
    ph, tn, w2p = g2p_with_phonemes(sample_text, sample_phonemes)
    print("Phones:", ph)
    print("Tones:", tn)
    print("Word2Ph:", w2p)

    # Character-based fallback path.
    ph2, tn2, _ = g2p(sample_text)
    print("\nChar-based phones:", ph2)
    print("Char-based tones:", tn2)
| |
|