Spaces:

valtecAI-team
/

valtec-vietnamese-tts

Running

App Files Files Community

valtec-vietnamese-tts / src /vietnamese /text_processor.py

valtecAI-team

Upload folder using huggingface_hub

f8b4a6c verified 3 months ago

raw

history blame contribute delete

13.3 kB

	#!/usr/bin/env python3
	"""
	Vietnamese Text Processor for TTS
	Handles normalization of numbers, dates, times, currencies, etc.
	"""

	import re
	import unicodedata


	# Vietnamese number vocabulary, keyed by digit strings.

	# Single digits 0-9.
	DIGITS = {
	    '0': 'không',
	    '1': 'một',
	    '2': 'hai',
	    '3': 'ba',
	    '4': 'bốn',
	    '5': 'năm',
	    '6': 'sáu',
	    '7': 'bảy',
	    '8': 'tám',
	    '9': 'chín',
	}

	# 10-19, keyed by the full two-digit string (note 15 ends in "lăm").
	TEENS = {
	    '10': 'mười',
	    '11': 'mười một',
	    '12': 'mười hai',
	    '13': 'mười ba',
	    '14': 'mười bốn',
	    '15': 'mười lăm',
	    '16': 'mười sáu',
	    '17': 'mười bảy',
	    '18': 'mười tám',
	    '19': 'mười chín',
	}

	# Round tens 20-90, keyed by the tens digit only.
	TENS = {
	    '2': 'hai mươi',
	    '3': 'ba mươi',
	    '4': 'bốn mươi',
	    '5': 'năm mươi',
	    '6': 'sáu mươi',
	    '7': 'bảy mươi',
	    '8': 'tám mươi',
	    '9': 'chín mươi',
	}


	def number_to_words(num_str):
	    """
	    Convert a number string to Vietnamese words.

	    Values below 10**12 are spelled out with the units "trăm", "nghìn",
	    "triệu" and "tỷ"; anything larger is read digit by digit.

	    When the part that follows a "nghìn"/"triệu"/"tỷ" group is below 100,
	    the empty hundreds place is read as "không trăm" and a lone unit digit
	    is introduced by "lẻ" (1005 -> "một nghìn không trăm lẻ năm",
	    2000005 -> "hai triệu không trăm lẻ năm"), in the same way 105 is read
	    "một trăm lẻ năm".

	    Args:
	        num_str: Digits as a string; leading zeros and a leading "-" are
	            accepted.

	    Returns:
	        The Vietnamese reading. A string that int() cannot parse is
	        returned unchanged apart from stripped leading zeros.
	    """
	    # Remove leading zeros but keep at least one digit
	    num_str = num_str.lstrip('0') or '0'

	    # Negative numbers: read the sign, then the magnitude
	    if num_str.startswith('-'):
	        return 'âm ' + number_to_words(num_str[1:])

	    try:
	        num = int(num_str)
	    except ValueError:
	        # Not a plain integer (e.g. "12.5"): hand it back untouched
	        return num_str

	    if num < 10:
	        return DIGITS[str(num)]

	    if num < 20:
	        return TEENS[str(num)]

	    if num < 100:
	        tens, units = divmod(num, 10)
	        if units == 0:
	            return TENS[str(tens)]
	        # After a tens word, 1/4/5 are read "mốt"/"tư"/"lăm"
	        special_units = {1: 'mốt', 4: 'tư', 5: 'lăm'}
	        return TENS[str(tens)] + ' ' + special_units.get(units, DIGITS[str(units)])

	    if num < 1000:
	        hundreds, remainder = divmod(num, 100)
	        result = DIGITS[str(hundreds)] + ' trăm'
	        if remainder == 0:
	            return result
	        if remainder < 10:
	            return result + ' lẻ ' + DIGITS[str(remainder)]
	        return result + ' ' + number_to_words(str(remainder))

	    def read_tail(value):
	        # Part that follows a nghìn/triệu/tỷ group; the hundreds place is
	        # spoken even when empty so the value is not misheard.
	        if value < 10:
	            return 'không trăm lẻ ' + DIGITS[str(value)]
	        if value < 100:
	            return 'không trăm ' + number_to_words(str(value))
	        return number_to_words(str(value))

	    # (exclusive upper bound, unit value, unit word)
	    scales = (
	        (10 ** 6, 10 ** 3, 'nghìn'),
	        (10 ** 9, 10 ** 6, 'triệu'),
	        (10 ** 12, 10 ** 9, 'tỷ'),
	    )
	    for limit, unit_value, unit_word in scales:
	        if num < limit:
	            head, tail = divmod(num, unit_value)
	            result = number_to_words(str(head)) + ' ' + unit_word
	            if tail == 0:
	                return result
	            return result + ' ' + read_tail(tail)

	    # For very large numbers, read digit by digit
	    return ' '.join(DIGITS.get(d, d) for d in num_str)


	def convert_decimal(text):
	    """Convert short decimals to words: "3.14" -> "ba phẩy mười bốn".

	    Only a dot followed by one or two digits (and no further digit) is
	    treated as a decimal point, so dot-grouped thousands such as "100.000"
	    are left alone. A fractional part that starts with zero is read digit
	    by digit ("3.05" -> "ba phẩy không năm") so it cannot be confused with
	    "3.5".

	    Args:
	        text: Text that may contain decimal numbers.

	    Returns:
	        The text with every matched decimal replaced by words.
	    """
	    def replace_decimal(match):
	        integer_part, decimal_part = match.groups()
	        integer_words = number_to_words(integer_part)

	        if decimal_part.startswith('0'):
	            # Leading zeros carry value after the decimal point
	            decimal_words = ' '.join(number_to_words(d) for d in decimal_part)
	        else:
	            # Read the fractional part as a number
	            decimal_words = number_to_words(decimal_part)

	        return f"{integer_words} phẩy {decimal_words}"

	    # "(?!\d)" = followed by whitespace, end of text or any non-digit
	    return re.sub(r'(\d+)\.(\d{1,2})(?!\d)', replace_decimal, text)


	def convert_percentage(text):
	    """Convert percentages to words: "50%" -> "năm mươi phần trăm".

	    A value written with a "." or "," decimal separator is read with
	    "phẩy" ("12,5%" -> "mười hai phẩy năm phần trăm"); a fractional part
	    that starts with zero is read digit by digit.

	    Args:
	        text: Text that may contain "<number>%" expressions.

	    Returns:
	        The text with every matched percentage replaced by words.
	    """
	    def read_fraction(digits):
	        # Leading zeros carry value after the separator
	        if digits.startswith('0'):
	            return ' '.join(number_to_words(d) for d in digits)
	        return number_to_words(digits)

	    def replace_percent(match):
	        whole, fraction = match.groups()
	        words = number_to_words(whole)
	        if fraction is not None:
	            words += ' phẩy ' + read_fraction(fraction)
	        return words + ' phần trăm'

	    # Integer and fractional digits are captured separately so that each
	    # can be handed to number_to_words as a plain digit string.
	    return re.sub(r'(\d+)(?:[.,](\d+))?\s*%', replace_percent, text)


	def convert_currency(text):
	    """Convert Vietnamese dong and US dollar amounts to words.

	    Recognised forms: "<amount> đồng/VND/vnđ", "<amount>đ", "$<amount>",
	    "<amount> USD" and "<amount>$" (whitespace between amount and unit is
	    optional). "." and "," inside an amount are treated as grouping
	    separators and dropped: "100.000đ" -> "một trăm nghìn đồng".

	    Args:
	        text: Text that may contain currency amounts.

	    Returns:
	        The text with every matched amount replaced by words.
	    """
	    # Digits, optionally followed by "."/","-separated groups
	    amount = r'(\d+(?:[.,]\d+)*)'

	    def read_amount(match):
	        digits = match.group(1).replace('.', '').replace(',', '')
	        return number_to_words(digits)

	    def replace_vnd(match):
	        return read_amount(match) + ' đồng'

	    def replace_usd(match):
	        # NOTE(review): separators are always dropped, so "$3.50" is read
	        # as 350 - confirm whether dollar amounts with cents can occur.
	        return read_amount(match) + ' đô la'

	    # Vietnamese Dong, spelled-out units
	    text = re.sub(amount + r'\s*(?:đồng|VND|vnđ)\b', replace_vnd, text,
	                  flags=re.IGNORECASE)
	    # Bare "đ" must touch the number and must not start a word like "độ"
	    text = re.sub(amount + r'đ(?![a-zà-ỹ])', replace_vnd, text,
	                  flags=re.IGNORECASE)

	    # USD, symbol before or unit after the amount
	    text = re.sub(r'\$\s*' + amount, replace_usd, text)
	    text = re.sub(amount + r'\s*(?:USD\b|\$)', replace_usd, text,
	                  flags=re.IGNORECASE)

	    return text


	def convert_time(text):
	    """Convert time expressions to words.

	    Handles "HH:MM", "HH:MM:SS", "X giờ Y phút" and "X giờ", e.g.
	    "2 giờ 20 phút" -> "hai giờ hai mươi phút". Whitespace around "giờ" and
	    before "phút" is optional, and a clock time is only recognised when it
	    is not embedded in a longer run of digits.

	    Args:
	        text: Text that may contain time expressions.

	    Returns:
	        The text with every matched time replaced by words.
	    """
	    def replace_clock(match):
	        hour, minute, second = match.groups()
	        result = number_to_words(hour) + ' giờ ' + number_to_words(minute) + ' phút'
	        if second:
	            result += ' ' + number_to_words(second) + ' giây'
	        return result

	    # HH:MM:SS or HH:MM; the look-arounds stop "123:456" from being cut up
	    text = re.sub(r'(?<!\d)(\d{1,2}):(\d{2})(?::(\d{2}))?(?!\d)', replace_clock, text)

	    # X giờ Y phút
	    def replace_time_vn(match):
	        hour, minute = match.groups()
	        return number_to_words(hour) + ' giờ ' + number_to_words(minute) + ' phút'

	    text = re.sub(r'(\d+)\s*giờ\s*(\d+)\s*phút', replace_time_vn, text)

	    # X giờ (without minute); skipped when a number follows "giờ"
	    def replace_hour(match):
	        return number_to_words(match.group(1)) + ' giờ'

	    text = re.sub(r'(\d+)\s*giờ(?!\s*\d)', replace_hour, text)

	    return text


	def convert_date(text):
	    """Convert date expressions to words.

	    Applied in this order:
	      1. "DD/MM/YYYY" or "DD-MM-YYYY" -> "ngày ... tháng ... năm ..."
	      2. "X tháng Y"                  -> "ngày X tháng Y"
	      3. "tháng X"                    -> "tháng" + number in words
	      4. "ngày X"                     -> "ngày" + number in words

	    For 1 and 2, a "ngày" already written in front of the date is reused
	    (with its capitalisation) instead of being duplicated, so
	    "Sinh ngày 15/08/1990" does not become "Sinh ngày ngày ...".
	    "X tháng Y ngày" is a duration, not a date, and is left to rules 3/4.

	    Args:
	        text: Text that may contain dates.

	    Returns:
	        The text with every matched date replaced by words.
	    """
	    def day_label(match):
	        # Group 1 is the optional "ngày" found in the text
	        return match.group(1) or 'ngày'

	    # DD/MM/YYYY or DD-MM-YYYY
	    def replace_date_full(match):
	        day, month, year = match.group(2, 3, 4)
	        return (f"{day_label(match)} {number_to_words(day)} "
	                f"tháng {number_to_words(month)} năm {number_to_words(year)}")

	    text = re.sub(
	        r'(?:\b(ngày)\s+)?(?<!\d)(\d{1,2})[/-](\d{1,2})[/-](\d{4})(?!\d)',
	        replace_date_full, text, flags=re.IGNORECASE)

	    # X tháng Y
	    def replace_month_day(match):
	        day, month = match.group(2, 3)
	        return f"{day_label(match)} {number_to_words(day)} tháng {number_to_words(month)}"

	    # "(?!\d|...)" keeps the month whole and rejects "X tháng Y ngày"
	    text = re.sub(
	        r'(?:\b(ngày)\s+)?(\d+)\s*tháng\s*(\d+)(?!\d|\s*ngày)',
	        replace_month_day, text, flags=re.IGNORECASE)

	    # tháng X (month only)
	    text = re.sub(r'tháng\s*(\d+)',
	                  lambda m: 'tháng ' + number_to_words(m.group(1)), text)

	    # ngày X
	    text = re.sub(r'ngày\s*(\d+)',
	                  lambda m: 'ngày ' + number_to_words(m.group(1)), text)

	    return text


	def convert_year_range(text):
	    """Convert year ranges to words.

	    "1873-1907" -> "một nghìn tám trăm bảy mươi ba đến một nghìn chín trăm
	    lẻ bảy". Hyphen, en dash and em dash are accepted, with or without
	    surrounding whitespace. Two four-digit groups are only read as a range
	    when the first is smaller than the second, so that other dash-joined
	    codes (e.g. "1900-1234") are left for later steps.

	    Args:
	        text: Text that may contain "YYYY-YYYY" ranges.

	    Returns:
	        The text with every matched range replaced by words.
	    """
	    def replace_year_range(match):
	        year1, year2 = match.groups()
	        if int(year1) >= int(year2):
	            # Not an ascending range: leave the text untouched
	            return match.group(0)
	        return number_to_words(year1) + ' đến ' + number_to_words(year2)

	    # Look-arounds keep the match from starting or ending inside a longer number
	    return re.sub(r'(?<!\d)(\d{4})\s*[-–—]\s*(\d{4})(?!\d)', replace_year_range, text)


	def convert_ordinal(text):
	    """Convert numbers that follow a counting keyword: "thứ 2" -> "thứ hai".

	    Keywords: thứ, lần, bước, phần, chương, tập, số (case-insensitive).
	    Only "thứ" takes the special ordinal forms "nhất" (1) and "tư" (4);
	    after the other keywords the number is read as a plain cardinal
	    ("Chương 1" -> "Chương một").

	    Left untouched for later steps:
	      * numbers continued by "."/"," and more digits ("số 100.000")
	      * digit strings shaped like a phone number ("số 0912345678")

	    Args:
	        text: Text that may contain "<keyword> <number>" expressions.

	    Returns:
	        The text with every matched number replaced by words.
	    """
	    # Special readings used after "thứ" only
	    ordinal_map = {'1': 'nhất', '4': 'tư'}

	    def replace_ordinal(match):
	        prefix, num = match.groups()
	        if re.fullmatch(r'0\d{9,10}', num):
	            # Same shape as the phone-number pattern: not an ordinal
	            return match.group(0)
	        if prefix.lower() == 'thứ':
	            special = ordinal_map.get(num.lstrip('0'))
	            if special is not None:
	                return prefix + ' ' + special
	        return prefix + ' ' + number_to_words(num)

	    # thứ X, lần X, bước X, phần X, chương X, tập X, số X
	    pattern = r'(?<!\w)(thứ|lần|bước|phần|chương|tập|số)\s*(\d+)(?![.,]?\d)'
	    return re.sub(pattern, replace_ordinal, text, flags=re.IGNORECASE)


	def convert_standalone_numbers(text):
	    """Convert the remaining standalone numbers to words.

	    A number is standalone when it sits between word boundaries (not glued
	    to letters). Dot-grouped thousands such as "1.500" or "100.000" are
	    read as one number (dots removed) instead of as separate groups.

	    Args:
	        text: Text in which earlier, more specific conversions have
	            already been applied.

	    Returns:
	        The text with every remaining standalone number replaced by words.
	    """
	    def replace_num(match):
	        # Drop grouping dots; a plain digit run has none
	        return number_to_words(match.group(0).replace('.', ''))

	    # First alternative: 1-3 digits followed by one or more ".ddd" groups
	    return re.sub(r'\b\d{1,3}(?:\.\d{3})+\b|\b\d+\b', replace_num, text)


	def convert_phone_number(text):
	    """Read phone numbers digit by digit.

	    Matches "0" or "+84" followed by 9-10 digits, and only when the number
	    is not part of a longer run of digits. The "+" is not spoken.

	    Args:
	        text: Text that may contain phone numbers.

	    Returns:
	        The text with every matched phone number spelled out digit by digit.
	    """
	    def replace_phone(match):
	        digits = re.findall(r'\d', match.group(0))
	        return ' '.join(DIGITS.get(d, d) for d in digits)

	    # One pass for both prefixes, so a "+84..." number is never split by
	    # the "0..." form matching inside it.
	    return re.sub(r'(?<!\d)(?:\+84|0)\d{9,10}(?!\d)', replace_phone, text)


	def normalize_unicode(text):
	    """Return *text* in Unicode NFC normalization form."""
	    composed = unicodedata.normalize('NFC', text)
	    return composed


	def clean_whitespace(text):
	    """Collapse every run of whitespace to one space and trim both ends."""
	    # split() with no argument drops leading/trailing whitespace and
	    # breaks on whitespace runs, so joining with ' ' does both jobs.
	    words = text.split()
	    return ' '.join(words)


	def remove_special_chars(text):
	    """Remove or replace content that cannot be spoken.

	    URLs ("http://", "https://", "www.") and e-mail addresses are deleted,
	    then a few symbols are replaced by words or dropped:
	    "&" -> " và ", "@" -> " a còng ", "#" -> " thăng ", "_" -> " ",
	    and "*", "~", "`", "^" are removed.

	    Args:
	        text: Raw input text.

	    Returns:
	        The text with URLs, e-mail addresses and the listed symbols handled.
	        Whitespace is not tidied here.
	    """
	    # URLs and e-mail addresses must go first: they contain "@", "_", "#"
	    # and "&", and rewriting those symbols beforehand would leave pieces
	    # of the address behind.
	    text = re.sub(r'https?://\S+', '', text)
	    text = re.sub(r'www\.\S+', '', text)
	    text = re.sub(r'\S+@\S+\.\S+', '', text)

	    # Single pass over the remaining symbols
	    symbol_table = str.maketrans({
	        '&': ' và ',
	        '@': ' a còng ',
	        '#': ' thăng ',
	        '*': '',
	        '_': ' ',
	        '~': '',
	        '`': '',
	        '^': '',
	    })
	    return text.translate(symbol_table)


	def normalize_punctuation(text):
	    """Normalize punctuation marks to plain ASCII forms.

	    Typographic double quotes become '"', typographic single quotes become
	    "'", en dash / em dash / minus sign become "-", and any run of two or
	    more of "!", "?", "." is reduced to the last character of the run.

	    Args:
	        text: Text to normalize.

	    Returns:
	        The text with normalized punctuation.
	    """
	    # Quote characters are written as escapes so the classes cannot be
	    # silently flattened to ASCII quotes by an editor or a copy/paste.
	    # Double: left/right curly, low-9, high-reversed-9
	    text = re.sub('[\u201c\u201d\u201e\u201f]', '"', text)
	    # Single: left/right curly, low-9, high-reversed-9
	    text = re.sub('[\u2018\u2019\u201a\u201b]', "'", text)

	    # Dashes: en dash, em dash, minus sign
	    text = re.sub('[\u2013\u2014\u2212]', '-', text)

	    # Ellipsis: long dot runs and the single-character ellipsis
	    text = re.sub(r'\.{3,}', '...', text)
	    text = text.replace('\u2026', '...')

	    # Collapse repeated punctuation.
	    # NOTE(review): this also turns the "..." produced above into ".",
	    # so no ellipsis survives; kept as-is to preserve current output.
	    text = re.sub(r'([!?.]){2,}', r'\1', text)

	    return text


	def process_vietnamese_text(text):
	    """
	    Normalize raw Vietnamese text for TTS.

	    Runs every normalization step of this module, one after another, each
	    step receiving the output of the previous one.

	    Args:
	        text: Raw Vietnamese text

	    Returns:
	        Normalized text suitable for TTS
	    """
	    # Order matters: specific patterns run before the generic number pass.
	    pipeline = (
	        normalize_unicode,           # 1. Unicode NFC
	        remove_special_chars,        # 2. symbols, URLs, e-mail addresses
	        normalize_punctuation,       # 3. quotes, dashes, repeated marks
	        convert_year_range,          # 4. before other number conversions
	        convert_date,                # 5. dates
	        convert_time,                # 6. times
	        convert_ordinal,             # 7. ordinals
	        convert_currency,            # 8. currency
	        convert_percentage,          # 9. percentages
	        convert_phone_number,        # 10. phone numbers
	        convert_decimal,             # 11. after currency, before standalone
	        convert_standalone_numbers,  # 12. whatever numbers are left
	        clean_whitespace,            # 13. tidy spacing
	    )

	    for step in pipeline:
	        text = step(text)

	    return text


	if __name__ == "__main__":
	    # Manual smoke run: print each sample before and after processing.
	    test_cases = [
	        "Lúc khoảng 2 giờ 20 phút sáng ngày thứ Bảy hay 8 tháng 11",
	        "Alfred Jarry 1873-1907 hợp những nhà văn",
	        "ông Derringer 44 ly, dí sát đầu tổng thống",
	        "Giá sản phẩm là 100.000đ",
	        "Tỷ lệ thành công đạt 85%",
	        "Họp lúc 14:30",
	        "Sinh ngày 15/08/1990",
	        "Chương 3: Hành trình mới",
	        "Số điện thoại: 0912345678",
	        "Nhiệt độ 25.5 độ C",
	        "Công ty XYZ có 1500 nhân viên",
	    ]

	    banner = "=" * 60
	    print(banner)
	    print("Vietnamese Text Processor Test")
	    print(banner)

	    for sample in test_cases:
	        result = process_vietnamese_text(sample)
	        print(f"\nOriginal: {sample}")
	        print(f"Processed: {result}")