Spaces:
Running
on
Zero
Running
on
Zero
Upload 4 files
Browse files- utils/__init__.py +0 -0
- utils/normalize_text.py +408 -0
- utils/phoneme_dict.json +0 -0
- utils/phonemize_text.py +150 -0
utils/__init__.py
ADDED
|
File without changes
|
utils/normalize_text.py
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
class VietnameseTTSNormalizer:
    """
    A text normalizer for Vietnamese Text-to-Speech systems.

    Converts numbers, dates, units, and special characters into readable
    Vietnamese text. ``normalize`` lower-cases its input before any rule
    runs, so every pattern in this class effectively sees lower-case text.
    """

    # Time symbols recognised only as the denominator of a compound unit
    # ("km/h", "km/s"). They are deliberately not part of ``self.units`` so
    # that a bare "14h" is still left to the time rules.
    _PER_UNITS = {'h': 'giờ', 's': 'giây'}

    def __init__(self):
        # Unit symbol (lower-case) -> spoken Vietnamese form.
        self.units = {
            'km': 'ki lô mét', 'dm': 'đê xi mét', 'cm': 'xen ti mét',
            'mm': 'mi li mét', 'nm': 'na nô mét', 'µm': 'mic rô mét',
            'μm': 'mic rô mét', 'm': 'mét',

            'kg': 'ki lô gam', 'g': 'gam', 'mg': 'mi li gam',

            'km²': 'ki lô mét vuông', 'km2': 'ki lô mét vuông',
            'm²': 'mét vuông', 'm2': 'mét vuông',
            'cm²': 'xen ti mét vuông', 'cm2': 'xen ti mét vuông',
            'mm²': 'mi li mét vuông', 'mm2': 'mi li mét vuông',
            'ha': 'héc ta',

            'km³': 'ki lô mét khối', 'km3': 'ki lô mét khối',
            'm³': 'mét khối', 'm3': 'mét khối',
            'cm³': 'xen ti mét khối', 'cm3': 'xen ti mét khối',
            'mm³': 'mi li mét khối', 'mm3': 'mi li mét khối',
            'l': 'lít', 'dl': 'đê xi lít', 'ml': 'mi li lít', 'hl': 'héc tô lít',

            'v': 'vôn', 'kv': 'ki lô vôn', 'mv': 'mi li vôn',
            'a': 'am pe', 'ma': 'mi li am pe', 'ka': 'ki lô am pe',
            'w': 'oát', 'kw': 'ki lô oát', 'mw': 'mê ga oát', 'gw': 'gi ga oát',
            'kwh': 'ki lô oát giờ', 'mwh': 'mê ga oát giờ', 'wh': 'oát giờ',
            'ω': 'ôm', 'ohm': 'ôm', 'kω': 'ki lô ôm', 'mω': 'mê ga ôm',

            'hz': 'héc', 'khz': 'ki lô héc', 'mhz': 'mê ga héc', 'ghz': 'gi ga héc',

            'pa': 'pát cal', 'kpa': 'ki lô pát cal', 'mpa': 'mê ga pát cal',
            'bar': 'ba', 'mbar': 'mi li ba', 'atm': 'át mốt phia', 'psi': 'pi ét xai',

            'j': 'giun', 'kj': 'ki lô giun',
            'cal': 'ca lo', 'kcal': 'ki lô ca lo',
        }

        # Spoken form of the digits 0-9, indexed by digit value.
        self.digits = ['không', 'một', 'hai', 'ba', 'bốn',
                       'năm', 'sáu', 'bảy', 'tám', 'chín']

    def normalize(self, text):
        """Run the full normalization pipeline on ``text`` and return the result.

        The order matters: symbol-specific rules (temperature, currency,
        percentage, units, time, date, phone) run while the digits are still
        digits; only then are the remaining numbers spelled out and leftover
        punctuation and whitespace cleaned up.
        """
        text = text.lower()
        text = self._normalize_temperature(text)
        text = self._normalize_currency(text)
        text = self._normalize_percentage(text)
        text = self._normalize_units(text)
        text = self._normalize_time(text)
        text = self._normalize_date(text)
        text = self._normalize_phone(text)
        text = self._normalize_numbers(text)
        text = self._number_to_words(text)
        text = self._normalize_special_chars(text)
        text = self._normalize_whitespace(text)
        return text

    def _normalize_temperature(self, text):
        """Convert temperature notation (``36,5°C``, ``-15°F``) to words."""
        # The look-behind keeps the hyphen of a range such as "10-15°c" from
        # being read as a minus sign glued onto the preceding number.
        text = re.sub(r'(?<!\w)-(\d+(?:[.,]\d+)?)\s*°\s*c\b', r'âm \1 độ xê', text, flags=re.IGNORECASE)
        text = re.sub(r'(?<!\w)-(\d+(?:[.,]\d+)?)\s*°\s*f\b', r'âm \1 độ ép', text, flags=re.IGNORECASE)
        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*°\s*c\b', r'\1 độ xê', text, flags=re.IGNORECASE)
        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*°\s*f\b', r'\1 độ ép', text, flags=re.IGNORECASE)
        # Any remaining bare degree sign is read as "độ".
        text = re.sub(r'°', ' độ ', text)
        return text

    def _normalize_currency(self, text):
        """Convert currency notation and k/m/b magnitude suffixes to words.

        Because this step runs before unit expansion, a number directly
        followed by a standalone ``m`` is read as "triệu", not as metres.
        """
        def decimal_currency(match):
            whole = match.group(1)
            decimal = match.group(2)
            unit = match.group(3)
            # The fractional part is read digit by digit.
            decimal_words = ' '.join([self.digits[int(d)] for d in decimal])
            unit_map = {'k': 'nghìn', 'm': 'triệu', 'b': 'tỷ'}
            unit_word = unit_map.get(unit.lower(), unit)
            return f"{whole} phẩy {decimal_words} {unit_word}"

        text = re.sub(r'(\d+)[.,](\d+)\s*([kmb])\b', decimal_currency, text, flags=re.IGNORECASE)
        text = re.sub(r'(\d+)\s*k\b', r'\1 nghìn', text, flags=re.IGNORECASE)
        text = re.sub(r'(\d+)\s*m\b', r'\1 triệu', text, flags=re.IGNORECASE)
        text = re.sub(r'(\d+)\s*b\b', r'\1 tỷ', text, flags=re.IGNORECASE)
        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*đ\b', r'\1 đồng', text)
        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*vnd\b', r'\1 đồng', text, flags=re.IGNORECASE)
        text = re.sub(r'\$\s*(\d+(?:[.,]\d+)?)', r'\1 đô la', text)
        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*\$', r'\1 đô la', text)
        return text

    def _normalize_percentage(self, text):
        """Convert ``N%`` to ``N phần trăm``."""
        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*%', r'\1 phần trăm', text)
        return text

    def _unit_name(self, symbol, denominator=False):
        """Return the spoken form of ``symbol``, or ``symbol`` itself if unknown.

        With ``denominator=True`` the time symbols in ``_PER_UNITS`` are also
        recognised, so that "km/h" is read "... trên giờ" rather than "... trên h".
        """
        name = self.units.get(symbol)
        if name is None and denominator:
            name = self._PER_UNITS.get(symbol)
        return symbol if name is None else name

    def _normalize_units(self, text):
        """Convert measurement units to words.

        Compound units (``a/b``) are expanded first, then every known unit
        that directly follows a number, and finally standalone units written
        with a superscript or degree character.
        """
        def expand_compound_with_number(match):
            number = match.group(1)
            numerator = self._unit_name(match.group(2).lower())
            denominator = self._unit_name(match.group(3).lower(), denominator=True)
            return f"{number} {numerator} trên {denominator}"

        def expand_compound_without_number(match):
            numerator = self._unit_name(match.group(1).lower())
            denominator = self._unit_name(match.group(2).lower(), denominator=True)
            return f"{numerator} trên {denominator}"

        text = re.sub(r'(\d+(?:[.,]\d+)?)\s*([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\b',
                      expand_compound_with_number, text)
        text = re.sub(r'\b([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\b',
                      expand_compound_without_number, text)

        # Longest symbols first so that "kwh" wins over "kw" and "w".
        sorted_units = sorted(self.units.items(), key=lambda x: len(x[0]), reverse=True)
        for unit, full_name in sorted_units:
            pattern = r'(\d+(?:[.,]\d+)?)\s*' + re.escape(unit) + r'\b'
            text = re.sub(pattern, rf'\1 {full_name}', text, flags=re.IGNORECASE)

        # Units containing a superscript/degree character are unambiguous even
        # without a preceding number, so expand them wherever they stand alone.
        for unit, full_name in sorted_units:
            if any(c in unit for c in '²³°'):
                pattern = r'\b' + re.escape(unit) + r'\b'
                text = re.sub(pattern, full_name, text, flags=re.IGNORECASE)

        return text

    def _normalize_time(self, text):
        """Convert time notation (``HH:MM:SS``, ``HH:MM``, ``HHhMM``, ``HHh``) to words.

        A match whose hour exceeds 23, or whose minute/second exceeds 59, is
        left untouched.
        """
        upper_bounds = (23, 59, 59)
        labels = ('giờ', 'phút', 'giây')

        def validate_and_convert_time(match):
            # The number of captured groups tells which format matched; the
            # groups are always in hour, minute, second order.
            parts = match.groups()
            if any(int(part) > bound for part, bound in zip(parts, upper_bounds)):
                return match.group(0)  # Return original if invalid
            return ' '.join(f"{part} {label}" for part, label in zip(parts, labels))

        text = re.sub(r'(\d{1,2}):(\d{2}):(\d{2})', validate_and_convert_time, text)
        text = re.sub(r'(\d{1,2}):(\d{2})', validate_and_convert_time, text)
        text = re.sub(r'(\d{1,2})h(\d{2})', validate_and_convert_time, text)
        text = re.sub(r'(\d{1,2})h\b', validate_and_convert_time, text)

        return text

    def _normalize_date(self, text):
        """Convert date notation to ``ngày D tháng M năm Y``.

        Handles ``D/M/YYYY``, ``D/M/YY`` (with ``/`` or ``-``) and ISO
        ``YYYY-M-D``. Matches whose day is outside 1-31 or month outside 1-12
        are left untouched.
        """
        def is_valid_date(day, month, year):
            """Range-check day and month only; the year is not validated."""
            day, month, year = int(day), int(month), int(year)
            return 1 <= day <= 31 and 1 <= month <= 12

        def date_to_text(match):
            day, month, year = match.groups()
            if is_valid_date(day, month, year):
                return f"ngày {day} tháng {month} năm {year}"
            return match.group(0)  # Return original if invalid

        def date_iso_to_text(match):
            year, month, day = match.groups()
            if is_valid_date(day, month, year):
                return f"ngày {day} tháng {month} năm {year}"
            return match.group(0)

        def date_short_year(match):
            day, month, year = match.groups()
            # Two-digit years below 50 are taken as 20xx, the rest as 19xx.
            full_year = f"20{year}" if int(year) < 50 else f"19{year}"
            if is_valid_date(day, month, full_year):
                return f"ngày {day} tháng {month} năm {full_year}"
            return match.group(0)

        # The first two patterns swallow an explicit leading "ngày" so the
        # replacement (which adds its own) does not produce "ngày ngày".
        text = re.sub(r'\bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b', date_to_text, text)
        text = re.sub(r'\bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b', date_short_year, text)
        text = re.sub(r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b', date_iso_to_text, text)
        text = re.sub(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b', date_to_text, text)
        text = re.sub(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b', date_short_year, text)

        return text

    def _normalize_phone(self, text):
        """Convert phone numbers to digit-by-digit reading."""
        def phone_to_text(match):
            phone = match.group(0)
            phone = re.sub(r'[^\d]', '', phone)

            # Rewrite the +84 / 84 country prefix to the domestic leading 0.
            if phone.startswith('84') and len(phone) >= 10:
                phone = '0' + phone[2:]

            if 10 <= len(phone) <= 11:
                words = [self.digits[int(d)] for d in phone]
                return ' '.join(words) + ' '

            # Not a 10-11 digit number: leave the text as it was.
            return match.group(0)

        text = re.sub(r'(\+84|84)[\s\-\.]?\d[\d\s\-\.]{7,}', phone_to_text, text)
        text = re.sub(r'\b0\d[\d\s\-\.]{8,}', phone_to_text, text)
        return text

    def _normalize_numbers(self, text):
        """Strip thousands separators and spell out the fractional part of decimals.

        The integer part is left as digits for ``_number_to_words``.
        """
        text = re.sub(r'(\d+(?:[,.]\d+)?)%', lambda m: f'{m.group(1)} phần trăm', text)

        # 1. Remove "." thousands separators. The look-arounds make sure the
        #    whole digit run is grouped in threes, so "3.14159" or "12345.678"
        #    are not mistaken for grouped integers.
        text = re.sub(r'(?<!\d)\d{1,3}(?:\.\d{3})+(?!\d)',
                      lambda m: m.group(0).replace('.', ''), text)

        # 2. Read the fractional part of a decimal digit by digit.
        def decimal_to_words(match):
            whole = match.group(1)
            decimal = match.group(2)
            decimal_words = ' '.join([self.digits[int(d)] for d in decimal])
            separator = 'phẩy' if ',' in match.group(0) else 'chấm'
            return f"{whole} {separator} {decimal_words}"

        # 2a. Comma as the decimal mark.
        text = re.sub(r'(\d+),(\d+)', decimal_to_words, text)
        # 2b. Dot as the decimal mark: 1-2 or 4+ fractional digits. Exactly
        #     three digits is skipped because it is the thousands-group shape.
        text = re.sub(r'(\d+)\.(\d{1,2}|\d{4,})\b', decimal_to_words, text)

        return text

    def _read_two_digits(self, n):
        """Read an integer in the range 0-99 in Vietnamese."""
        if n < 10:
            return self.digits[n]
        elif n == 10:
            return "mười"
        elif n < 20:
            if n == 15:
                return "mười lăm"
            return f"mười {self.digits[n % 10]}"
        else:
            tens = n // 10
            ones = n % 10
            if ones == 0:
                return f"{self.digits[tens]} mươi"
            elif ones == 1:
                return f"{self.digits[tens]} mươi mốt"
            elif ones == 5:
                return f"{self.digits[tens]} mươi lăm"
            else:
                return f"{self.digits[tens]} mươi {self.digits[ones]}"

    def _read_three_digits(self, n):
        """Read an integer in the range 0-999 in Vietnamese."""
        if n < 100:
            return self._read_two_digits(n)

        hundreds = n // 100
        remainder = n % 100
        result = f"{self.digits[hundreds]} trăm"

        if remainder == 0:
            return result
        elif remainder < 10:
            result += f" lẻ {self.digits[remainder]}"
        else:
            result += f" {self._read_two_digits(remainder)}"

        return result

    def _convert_number_to_words(self, num):
        """Convert an integer of any magnitude to Vietnamese words."""
        if num == 0:
            return "không"

        if num < 0:
            return f"âm {self._convert_number_to_words(-num)}"

        if num >= 1000000000:
            billion = num // 1000000000
            remainder = num % 1000000000
            # Recurse for the billions group: it may itself be >= 1000
            # (10**12 is "một nghìn tỷ"), which _read_three_digits cannot read.
            result = f"{self._convert_number_to_words(billion)} tỷ"
            if remainder > 0:
                result += f" {self._convert_number_to_words(remainder)}"
            return result

        elif num >= 1000000:
            million = num // 1000000
            remainder = num % 1000000
            result = f"{self._read_three_digits(million)} triệu"
            if remainder > 0:
                result += f" {self._convert_number_to_words(remainder)}"
            return result

        elif num >= 1000:
            thousand = num // 1000
            remainder = num % 1000
            result = f"{self._read_three_digits(thousand)} nghìn"
            if remainder > 0:
                if remainder < 10:
                    # Same "lẻ" linking word _read_three_digits uses for x0y.
                    result += f" không trăm lẻ {self.digits[remainder]}"
                elif remainder < 100:
                    result += f" không trăm {self._read_two_digits(remainder)}"
                else:
                    result += f" {self._read_three_digits(remainder)}"
            return result

        else:
            return self._read_three_digits(num)

    def _number_to_words(self, text):
        """Spell out every remaining standalone run of digits."""
        def convert_number(match):
            num = int(match.group(0))
            return self._convert_number_to_words(num)

        text = re.sub(r'\b\d+\b', convert_number, text)
        return text

    def _normalize_special_chars(self, text):
        """Read a few symbols aloud and replace other punctuation with spaces."""
        text = text.replace('&', ' và ')
        text = text.replace('+', ' cộng ')
        text = text.replace('=', ' bằng ')
        text = text.replace('#', ' thăng ')
        text = re.sub(r'[\[\]\(\)\{\}]', ' ', text)
        text = re.sub(r'\s+[-–—]+\s+', ' ', text)
        text = re.sub(r'\.{2,}', ' ', text)
        text = re.sub(r'\s+\.\s+', ' ', text)
        # Anything that is not a word character, whitespace or one of the
        # listed punctuation marks becomes a space.
        text = re.sub(r'[^\w\sàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ.,!?;:@%]', ' ', text)
        return text

    def _normalize_whitespace(self, text):
        """Collapse runs of whitespace to one space and trim both ends."""
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
if __name__ == "__main__":
    # Manual smoke test: normalize a set of sample sentences and print each
    # input next to its normalized form.
    normalizer = VietnameseTTSNormalizer()

    test_texts = (
        "Giá 2.500.000đ (giảm 50%), mua trước 14h30 ngày 15/12/2025",
        "Liên hệ: 0912-345-678 hoặc email@example.com",
        "Tốc độ 120km/h, trọng lượng 75kg",
        "Nhiệt độ 36,5°C, độ ẩm 80%",
        "Số pi = 3,14159",
        "Giá trị tăng 2.5M, đạt 10B",
        "Nhiệt độ -15°C vào mùa đông",
        "Điện áp 220V, công suất 2.5kW, tần số 50Hz",
        "Tôi đi lấy l nước về nhà",
        "Cần 5l nước cho công thức này",
        "Vận tốc ánh sáng 299792km/s",
        "Mật độ dân số 450 người/km2",
        "Công suất 100 W/m2",
        "Hôm nay 2025-01-15",
        "Gọi +84 912 345 678",
        "Nhiệt độ 25°C lúc 14:30:45",
        "Ngày 15/12/25",
        "Giá 3.140.159",
    )

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("VIETNAMESE TTS NORMALIZATION TEST")
    print(heavy_rule)

    for sample in test_texts:
        print(f"\n📝 Input: {sample}")
        print(f"🎵 Output: {normalizer.normalize(sample)}")
        print(light_rule)
|
utils/phoneme_dict.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
utils/phonemize_text.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import platform
|
| 4 |
+
import glob
|
| 5 |
+
from phonemizer import phonemize
|
| 6 |
+
from phonemizer.backend.espeak.espeak import EspeakWrapper
|
| 7 |
+
from utils.normalize_text import VietnameseTTSNormalizer
|
| 8 |
+
|
| 9 |
+
# Configuration
|
| 10 |
+
# Path of the phoneme dictionary: the PHONEME_DICT_PATH environment variable
# when it is set, otherwise phoneme_dict.json next to this module.
PHONEME_DICT_PATH = os.environ.get(
    'PHONEME_DICT_PATH',
    os.path.join(os.path.dirname(__file__), "phoneme_dict.json"),
)
|
| 14 |
+
|
| 15 |
+
def load_phoneme_dict(path=PHONEME_DICT_PATH):
    """Load the phoneme dictionary from a JSON file.

    Args:
        path: Location of the JSON file; defaults to ``PHONEME_DICT_PATH``.

    Returns:
        The object decoded from the file by ``json.load``.

    Raises:
        FileNotFoundError: If ``path`` does not exist. The message names the
            path and the ``PHONEME_DICT_PATH`` override; the original error is
            attached as ``__cause__``.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback shows this as a deliberate
        # re-raise rather than an error raised while handling another one.
        raise FileNotFoundError(
            f"Phoneme dictionary not found at {path}. "
            "Please create it or set PHONEME_DICT_PATH environment variable."
        ) from err
|
| 25 |
+
|
| 26 |
+
def setup_espeak_library():
    """Point phonemizer at the eSpeak library for the current operating system.

    Raises:
        OSError: If ``platform.system()`` is not Windows, Linux or Darwin.
            The platform-specific helpers raise their own errors when no
            library can be found.
    """
    installers = {
        "Windows": _setup_windows_espeak,
        "Linux": _setup_linux_espeak,
        "Darwin": _setup_macos_espeak,
    }
    system = platform.system()
    installer = installers.get(system)
    if installer is None:
        raise OSError(
            f"Unsupported OS: {system}. "
            "Only Windows, Linux, and macOS are supported."
        )
    installer()
|
| 41 |
+
|
| 42 |
+
def _setup_windows_espeak():
    """Set up eSpeak for Windows.

    Uses the library named by the ``PHONEMIZER_ESPEAK_LIBRARY`` environment
    variable when that file exists (the same override the macOS setup
    honours), otherwise the default eSpeak NG install location.

    Raises:
        FileNotFoundError: If neither location exists.
    """
    default_path = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
    candidates = (os.environ.get('PHONEMIZER_ESPEAK_LIBRARY'), default_path)

    for path in candidates:
        # The override may be unset (None) or empty; skip it in that case.
        if path and os.path.exists(path):
            EspeakWrapper.set_library(path)
            return

    raise FileNotFoundError(
        f"eSpeak library not found at {default_path}. "
        "Please install eSpeak NG from: https://github.com/espeak-ng/espeak-ng/releases\n"
        "Or set the PHONEMIZER_ESPEAK_LIBRARY environment variable to the full "
        "path of libespeak-ng.dll"
    )
|
| 52 |
+
|
| 53 |
+
def _setup_linux_espeak():
    """Set up eSpeak for Linux.

    Uses the library named by the ``PHONEMIZER_ESPEAK_LIBRARY`` environment
    variable when that file exists (the same override the macOS setup
    honours); otherwise globs the usual install directories and picks the
    shortest matching file name of the first pattern that matches.

    Raises:
        RuntimeError: If no library file is found.
    """
    override = os.environ.get('PHONEMIZER_ESPEAK_LIBRARY')
    if override and os.path.exists(override):
        EspeakWrapper.set_library(override)
        return

    search_patterns = [
        "/usr/lib/x86_64-linux-gnu/libespeak-ng.so*",
        "/usr/lib/x86_64-linux-gnu/libespeak.so*",
        "/usr/lib/libespeak-ng.so*",
        "/usr/lib64/libespeak-ng.so*",
        "/usr/local/lib/libespeak-ng.so*",
        # Multi-arch directories of other architectures; listed last so the
        # earlier patterns keep their priority.
        "/usr/lib/*-linux-gnu*/libespeak-ng.so*",
        "/usr/lib/*-linux-gnu*/libespeak.so*",
    ]

    for pattern in search_patterns:
        matches = glob.glob(pattern)
        if matches:
            # Shortest name first: prefers "libespeak-ng.so" over versioned
            # names such as "libespeak-ng.so.1.1.51".
            EspeakWrapper.set_library(min(matches, key=len))
            return

    raise RuntimeError(
        "eSpeak NG library not found. Install with:\n"
        "  Ubuntu/Debian: sudo apt-get install espeak-ng\n"
        "  Fedora: sudo dnf install espeak-ng\n"
        "  Arch: sudo pacman -S espeak-ng\n"
        "Or set: export PHONEMIZER_ESPEAK_LIBRARY=/path/to/libespeak-ng.so\n"
        "See: https://github.com/pnnbao97/VieNeu-TTS/issues/5"
    )
|
| 76 |
+
|
| 77 |
+
def _setup_macos_espeak():
    """Set up eSpeak for macOS.

    Tries, in order, the file named by ``PHONEMIZER_ESPEAK_LIBRARY`` and the
    Homebrew (Apple Silicon, Intel) and MacPorts library locations, and
    registers the first one that exists.

    Raises:
        FileNotFoundError: If none of the candidate files exists.
    """
    candidates = (
        os.environ.get('PHONEMIZER_ESPEAK_LIBRARY'),
        "/opt/homebrew/lib/libespeak-ng.dylib",  # Apple Silicon
        "/usr/local/lib/libespeak-ng.dylib",     # Intel
        "/opt/local/lib/libespeak-ng.dylib",     # MacPorts
    )

    # An unset or empty override is skipped by the truthiness test.
    library = next(
        (path for path in candidates if path and os.path.exists(path)),
        None,
    )
    if library is None:
        raise FileNotFoundError(
            "eSpeak library not found. Install with:\n"
            "  brew install espeak-ng\n"
            "Or set: export PHONEMIZER_ESPEAK_LIBRARY=/path/to/libespeak-ng.dylib"
        )

    EspeakWrapper.set_library(library)
|
| 98 |
+
|
| 99 |
+
# Initialize
|
| 100 |
+
# Import-time setup: locate eSpeak, load the phoneme dictionary and build the
# shared normalizer. Any failure is reported on stdout and then re-raised, so
# importing this module fails when the setup cannot complete.
try:
    setup_espeak_library()
    phoneme_dict = load_phoneme_dict()
    normalizer = VietnameseTTSNormalizer()
except Exception as init_error:
    print(f"Initialization error: {init_error}")
    raise
|
| 107 |
+
|
| 108 |
+
def phonemize_text(text: str) -> str:
    """Normalize ``text`` and convert it to phonemes in one phonemizer call.

    The text goes through the module-level ``normalizer`` and is then passed
    to ``phonemize`` with the Vietnamese eSpeak voice, keeping punctuation
    and stress marks and removing language-switch flags.
    """
    normalized = normalizer.normalize(text)
    phonemes = phonemize(
        normalized,
        language="vi",
        backend="espeak",
        preserve_punctuation=True,
        with_stress=True,
        language_switch="remove-flags",
    )
    return phonemes
|
| 119 |
+
|
| 120 |
+
def phonemize_with_dict(text: str, phoneme_dict=phoneme_dict) -> str:
    """Phonemize text word by word, preferring dictionary lookups.

    The text is normalized and split on whitespace. A word found in
    ``phoneme_dict`` uses the stored value; any other word is sent to
    ``phonemize`` and the result is written back into ``phoneme_dict``, so
    the mapping passed in (by default the module-level dictionary) grows as
    new words are seen. If phonemizing a word fails, a warning is printed and
    the word itself is used.

    Args:
        text: Raw input text.
        phoneme_dict: Mapping from word to phoneme string; mutated in place.

    Returns:
        The per-word results joined with single spaces.
    """
    pieces = []

    for token in normalizer.normalize(text).split():
        if token in phoneme_dict:
            pieces.append(phoneme_dict[token])
            continue

        try:
            phones = phonemize(
                token,
                language='vi',
                backend='espeak',
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )

            # Words starting with "r" get their first output symbol replaced by ɹ.
            if token.lower().startswith('r'):
                phones = 'ɹ' + phones[1:]

            # Cache the result for later calls.
            phoneme_dict[token] = phones
        except Exception as e:
            # Best effort: keep going with the raw word.
            print(f"Warning: Could not phonemize '{token}': {e}")
            phones = token

        pieces.append(phones)

    return ' '.join(pieces)
|