Spaces:
Paused
Paused
Update indextts/utils/front.py
Browse files- indextts/utils/front.py +53 -0
indextts/utils/front.py
CHANGED
|
@@ -7,6 +7,15 @@ import warnings
|
|
| 7 |
from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
|
| 8 |
from sentencepiece import SentencePieceProcessor
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
class TextNormalizer:
|
| 12 |
def __init__(self):
|
|
@@ -59,6 +68,34 @@ class TextNormalizer:
|
|
| 59 |
pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
|
| 60 |
return re.match(pattern, email) is not None
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
|
| 63 |
"""
|
| 64 |
匹配拼音声调格式:pinyin+数字,声调1-5,5表示轻声
|
|
@@ -76,6 +113,10 @@ class TextNormalizer:
|
|
| 76 |
|
| 77 |
|
| 78 |
def use_chinese(self, s):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
|
| 80 |
has_alpha = bool(re.search(r"[a-zA-Z]", s))
|
| 81 |
is_email = self.match_email(s)
|
|
@@ -114,6 +155,18 @@ class TextNormalizer:
|
|
| 114 |
if not self.zh_normalizer or not self.en_normalizer:
|
| 115 |
print("Error, text normalizer is not initialized !!!")
|
| 116 |
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
if self.use_chinese(text):
|
| 118 |
text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
|
| 119 |
replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())
|
|
|
|
| 7 |
from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
|
| 8 |
from sentencepiece import SentencePieceProcessor
|
| 9 |
|
| 10 |
+
# Optional Hindi/Devanagari support.
# The transliteration helpers come from the third-party
# `indic-transliteration` package; HINDI_SUPPORT records whether the import
# succeeded so callers can fall back gracefully when it is absent.
try:
    from indic_transliteration import sanscript
    from indic_transliteration.sanscript import transliterate
except ImportError:
    # Package missing: disable the feature and tell the user once at import.
    HINDI_SUPPORT = False
    print("Warning: indic-transliteration not installed. Hindi support disabled.")
else:
    HINDI_SUPPORT = True
|
| 18 |
+
|
| 19 |
|
| 20 |
class TextNormalizer:
|
| 21 |
def __init__(self):
|
|
|
|
| 68 |
pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
|
| 69 |
return re.match(pattern, email) is not None
|
| 70 |
|
| 71 |
+
def is_devanagari(self, text):
    """Report whether ``text`` has at least one Devanagari code point.

    Only the basic Devanagari Unicode block (U+0900 through U+097F) is
    considered; a single character in that range is enough.

    Args:
        text: String to inspect.

    Returns:
        True if any character of ``text`` lies in U+0900-U+097F, else False.
    """
    devanagari_hit = re.search(r"[\u0900-\u097F]", text)
    return devanagari_hit is not None
|
| 75 |
+
|
| 76 |
+
def transliterate_devanagari_to_phoneme(self, text):
    """Romanize Devanagari text with ITRANS and tidy it for TTS processing.

    Args:
        text: Input string, expected to contain Devanagari characters.

    Returns:
        The ITRANS romanization of ``text`` with a few marker characters
        rewritten and the ``self.char_rep_map`` substitutions applied.
        When ``HINDI_SUPPORT`` is false (the optional
        ``indic-transliteration`` package could not be imported), a warning
        is printed and ``text`` is returned unchanged.
    """
    if not HINDI_SUPPORT:
        print("Warning: Hindi transliteration not available. Install 'indic-transliteration' package.")
        return text

    # Transliterate Devanagari to ITRANS (phoneme-friendly romanization).
    romanized = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)

    # Clean up ITRANS output for better TTS compatibility.
    # Only "M" and "H" are folded to lowercase; every other capital letter
    # in the ITRANS output keeps its case. The replacements are independent
    # single-character rewrites, so one translate() pass is equivalent to
    # chaining str.replace calls.
    marker_table = str.maketrans({
        "~": "n",   # Nasalization markers
        "^": None,  # Remove accent markers
        "M": "m",   # Normalize anusvara
        "H": "h",   # Normalize visarga
    })
    romanized = romanized.translate(marker_table)

    # Apply the character replacement map to the romanized text.
    # An empty map would produce the empty regex, which matches at every
    # position and then fails the dictionary lookup with KeyError(''), so
    # skip the substitution entirely in that case.
    # NOTE(review): alternatives are tried in dict order; if one key is a
    # prefix of a later, longer key the longer one can never match --
    # confirm against the map's actual contents.
    replacements = self.char_rep_map
    if replacements:
        pattern = re.compile("|".join(re.escape(key) for key in replacements))
        romanized = pattern.sub(lambda match: replacements[match.group()], romanized)

    return romanized
|
| 98 |
+
|
| 99 |
PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
|
| 100 |
"""
|
| 101 |
匹配拼音声调格式:pinyin+数字,声调1-5,5表示轻声
|
|
|
|
| 113 |
|
| 114 |
|
| 115 |
def use_chinese(self, s):
|
| 116 |
+
# First check if it's Devanagari script (Hindi, etc.)
|
| 117 |
+
if self.is_devanagari(s):
|
| 118 |
+
return False # Don't use Chinese normalizer for Hindi
|
| 119 |
+
|
| 120 |
has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
|
| 121 |
has_alpha = bool(re.search(r"[a-zA-Z]", s))
|
| 122 |
is_email = self.match_email(s)
|
|
|
|
| 155 |
if not self.zh_normalizer or not self.en_normalizer:
|
| 156 |
print("Error, text normalizer is not initialized !!!")
|
| 157 |
return ""
|
| 158 |
+
|
| 159 |
+
# Handle Devanagari/Hindi text first
|
| 160 |
+
if self.is_devanagari(text):
|
| 161 |
+
print(">> Detected Devanagari script, applying Hindi transliteration...")
|
| 162 |
+
result = self.transliterate_devanagari_to_phoneme(text)
|
| 163 |
+
# After transliteration, treat as English for further normalization
|
| 164 |
+
try:
|
| 165 |
+
result = self.en_normalizer.normalize(result)
|
| 166 |
+
except Exception:
|
| 167 |
+
print(traceback.format_exc())
|
| 168 |
+
return result
|
| 169 |
+
|
| 170 |
if self.use_chinese(text):
|
| 171 |
text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
|
| 172 |
replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())
|