vasugo05 committed on
Commit
89f1dfc
·
verified ·
1 Parent(s): 237b467

Update indextts/utils/front.py

Browse files
Files changed (1) hide show
  1. indextts/utils/front.py +53 -0
indextts/utils/front.py CHANGED
@@ -7,6 +7,15 @@ import warnings
7
  from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
8
  from sentencepiece import SentencePieceProcessor
9
 
 
 
 
 
 
 
 
 
 
10
 
11
  class TextNormalizer:
12
  def __init__(self):
@@ -59,6 +68,34 @@ class TextNormalizer:
59
  pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
60
  return re.match(pattern, email) is not None
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
63
  """
64
  匹配拼音声调格式:pinyin+数字,声调1-5,5表示轻声
@@ -76,6 +113,10 @@ class TextNormalizer:
76
 
77
 
78
  def use_chinese(self, s):
 
 
 
 
79
  has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
80
  has_alpha = bool(re.search(r"[a-zA-Z]", s))
81
  is_email = self.match_email(s)
@@ -114,6 +155,18 @@ class TextNormalizer:
114
  if not self.zh_normalizer or not self.en_normalizer:
115
  print("Error, text normalizer is not initialized !!!")
116
  return ""
 
 
 
 
 
 
 
 
 
 
 
 
117
  if self.use_chinese(text):
118
  text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
119
  replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())
 
7
  from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
8
  from sentencepiece import SentencePieceProcessor
9
 
10
+ # Hindi/Devanagari support
11
+ try:
12
+ from indic_transliteration import sanscript
13
+ from indic_transliteration.sanscript import transliterate
14
+ HINDI_SUPPORT = True
15
+ except ImportError:
16
+ HINDI_SUPPORT = False
17
+ print("Warning: indic-transliteration not installed. Hindi support disabled.")
18
+
19
 
20
  class TextNormalizer:
21
  def __init__(self):
 
68
  pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
69
  return re.match(pattern, email) is not None
70
 
71
def is_devanagari(self, text):
    """Report whether *text* contains any Devanagari character.

    Devanagari is the script used for Hindi, Sanskrit, Marathi, Nepali
    and others. Only the main Unicode block (U+0900 through U+097F) is
    considered; a single code point in that range is enough.

    Args:
        text: The string to inspect.

    Returns:
        True if at least one character lies in U+0900..U+097F,
        otherwise False (including for the empty string).
    """
    devanagari_block = re.compile(r"[\u0900-\u097F]")
    return devanagari_block.search(text) is not None
75
+
76
def transliterate_devanagari_to_phoneme(self, text):
    """Romanize Devanagari text so the Latin-script TTS pipeline can read it.

    The text is transliterated to ITRANS, a few ITRANS-specific ASCII
    sequences are simplified, and finally the instance's character
    replacement map is applied to the romanized string.

    Args:
        text: Input string, expected to contain Devanagari characters.

    Returns:
        The romanized string. ``text`` is returned unchanged when it is
        empty, or when the optional ``indic-transliteration`` package is
        unavailable (``HINDI_SUPPORT`` is false); in the latter case a
        warning is printed first.

    Note:
        Reads ``self.char_rep_map`` (a mapping of substring -> replacement).
        It is defined outside this block - presumably in ``__init__``;
        confirm against the rest of the class.
    """
    # Nothing to transliterate; also avoids a spurious warning below.
    if not text:
        return text

    if not HINDI_SUPPORT:
        print("Warning: Hindi transliteration not available. Install 'indic-transliteration' package.")
        return text

    # ITRANS keeps phonetic distinctions (vowel length, retroflexes) by
    # using letter case, so the output is deliberately NOT lowercased here.
    romanized = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)

    # Simplify ITRANS-specific sequences. Order matters: the two-character
    # nasal consonants must be rewritten before the bare "~" fallback,
    # otherwise "~N" / "~n" would degrade to "nN" / "nn".
    itrans_cleanup = (
        ("~N", "ng"),  # velar nasal
        ("~n", "ny"),  # palatal nasal
        ("~", "n"),    # any stray tilde left over
        ("^", ""),     # vocalic marker, e.g. "R^i" -> "Ri"
        ("M", "m"),    # anusvara
        ("H", "h"),    # visarga
    )
    for itrans_seq, plain in itrans_cleanup:
        romanized = romanized.replace(itrans_seq, plain)

    # Apply the character replacement map. An empty map would compile to an
    # empty pattern that matches at every position and then fail the lookup
    # with KeyError(''), so it is skipped entirely.
    char_rep_map = self.char_rep_map
    if char_rep_map:
        pattern = re.compile("|".join(re.escape(key) for key in char_rep_map))
        romanized = pattern.sub(lambda m: char_rep_map[m.group()], romanized)

    return romanized
98
+
99
  PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
100
  """
101
  匹配拼音声调格式:pinyin+数字,声调1-5,5表示轻声
 
113
 
114
 
115
  def use_chinese(self, s):
116
+ # First check if it's Devanagari script (Hindi, etc.)
117
+ if self.is_devanagari(s):
118
+ return False # Don't use Chinese normalizer for Hindi
119
+
120
  has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
121
  has_alpha = bool(re.search(r"[a-zA-Z]", s))
122
  is_email = self.match_email(s)
 
155
  if not self.zh_normalizer or not self.en_normalizer:
156
  print("Error, text normalizer is not initialized !!!")
157
  return ""
158
+
159
+ # Handle Devanagari/Hindi text first
160
+ if self.is_devanagari(text):
161
+ print(">> Detected Devanagari script, applying Hindi transliteration...")
162
+ result = self.transliterate_devanagari_to_phoneme(text)
163
+ # After transliteration, treat as English for further normalization
164
+ try:
165
+ result = self.en_normalizer.normalize(result)
166
+ except Exception:
167
+ print(traceback.format_exc())
168
+ return result
169
+
170
  if self.use_chinese(text):
171
  text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
172
  replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())