IndexTTS-2-Demo

Paused

App Files Files Community

vasugo05 committed on Dec 27, 2025

Commit

89f1dfc

verified ·

1 Parent(s): 237b467

Update indextts/utils/front.py

Browse files

Files changed (1) hide show

indextts/utils/front.py +53 -0

indextts/utils/front.py CHANGED Viewed

@@ -7,6 +7,15 @@ import warnings
 from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
 from sentencepiece import SentencePieceProcessor
 class TextNormalizer:
     def __init__(self):
@@ -59,6 +68,34 @@ class TextNormalizer:
         pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
         return re.match(pattern, email) is not None
     PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
     """
     匹配拼音声调格式：pinyin+数字，声调1-5，5表示轻声
@@ -76,6 +113,10 @@ class TextNormalizer:
     def use_chinese(self, s):
         has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
         has_alpha = bool(re.search(r"[a-zA-Z]", s))
         is_email = self.match_email(s)
@@ -114,6 +155,18 @@ class TextNormalizer:
         if not self.zh_normalizer or not self.en_normalizer:
             print("Error, text normalizer is not initialized !!!")
             return ""
         if self.use_chinese(text):
             text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
             replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())

 from indextts.utils.common import tokenize_by_CJK_char, de_tokenized_by_CJK_char
 from sentencepiece import SentencePieceProcessor
+# Optional Hindi/Devanagari support: a missing indic_transliteration package only disables the feature.
+try:
+    from indic_transliteration import sanscript
+    from indic_transliteration.sanscript import transliterate
+    HINDI_SUPPORT = True  # set only after both imports above succeeded
+except ImportError:
+    HINDI_SUPPORT = False  # transliterate_devanagari_to_phoneme checks this flag and returns its input unchanged
+    print("Warning: indic-transliteration not installed. Hindi support disabled.")
 class TextNormalizer:
     def __init__(self):
         pattern = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
         return re.match(pattern, email) is not None
+    def is_devanagari(self, text):
+        """Check if text contains Devanagari script (Hindi, Sanskrit, Marathi, Nepali, etc.)"""
+        # Devanagari Unicode range: U+0900 to U+097F
+        return bool(re.search(r"[\u0900-\u097F]", text))
+    def transliterate_devanagari_to_phoneme(self, text):
+        """Transliterate Devanagari script to romanized phonemes for TTS processing"""
+        if not HINDI_SUPPORT:
+            print("Warning: Hindi transliteration not available. Install 'indic-transliteration' package.")
+            return text
+        # Transliterate Devanagari to ITRANS (phoneme-friendly romanization)
+        # ITRANS is better for TTS as it preserves phonetic information
+        romanized = transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS)
+        # Clean up ITRANS output for better TTS compatibility
+        # Convert to lowercase and handle special markers
+        romanized = romanized.replace("~", "n")  # Nasalization markers
+        romanized = romanized.replace("^", "")   # Remove accent markers
+        romanized = romanized.replace("M", "m")  # Normalize anusvara
+        romanized = romanized.replace("H", "h")  # Normalize visarga
+        # Apply character replacement map to romanized text
+        pattern = re.compile("|".join(re.escape(p) for p in self.char_rep_map.keys()))
+        romanized = pattern.sub(lambda x: self.char_rep_map[x.group()], romanized)
+        return romanized
     PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
     """
     匹配拼音声调格式：pinyin+数字，声调1-5，5表示轻声
     def use_chinese(self, s):
+        # First check if it's Devanagari script (Hindi, etc.)
+        if self.is_devanagari(s):
+            return False  # Don't use Chinese normalizer for Hindi
         has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
         has_alpha = bool(re.search(r"[a-zA-Z]", s))
         is_email = self.match_email(s)
         if not self.zh_normalizer or not self.en_normalizer:
             print("Error, text normalizer is not initialized !!!")
             return ""
+        # Handle Devanagari/Hindi text first
+        if self.is_devanagari(text):
+            print(">> Detected Devanagari script, applying Hindi transliteration...")
+            result = self.transliterate_devanagari_to_phoneme(text)
+            # After transliteration, treat as English for further normalization
+            try:
+                result = self.en_normalizer.normalize(result)
+            except Exception:
+                print(traceback.format_exc())
+            return result
         if self.use_chinese(text):
             text = re.sub(TextNormalizer.ENGLISH_CONTRACTION_PATTERN, r"\1 is", text, flags=re.IGNORECASE)
             replaced_text, pinyin_list = self.save_pinyin_tones(text.rstrip())