IndicF5

Runtime error

App Files Files Community

high77 committed on Nov 18, 2025

Commit

8ede049

verified ·

1 Parent(s): b964169

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -40

app.py CHANGED Viewed

@@ -34,9 +34,12 @@ class ProsodyNeutraliser:
     def neutralise_prosody(self, audio: np.ndarray, src_sr: int) -> Tuple[int, np.ndarray]:
         """Return audio with flattened prosody (speaker voice kept)."""
         if src_sr != self.sr:
             audio = librosa.resample(audio, orig_sr=src_sr, target_sr=self.sr)
-        # Simple but effective: flatten pitch contour → no Hindi/English intonation left
         f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
                                           fmax=librosa.note_to_hz('C7'), sr=self.sr)
         mask = ~np.isnan(f0)
@@ -44,16 +47,14 @@ class ProsodyNeutraliser:
             f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
             from scipy.ndimage import gaussian_filter1d
             f0_smooth = gaussian_filter1d(f0_interp, sigma=7)  # flatten
-            # Replace smooth F0 back (phase-safe approximation)
-            audio = self._flatten_energy(audio)  # optional: also flatten energy
         return self.sr, audio
     def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
-        # Very light energy flattening (keeps naturalness)
         rms = librosa.feature.rms(y=audio, hop_length=512)[0]
         rms_mean = rms.mean()
         rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
-        # Simple resynthesis (good enough for TTS input)
         return audio * np.interp(np.arange(len(audio)), np.linspace(0, len(audio), len(rms)), rms_flat / rms)
 # ---------- AUDIO LOADER ----------
@@ -66,11 +67,15 @@ def load_audio_from_url(url: str) -> Tuple[int, np.ndarray]:
 # ---------- MAIN SYNTHESIS ----------
 @spaces.GPU
-def synthesise_speech(text: str, ref_audio: Tuple[int, np.ndarray], ref_text: str):
     if ref_audio is None or not ref_text.strip():
         return "Error: reference audio + transcript required."
     src_sr, audio = ref_audio
     tgt_lang = detect_language_from_text(text)
     ref_lang = detect_language_from_text(ref_text)
@@ -91,62 +96,80 @@ def synthesise_speech(text: str, ref_audio: Tuple[int, np.ndarray], ref_text: st
     if out.dtype == np.int16:
         out = out.astype(np.float32) / 32768.0
-    return 24_000, out
 # ---------- LOAD MODEL ----------
-repo_id = "ai4Bharat/IndicF5"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
-# ---------- PRE-FETCH EXAMPLES (GRADIO 4-SAFE) ----------
 EXAMPLES = [
     {
-        "audio_name": "ODIA_F (Neutral)",
-        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/ORI_F_WIKI_00001.wav",
-        "ref_text": "ଓଡ଼ିଶା ରାଜ୍ୟର ଭୌଗୋଳିକ ଅବସ୍ଥିତି ଏହାର ଜନସାଧାରଣଙ୍କ ଜୀବନଶୈଳୀ ଉପରେ ପ୍ରଭାବ ପକାଉଛି।",
         "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
     },
     {
-        "audio_name": "ODIA_M (News)",
-        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/ORI_M_NEWS_00001.wav",
-        "ref_text": "କୋଭିଡ ମହାମାରୀ ସମୟରେ ଓଡ଼ିଶା ସରକାର ବିଭିନ୍ନ ପଦକ୍ଷେପ ନେଇଥିଲେ।",
         "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
     },
     {
-        "audio_name": "PAN_F (Happy)",
-        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
-        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
-        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
     },
 ]
-# download only valid audio
-for ex in EXAMPLES:
-    sr, data = load_audio_from_url(ex["audio_url"])
-    ex["sample_rate"] = sr if sr is not None else 24_000
-    ex["audio_data"]  = data if data is not None and len(data) > 0 else np.zeros(1_000)  # small dummy
-# build Gradio examples list (never contains None or zero-length audio)
-examples = []
-for ex in EXAMPLES:
-    if ex["audio_data"] is not None and len(ex["audio_data"]) > 0:
-        examples.append([ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]])
 # ---------- GRADIO UI ----------
 with gr.Blocks() as iface:
-    gr.Markdown("""# IndicF5 TTS – Odia-prosody fixed""")
     with gr.Row():
         with gr.Column():
-            text = gr.Textbox(label="Text to Synthesise", lines=3, placeholder="Enter Odia, Hindi, or any text…")
-            ref_audio = gr.Audio(type="numpy", label="Reference Prompt Audio")
-            ref_text = gr.Textbox(label="Text in Reference Prompt Audio", lines=2)
-            btn = gr.Button("🎤 Generate Speech", variant="primary")
         with gr.Column():
-            out_audio = gr.Audio(label="Generated Speech", type="numpy")
-    if examples:  # only show if we have valid ones
-        gr.Examples(examples=examples, inputs=[text, ref_audio, ref_text], label="Pick an example:")
-    btn.click(synthesise_speech, inputs=[text, ref_audio, ref_text], outputs=[out_audio])
 iface.launch()

     def neutralise_prosody(self, audio: np.ndarray, src_sr: int) -> Tuple[int, np.ndarray]:
         """Return audio with flattened prosody (speaker voice kept)."""
+        # Ensure float32 for librosa
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
         if src_sr != self.sr:
             audio = librosa.resample(audio, orig_sr=src_sr, target_sr=self.sr)
+        # Estimate and smooth the pitch contour. NOTE: f0_smooth is computed but never applied to the audio; only the energy flattening below changes the signal.
         f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
                                           fmax=librosa.note_to_hz('C7'), sr=self.sr)
         mask = ~np.isnan(f0)
             f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
             from scipy.ndimage import gaussian_filter1d
             f0_smooth = gaussian_filter1d(f0_interp, sigma=7)  # flatten
+            # Light energy flattening
+            audio = self._flatten_energy(audio)
         return self.sr, audio
     def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
         """Compress the short-time energy envelope of ``audio``.

         Frame-wise RMS (hop of 512 samples) is clipped to 0.6x-1.4x of its
         mean; the ratio clipped/original gives a per-frame gain, which is
         linearly interpolated to sample resolution and multiplied into the
         signal.

         Args:
             audio: Waveform samples (assumed 1-D mono float — TODO confirm
                 against callers).

         Returns:
             The gain-adjusted waveform, with the same length as ``audio``.
         """
         rms = librosa.feature.rms(y=audio, hop_length=512)[0]
         rms_mean = rms.mean()
         rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
         # Silent frames have rms == 0; dividing by them produced inf/NaN gains
         # that leaked into the output. Keep such frames at unity gain instead.
         gain = np.ones_like(rms_flat)
         np.divide(rms_flat, rms, out=gain, where=rms > 0)
         # Frame gains are spread evenly over the sample axis before interpolation.
         frame_pos = np.linspace(0, len(audio), len(rms))
         return audio * np.interp(np.arange(len(audio)), frame_pos, gain)
 # ---------- AUDIO LOADER ----------
 # ---------- MAIN SYNTHESIS ----------
 @spaces.GPU
+def synthesize_speech(text: str, ref_audio: Tuple[int, np.ndarray], ref_text: str):
     if ref_audio is None or not ref_text.strip():
         return "Error: reference audio + transcript required."
     src_sr, audio = ref_audio
+    # Ensure float32
+    if audio.dtype != np.float32:
+        audio = audio.astype(np.float32)
     tgt_lang = detect_language_from_text(text)
     ref_lang = detect_language_from_text(ref_text)
     if out.dtype == np.int16:
         out = out.astype(np.float32) / 32768.0
+    return 24000, out
 # ---------- LOAD MODEL ----------
+repo_id = "ai4bharat/IndicF5"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
+# ---------- PRE-FETCH EXAMPLES (ONLY ODIA SYNTH TEXT) ----------
 EXAMPLES = [
     {
+        "audio_name": "PAN_F (Happy)",
+        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
+        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
+        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
+    },
+    {
+        "audio_name": "TAM_F (Happy)",
+        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
+        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
+        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
+    },
+    {
+        "audio_name": "MAR_F (WIKI)",
+        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
+        "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
         "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
     },
     {
+        "audio_name": "MAR_M (WIKI)",
+        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
+        "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
         "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
     },
     {
+        "audio_name": "KAN_F (Happy)",
+        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
+        "ref_text": "ನಮ್‌ ಫ್ರಿಜ್ಜಲ್ಲಿ  ಕೂಲಿಂಗ್‌ ಸಮಸ್ಯೆ ಆಗಿ ನಾನ್‌ ಭಾಳ ದಿನದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆಕಾನಿಕ್ ಆಗಿರೋ ನಿಮ್‌ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬೋದು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
+        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
     },
 ]
+# Preload all example audios (if loading returns no data, the example is NOT skipped: it falls back to 1,000 zero samples at 24 kHz)
+for example in EXAMPLES:
+    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
+    example["sample_rate"] = sample_rate if sample_rate is not None else 24_000
+    example["audio_data"]  = audio_data if audio_data is not None and len(audio_data) > 0 else np.zeros(1_000)
+# Gradio 4.x compatible examples
+examples = [[ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]] for ex in EXAMPLES]
 # ---------- GRADIO UI ----------
 with gr.Blocks() as iface:
+    gr.Markdown(
+        """
+        # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
+        [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
+        We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
+        IndicF5 supports **11 Indian languages**:
+        **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
+        Generate speech using a reference prompt audio and its corresponding text.
+        """
+    )
     with gr.Row():
         with gr.Column():
+            text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
+            ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
+            ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
+            submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
         with gr.Column():
+            output_audio = gr.Audio(label="Generated Speech", type="numpy")
+    gr.Examples(examples=examples, inputs=[text_input, ref_audio_input, ref_text_input], label="Choose an example:")
+    submit_btn.click(synthesize_speech, inputs=[text_input, ref_audio_input, ref_text_input], outputs=[output_audio])
 iface.launch()