high77 committed on
Commit
ee96f4d
·
verified ·
1 Parent(s): 7939a73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -75
app.py CHANGED
@@ -8,11 +8,20 @@ import gradio as gr
8
  import soundfile as sf
9
  from transformers import AutoModel
10
  from typing import Tuple
 
 
11
 
12
- # ---------- LANGUAGE DETECTION (11 INDIAN LANGUAGES ONLY) ----------
13
  def detect_language_from_text(text: str) -> str:
14
- """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te."""
15
- # 11 Indian scripts Latin (English) is **not** included
 
 
 
 
 
 
 
16
  scripts = {
17
  'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
18
  'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
@@ -30,37 +39,26 @@ def detect_language_from_text(text: str) -> str:
30
  for lang, chars in scripts.items():
31
  if txt & chars:
32
  return lang
33
- # Default to Hindi (most data) if nothing matches
34
  return 'hi'
35
 
36
- # ---------- PROSODY NEUTRALISER (GPU MEMORY LIGHT) ----------
37
- class ProsodyNeutraliser:
38
- def __init__(self):
39
- self.sr = 24_000
40
-
41
- def neutralise_prosody(self, audio: np.ndarray, src_sr: int) -> Tuple[int, np.ndarray]:
42
- """Flatten prosody (speaker voice kept)."""
43
- if audio.dtype != np.float32:
44
- audio = audio.astype(np.float32)
45
- if src_sr != self.sr:
46
- import torchaudio
47
- audio = torchaudio.functional.resample(torch.from_numpy(audio), src_sr, self.sr).numpy()
48
- # very light pitch/energy flattening
49
- f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
50
- fmax=librosa.note_to_hz('C7'), sr=self.sr)
51
- mask = ~np.isnan(f0)
52
- if mask.sum() > 2:
53
- f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
54
- from scipy.ndimage import gaussian_filter1d
55
- f0_smooth = gaussian_filter1d(f0_interp, sigma=7)
56
- audio = self._flatten_energy(audio)
57
- return self.sr, audio
58
-
59
- def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
60
- rms = librosa.feature.rms(y=audio, hop_length=512)[0]
61
- rms_mean = rms.mean()
62
- rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
63
- return audio * np.interp(np.arange(len(audio)), np.linspace(0, len(audio), len(rms)), rms_flat / rms)
64
 
65
  # Function to load reference audio from URL
66
  def load_audio_from_url(url):
@@ -72,27 +70,45 @@ def load_audio_from_url(url):
72
 
73
  @spaces.GPU
74
  def synthesize_speech(text, ref_audio, ref_text):
75
- if ref_audio is None or ref_text.strip() == "":
76
- return "Error: Please provide a reference audio and its corresponding text."
77
-
78
- # Ensure valid reference audio input
 
 
 
 
 
79
  if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
80
  sample_rate, audio_data = ref_audio
81
  else:
82
- return "Error: Invalid reference audio input."
83
 
84
- # Save reference audio directly without resampling
85
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
86
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
87
  temp_audio.flush()
88
 
89
- audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
 
 
 
 
 
90
 
91
- # Normalize output and save
92
  if audio.dtype == np.int16:
93
  audio = audio.astype(np.float32) / 32768.0
94
 
95
- return 24000, audio
 
 
 
 
 
 
 
 
96
 
97
 
98
  # Load TTS model
@@ -102,7 +118,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
102
  print("Device", device)
103
  model = model.to(device)
104
 
105
- # ---------- PRE-FETCH EXAMPLES (ONLY ODIA SYNTH TEXT) ----------
106
  EXAMPLES = [
107
  {
108
  "audio_name": "PAN_F (Happy)",
@@ -116,24 +132,6 @@ EXAMPLES = [
116
  "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
117
  "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
118
  },
119
- {
120
- "audio_name": "MAR_F (WIKI)",
121
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
122
- "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
123
- "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
124
- },
125
- {
126
- "audio_name": "MAR_M (WIKI)",
127
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
128
- "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
129
- "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
130
- },
131
- {
132
- "audio_name": "KAN_F (Happy)",
133
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
134
- "ref_text": "ನಮ್‌ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂలಿಂಗ್‌ ಸమಸ്യೆ ಆಗಿ ನಾನ್‌ ಭಾಳ ದినದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆకానిక್ ಆಗಿರೋ ನిమ್‌ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬోదು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
135
- "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
136
- },
137
  ]
138
 
139
  # Preload all example audios
@@ -143,29 +141,29 @@ for example in EXAMPLES:
143
  example["audio_data"] = audio_data
144
 
145
 
146
- # Define Gradio interface with layout adjustments
147
  with gr.Blocks() as iface:
148
  gr.Markdown(
149
  """
150
- # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
151
- [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
152
- We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
153
- IndicF5 supports **11 Indian languages**:
154
- **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
155
-
156
- Generate speech using a reference prompt audio and its corresponding text.
157
  """
158
  )
159
 
160
  with gr.Row():
161
  with gr.Column():
162
- text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
163
- ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
164
- ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
165
  submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
166
 
167
  with gr.Column():
168
- output_audio = gr.Audio(label="Generated Speech", type="numpy")
 
 
169
 
170
  # Add multiple examples
171
  examples = [
@@ -175,10 +173,14 @@ with gr.Blocks() as iface:
175
  gr.Examples(
176
  examples=examples,
177
  inputs=[text_input, ref_audio_input, ref_text_input],
178
- label="Choose an example:"
179
  )
180
 
181
- submit_btn.click(synthesize_speech, inputs=[text_input, ref_audio_input, ref_text_input], outputs=[output_audio])
182
-
 
 
 
 
183
 
184
  iface.launch(share=True)
 
8
  import soundfile as sf
9
  from transformers import AutoModel
10
  from typing import Tuple
11
+ import uuid
12
+ import os
13
 
14
+ # ---------- LANGUAGE DETECTION (UPDATED TO ALLOW ENGLISH) ----------
15
  def detect_language_from_text(text: str) -> str:
16
+ """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te, OR 'en'."""
17
+ # 1. Check for English (Latin Script) first
18
+ latin_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
19
+ text_chars = set(text)
20
+ # If text has significant Latin characters, treat as English
21
+ if len(text_chars) > 0 and (len(text_chars & latin_chars) / len(text_chars)) > 0.3:
22
+ return "en"
23
+
24
+ # 2. Check Indian scripts
25
  scripts = {
26
  'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
27
  'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
 
39
  for lang, chars in scripts.items():
40
  if txt & chars:
41
  return lang
42
+ # Default to Hindi if nothing matches
43
  return 'hi'
44
 
45
+ # ---------- TEXT PACER (HELPS PREVENT SKIPPING) ----------
46
+ def slow_down_text(text):
47
+ """
48
+ Adds pauses to force the model to take its time processing complex scripts.
49
+ """
50
+ if not text:
51
+ return ""
52
+ # Add a comma (pause) after every 3 words to force a breather
53
+ words = text.split()
54
+ paced_text = ""
55
+ for i, word in enumerate(words):
56
+ paced_text += word + " "
57
+ if (i + 1) % 3 == 0:
58
+ paced_text += ", "
59
+
60
+ # Add padding at start/end
61
+ return f". . . {paced_text} . . ."
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # Function to load reference audio from URL
64
  def load_audio_from_url(url):
 
70
 
71
  @spaces.GPU
72
  def synthesize_speech(text, ref_audio, ref_text):
73
+ # 1. Basic Validation
74
+ if ref_audio is None:
75
+ raise gr.Error("Please upload a Reference Audio file.")
76
+ if ref_text.strip() == "":
77
+ raise gr.Error("Please enter the text transcript for the Reference Audio.")
78
+ if text.strip() == "":
79
+ raise gr.Error("Please enter the text you want to generate.")
80
+
81
+ # 2. Reference Audio Processing
82
  if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
83
  sample_rate, audio_data = ref_audio
84
  else:
85
+ raise gr.Error("Invalid reference audio input.")
86
 
87
+ # Save reference audio to temp file
88
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
89
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
90
  temp_audio.flush()
91
 
92
+ # 3. Apply Text Pacing (The "Skipping" Fix)
93
+ safe_text = slow_down_text(text)
94
+
95
+ # 4. Generate Audio
96
+ # Note: We are using safe_text for generation
97
+ audio = model(safe_text, ref_audio_path=temp_audio.name, ref_text=ref_text)
98
 
99
+ # 5. Normalize Output
100
  if audio.dtype == np.int16:
101
  audio = audio.astype(np.float32) / 32768.0
102
 
103
+ # 6. Save Output to File (The "Download" Fix)
104
+ # We save the generated audio to a file so we can provide a download link
105
+ output_filename = f"generated_{uuid.uuid4().hex}.wav"
106
+ output_path = os.path.join(tempfile.gettempdir(), output_filename)
107
+
108
+ sf.write(output_path, audio, 24000)
109
+
110
+ # Return the file path twice: once for the player, once for the download button
111
+ return output_path, output_path
112
 
113
 
114
  # Load TTS model
 
118
  print("Device", device)
119
  model = model.to(device)
120
 
121
+ # ---------- PRE-FETCH EXAMPLES ----------
122
  EXAMPLES = [
123
  {
124
  "audio_name": "PAN_F (Happy)",
 
132
  "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
133
  "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
134
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  ]
136
 
137
  # Preload all example audios
 
141
  example["audio_data"] = audio_data
142
 
143
 
144
+ # Define Gradio interface
145
  with gr.Blocks() as iface:
146
  gr.Markdown(
147
  """
148
+ # **IndicF5 Dubbing Studio**
149
+ **Instructions for Best Results:**
150
+ 1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
151
+ 2. **Reference Text:** Must match the audio exactly.
152
+ 3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
 
 
153
  """
154
  )
155
 
156
  with gr.Row():
157
  with gr.Column():
158
+ text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
159
+ ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
160
+ ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
161
  submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
162
 
163
  with gr.Column():
164
+ output_audio = gr.Audio(label="Play Generated Speech", type="filepath")
165
+ # This is the dedicated download button
166
+ output_file = gr.File(label="Download Audio File", file_count="single")
167
 
168
  # Add multiple examples
169
  examples = [
 
173
  gr.Examples(
174
  examples=examples,
175
  inputs=[text_input, ref_audio_input, ref_text_input],
176
+ label="Quick Examples"
177
  )
178
 
179
+ # When clicked, return audio to Player AND File Downloader
180
+ submit_btn.click(
181
+ synthesize_speech,
182
+ inputs=[text_input, ref_audio_input, ref_text_input],
183
+ outputs=[output_audio, output_file]
184
+ )
185
 
186
  iface.launch(share=True)