Spaces: ai4bharat/IndicF5

Running on Zero

App | Files | Community

Fix transformers 5.0.0 compatibility

by zrini2005 - opened Feb 6

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+58 -27

Files changed (2) hide show

app.py +57 -26
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -19,34 +19,61 @@ def load_audio_from_url(url):
 @spaces.GPU
 def synthesize_speech(text, ref_audio, ref_text):
-    if ref_audio is None or ref_text.strip() == "":
-        return "Error: Please provide a reference audio and its corresponding text."
-    # Ensure valid reference audio input
-    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
-        sample_rate, audio_data = ref_audio
-    else:
-        return "Error: Invalid reference audio input."
-    # Save reference audio directly without resampling
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
-        temp_audio.flush()
-    audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
-    # Normalize output and save
-    if audio.dtype == np.int16:
-        audio = audio.astype(np.float32) / 32768.0
-    return 24000, audio
-# Load TTS model
 repo_id = "ai4bharat/IndicF5"
-model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("Device", device)
 model = model.to(device)
 # Example Data (Multiple Examples)
@@ -87,6 +114,13 @@ EXAMPLES = [
 # Preload all example audios
 for example in EXAMPLES:
     sample_rate, audio_data = load_audio_from_url(example["audio_url"])
     example["sample_rate"] = sample_rate
     example["audio_data"] = audio_data
@@ -96,11 +130,8 @@ with gr.Blocks() as iface:
     gr.Markdown(
         """
         # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
         [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
         We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
         IndicF5 supports **11 Indian languages**:
         **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
@@ -111,7 +142,7 @@ with gr.Blocks() as iface:
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
-            ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
             ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
             submit_btn = gr.Button("🎤 Generate Speech", variant="primary")

 @spaces.GPU
 def synthesize_speech(text, ref_audio, ref_text):
+    try:
+        if ref_audio is None or ref_text.strip() == "":
+            return "Error: Please provide a reference audio and its corresponding text."
+        # Ensure valid reference audio input
+        if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
+            sample_rate, audio_data = ref_audio
+        else:
+            return "Error: Invalid reference audio input."
+        # Save reference audio directly without resampling
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+            sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
+            temp_audio.flush()
+        audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
+        # Validate audio output
+        if audio is None or (isinstance(audio, np.ndarray) and audio.size == 0):
+            print("Error: Model returned empty audio")
+            return None
+        #print(f"DEBUG: audio dtype={audio.dtype}, shape={audio.shape}, min={audio.min()}, max={audio.max()}")
+        # Normalize output to float32
+        if audio.dtype == np.int16:
+            audio = audio.astype(np.float32) / 32768.0
+        elif audio.dtype == np.float64:
+            audio = audio.astype(np.float32)
+        elif audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+        #print(f"DEBUG: after conversion dtype={audio.dtype}, min={audio.min()}, max={audio.max()}")
+        # Ensure values are in range [-1.0, 1.0]
+        max_val = np.abs(audio).max()
+        if max_val > 0:
+            audio = audio / max_val
+        audio = np.clip(audio, -1.0, 1.0)
+        #print(f"DEBUG: after normalization min={audio.min()}, max={audio.max()}")
+        return 24000, audio
+    except Exception as e:
+        print(f"Error in synthesize_speech: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None
+# Load TTS model (patched to work with transformers 5.0.0)
 repo_id = "ai4bharat/IndicF5"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("Device", device)
+model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
 model = model.to(device)
 # Example Data (Multiple Examples)
 # Preload all example audios
 for example in EXAMPLES:
     sample_rate, audio_data = load_audio_from_url(example["audio_url"])
+    # Convert to float32 to avoid gradio warnings
+    if audio_data is not None:
+        if audio_data.dtype == np.float64:
+            audio_data = audio_data.astype(np.float32)
+        elif audio_data.dtype == np.int16:
+            audio_data = audio_data.astype(np.float32) / 32768.0
+        audio_data = np.clip(audio_data, -1.0, 1.0)
     example["sample_rate"] = sample_rate
     example["audio_data"] = audio_data
     gr.Markdown(
         """
         # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
         [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
         We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
         IndicF5 supports **11 Indian languages**:
         **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
+            ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio", sources=["microphone", "upload"])
             ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
             submit_btn = gr.Button("🎤 Generate Speech", variant="primary")

requirements.txt CHANGED Viewed

@@ -18,7 +18,7 @@ git+https://github.com/ai4bharat/IndicF5.git
 # torchaudio>=2.0.0
 # torchdiffeq
 # tqdm>=4.65.0
-transformers<4.50
 # transformers_stream_generator
 # vocos
 # wandb

 # torchaudio>=2.0.0
 # torchdiffeq
 # tqdm>=4.65.0
+transformers>=5.0.0
 # transformers_stream_generator
 # vocos
 # wandb