Spaces:

FresherDifference
/

Pocket-TTS

Sleeping

App Files Files Community

FresherDifference committed on Jan 15

Commit

909f184

verified ·

1 Parent(s): 0513ad4

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -51

app.py CHANGED Viewed

@@ -1,101 +1,120 @@
 import gradio as gr
 import numpy as np
 from pocket_tts import TTSModel
-# 1. Load the model once at startup (Global scope)
-# This prevents reloading the 100M parameters on every click, making it much faster.
 print("Loading Pocket-TTS model...")
 tts = TTSModel.load_model()
 print("Model loaded successfully.")
-# Define some preset voices available in the Kyutai library
-# Note: You can find more voices or exact paths in the kyutai/tts-voices repo
 PRESET_VOICES = {
-    "Alba (American English)": "hf://kyutai/tts-voices/alba-mackenna/casual.wav",
-    "Marius (French Accent)": "hf://kyutai/tts-voices/marius-reynaud/casual.wav",
-    "Jean (Narrator)": "hf://kyutai/tts-voices/jean-dormeuil/casual.wav",
-    "Fantine": "hf://kyutai/tts-voices/fantine-chevallier/casual.wav",
 }
-def generate_speech(text, voice_choice, custom_voice_file):
     """
-    Generates audio from text using either a preset voice or a custom uploaded file.
     """
     if not text.strip():
-        raise gr.Error("Please enter some text to generate speech.")
-    # Determine which voice to use
-    voice_path = None
-    # Priority: Custom file > Preset selection
-    if custom_voice_file is not None:
-        print(f"Using custom voice cloning from: {custom_voice_file}")
-        voice_path = custom_voice_file
-    else:
-        print(f"Using preset voice: {voice_choice}")
-        voice_path = PRESET_VOICES.get(voice_choice)
-    if not voice_path:
-        raise gr.Error("Please select a voice or upload a reference audio file.")
-    # 2. Process the voice prompt
-    # This converts the wav file (or HF path) into the conditioning vector
-    try:
-        voice_state = tts.get_state_for_audio_prompt(voice_path)
-    except Exception as e:
-        raise gr.Error(f"Error loading voice: {str(e)}")
-    # 3. Generate Audio
-    # The output is a torch tensor, we need to convert it to numpy for Gradio
-    try:
-        audio_tensor = tts.generate_audio(voice_state, text)
     except Exception as e:
-        raise gr.Error(f"Generation failed: {str(e)}")
-    # Convert torch tensor to numpy array
-    # pocket-tts usually returns (samples,) shape. Gradio expects (sample_rate, data)
-    audio_numpy = audio_tensor.numpy()
-    # Return tuple (sample_rate, audio_data)
-    return (tts.sample_rate, audio_numpy)
-# 4. Build the Gradio Interface
 with gr.Blocks(title="Pocket-TTS Demo") as demo:
-    gr.Markdown("# 🗣️ Pocket-TTS on CPU")
-    gr.Markdown("A lightweight, 100M parameter text-to-speech model that runs purely on CPU.")
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to Speak",
-                placeholder="Type something here...",
                 lines=4,
-                value="Pocket TTS is amazing because it runs efficiently on consumer hardware!"
             )
             with gr.Accordion("Voice Settings", open=True):
                 voice_dropdown = gr.Dropdown(
                     choices=list(PRESET_VOICES.keys()),
                     value="Alba (American English)",
-                    label="Choose a Preset Voice"
                 )
                 gr.Markdown("**OR**")
                 voice_upload = gr.Audio(
-                    label="Clone a Custom Voice (Upload .wav)",
                     type="filepath"
                 )
             submit_btn = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
-            audio_output = gr.Audio(label="Generated Speech", type="numpy")
-    # Connect the button
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice_dropdown, voice_upload],
         outputs=audio_output
     )
-# Launch the app
 demo.launch()

 import gradio as gr
 import numpy as np
+import os
+from pydub import AudioSegment
 from pocket_tts import TTSModel
+# 1. Load the model
+# Loaded once at module import so every request reuses the same `tts` instance
+# instead of reloading the model on each button click.
 print("Loading Pocket-TTS model...")
+# Ensure you have HF_TOKEN in your Space Secrets for cloning to work
 tts = TTSModel.load_model()
 print("Model loaded successfully.")
+# 2. Define Presets (Simple Strings Only)
+# We map the display name to the internal ID string.
+# We do NOT use URLs here to avoid 404 errors.
+# The keys double as the dropdown choices in the interface below.
 PRESET_VOICES = {
+    "Alba (American English)": "alba",
+    "Marius (French)": "marius",
+    "Jean (Narrator)": "jean",
+    "Fantine": "fantine",
+    "Javert": "javert",
+    "Cosette": "cosette",
+    "Eponine": "eponine",
+    "Azelma": "azelma",
 }
+def preprocess_audio(filepath):
     """
+    Convert an uploaded recording (MP3, M4A, WAV) into a mono,
+    16-bit PCM WAV file.
+
+    Only the channel count and sample width are normalised; the sample
+    rate of the input is left unchanged.
+
+    Args:
+        filepath: Path of the uploaded audio file.
+
+    Returns:
+        Path of the converted file: `filepath` with "_fixed.wav" appended.
+
+    Raises:
+        gr.Error: If the file cannot be decoded or the WAV cannot be written.
     """
+    try:
+        print(f"Converting file: {filepath}")
+        audio = AudioSegment.from_file(filepath)
+        # Rewriting as plain mono 16-bit WAV prevents "RIFF id" errors
+        audio = audio.set_channels(1).set_sample_width(2)
+        output_path = filepath + "_fixed.wav"
+        # export() hands back the open output file; close it so the handle
+        # is released and the data is flushed before the path is reused.
+        audio.export(output_path, format="wav").close()
+        print(f"Converted to: {output_path}")
+        return output_path
+    except Exception as e:
+        # Chain the cause so the original traceback survives in the logs.
+        raise gr.Error(f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}") from e
+def generate_speech(text, voice_choice, custom_voice_file):
+    """
+    Synthesize `text` with an uploaded reference voice or a preset voice.
+
+    An uploaded file takes priority over the dropdown selection.
+
+    Args:
+        text: Text to speak; must contain non-whitespace characters.
+        voice_choice: Display name picked in the dropdown (a key of
+            PRESET_VOICES). Ignored when `custom_voice_file` is given.
+        custom_voice_file: Path of an uploaded reference recording, or None.
+
+    Returns:
+        A `(sample_rate, samples)` tuple for the Gradio audio output.
+
+    Raises:
+        gr.Error: On empty text, an unknown preset, or any conversion,
+            voice-loading or generation failure.
+    """
+    # Guard against None as well as blank input before calling .strip().
+    if not text or not text.strip():
+        raise gr.Error("Please enter some text.")
+    converted_path = None
+    try:
+        # LOGIC BRANCH 1: Custom Voice Upload
+        if custom_voice_file is not None:
+            print("--- Mode: Voice Cloning ---")
+            # Fix the audio file (Fixes 'RIFF id' error)
+            converted_path = preprocess_audio(custom_voice_file)
+            voice_source = converted_path
+        # LOGIC BRANCH 2: Built-in Preset
+        else:
+            print("--- Mode: Preset Voice ---")
+            voice_source = PRESET_VOICES.get(voice_choice)
+            if voice_source is None:
+                raise gr.Error("Please select a voice or upload a reference audio file.")
+            print(f"Using Internal ID: {voice_source}")
+        # generate_audio() is given a voice state, never a bare name or path,
+        # so presets and uploads are both resolved to a state first.
+        voice_state = tts.get_state_for_audio_prompt(voice_source)
+        audio_tensor = tts.generate_audio(voice_state, text)
+        return (tts.sample_rate, audio_tensor.numpy())
+    except gr.Error:
+        # Already a user-facing message; do not wrap it a second time.
+        raise
     except Exception as e:
+        # Print full error to logs for debugging
+        import traceback
+        traceback.print_exc()
+        raise gr.Error(f"Generation Error: {str(e)}") from e
+    finally:
+        # The converted copy is only needed to build the voice state;
+        # removal is best-effort and must not mask the real outcome.
+        if converted_path is not None:
+            try:
+                os.remove(converted_path)
+            except OSError:
+                pass
+# 3. Build Interface
+# One row with two columns: the first holds the inputs, the second the result.
 with gr.Blocks(title="Pocket-TTS Demo") as demo:
+    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
+    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to Speak",
                 lines=4,
+                value="This is a test of the pocket text to speech system."
             )
             with gr.Accordion("Voice Settings", open=True):
+                # Choices are the PRESET_VOICES display names.
                 voice_dropdown = gr.Dropdown(
                     choices=list(PRESET_VOICES.keys()),
                     value="Alba (American English)",
+                    label="Use a Preset Voice"
                 )
                 gr.Markdown("**OR**")
+                # type="filepath": generate_speech receives a path, which it
+                # passes to preprocess_audio.
                 voice_upload = gr.Audio(
+                    label="Clone a Voice (Upload any audio)",
                     type="filepath"
                 )
             submit_btn = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
+            # type="numpy" matches the (sample_rate, array) tuple that
+            # generate_speech returns.
+            audio_output = gr.Audio(label="Result", type="numpy")
+    # Input order must match generate_speech(text, voice_choice, custom_voice_file).
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice_dropdown, voice_upload],
         outputs=audio_output
     )
+# Start the Gradio app.
 demo.launch()