Spaces:

FresherDifference
/

Pocket-TTS

Sleeping

App Files Files Community

FresherDifference committed 24 days ago

Commit

f48358e

verified ·

1 Parent(s): 4335f64

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -65

app.py CHANGED Viewed

@@ -1,72 +1,101 @@
 import gradio as gr
-import tempfile
-import soundfile as sf
 from pocket_tts import TTSModel
-# -------------------------------------------------
-# Load model ONCE
-# -------------------------------------------------
-model = TTSModel.load_model()
-# -------------------------------------------------
-# HF-safe catalog voices
-# -------------------------------------------------
-VOICES = [
-    "alba",
-    "marius",
-    "javert",
-    "jean",
-    "fantine",
-    "cosette",
-    "eponine",
-    "azelma",
-]
-def generate_tts(text, voice):
     if not text.strip():
-        return None
-    # ✅ Step 1: get model state from catalog voice
-    state = model.get_state_for_voice(voice)
-    # ✅ Step 2: generate audio from state + text
-    audio = model.generate_audio(state, text)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmp.name, audio, samplerate=24000)
-    return tmp.name
-with gr.Blocks(title="Pocket TTS (Correct API)") as demo:
-    gr.Markdown(
-        """
-        # 🗣️ Pocket TTS
-        **HF Spaces compatible – catalog voices**
-        """
-    )
-    voice_select = gr.Dropdown(
-        choices=VOICES,
-        value="alba",
-        label="Voice"
-    )
-    text_input = gr.Textbox(
-        label="Text",
-        lines=4,
-        placeholder="Type something to hear it spoken"
-    )
-    generate_btn = gr.Button("Generate")
-    audio_output = gr.Audio(label="Output")
-    generate_btn.click(
-        fn=generate_tts,
-        inputs=[text_input, voice_select],
         outputs=audio_output
     )
-demo.launch()

 import gradio as gr
+import numpy as np
 from pocket_tts import TTSModel
+# 1. Load the model once at startup (global scope).
+# This avoids reloading the 100M-parameter model on every click, which makes each request much faster.
+print("Loading Pocket-TTS model...")
+tts = TTSModel.load_model()
+print("Model loaded successfully.")
+# Define some preset voices available in the Kyutai library
+# Note: You can find more voices or exact paths in the kyutai/tts-voices repo
+PRESET_VOICES = {
+    "Alba (American English)": "hf://kyutai/tts-voices/alba-mackenna/casual.wav",
+    "Marius (French Accent)": "hf://kyutai/tts-voices/marius-reynaud/casual.wav",
+    "Jean (Narrator)": "hf://kyutai/tts-voices/jean-dormeuil/casual.wav",
+    "Fantine": "hf://kyutai/tts-voices/fantine-chevallier/casual.wav",
+}
+def generate_speech(text, voice_choice, custom_voice_file):
+    """
+    Generates audio from text using either a preset voice or a custom uploaded file.
+    """
     if not text.strip():
+        raise gr.Error("Please enter some text to generate speech.")
+    # Determine which voice to use
+    voice_path = None
+    # Priority: Custom file > Preset selection
+    if custom_voice_file is not None:
+        print(f"Using custom voice cloning from: {custom_voice_file}")
+        voice_path = custom_voice_file
+    else:
+        print(f"Using preset voice: {voice_choice}")
+        voice_path = PRESET_VOICES.get(voice_choice)
+    if not voice_path:
+        raise gr.Error("Please select a voice or upload a reference audio file.")
+    # 2. Process the voice prompt
+    # This converts the wav file (or HF path) into the conditioning vector
+    try:
+        voice_state = tts.get_state_for_audio_prompt(voice_path)
+    except Exception as e:
+        raise gr.Error(f"Error loading voice: {str(e)}")
+    # 3. Generate Audio
+    # The output is a torch tensor; convert it to a NumPy array for Gradio.
+    try:
+        audio_tensor = tts.generate_audio(voice_state, text)
+    except Exception as e:
+        raise gr.Error(f"Generation failed: {str(e)}")
+    # Convert torch tensor to numpy array
+    # pocket-tts usually returns a tensor of shape (samples,); Gradio expects a (sample_rate, data) tuple.
+    audio_numpy = audio_tensor.numpy()
+    # Return tuple (sample_rate, audio_data)
+    return (tts.sample_rate, audio_numpy)
+# 4. Build the Gradio Interface
+with gr.Blocks(title="Pocket-TTS Demo") as demo:
+    gr.Markdown("# 🗣️ Pocket-TTS on CPU")
+    gr.Markdown("A lightweight, 100M parameter text-to-speech model that runs purely on CPU.")
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Text to Speak",
+                placeholder="Type something here...",
+                lines=4,
+                value="Pocket TTS is amazing because it runs efficiently on consumer hardware!"
+            )
+            with gr.Accordion("Voice Settings", open=True):
+                voice_dropdown = gr.Dropdown(
+                    choices=list(PRESET_VOICES.keys()),
+                    value="Alba (American English)",
+                    label="Choose a Preset Voice"
+                )
+                gr.Markdown("**OR**")
+                voice_upload = gr.Audio(
+                    label="Clone a Custom Voice (Upload .wav)",
+                    type="filepath"
+                )
+            submit_btn = gr.Button("Generate Audio", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Speech", type="numpy")
+    # Connect the button
+    submit_btn.click(
+        fn=generate_speech,
+        inputs=[text_input, voice_dropdown, voice_upload],
         outputs=audio_output
     )
+# Launch the app
+demo.launch()