tts

Running

App Files Community

D3vShoaib committed on Jan 14

Commit

da2a3d2

1 Parent(s): ba5e798

added voice-cloning

Browse files

Files changed (2) hide show

.gitignore +18 -0
app.py +74 -17

.gitignore ADDED Viewed

	@@ -0,0 +1,18 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# Virtual environments
+venv/
+.venv/
+env/
+ENV/
+# Environment variables
+.env
+# Distribution / packaging
+dist/
+build/
+*.egg-info/

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import gradio as gr
 from pocket_tts import TTSModel
 # Load model once at startup
@@ -8,13 +9,30 @@ print("Model loaded.")
 VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
-def generate_speech(text, voice):
     if not text:
         return None
-    voice_state = model.get_state_for_audio_prompt(voice)
-    audio = model.generate_audio(voice_state, text)
-    return (model.sample_rate, audio.cpu().numpy())
 # Load custom theme with fallback
 try:
@@ -131,6 +149,17 @@ footer {visibility: hidden}
         padding: 20px;
     }
 }
 """
 with gr.Blocks() as demo:
@@ -171,12 +200,27 @@ with gr.Blocks() as demo:
                 lines=8,
                 elem_id="text-input"
             )
-            voice_select = gr.Dropdown(
-                choices=VOICES,
-                value="alba",
-                label="Select Voice",
-                elem_id="voice-select"
             )
             with gr.Row():
                 clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                 generate_btn = gr.Button("⚡ Generate", variant="primary")
@@ -197,11 +241,11 @@ with gr.Blocks() as demo:
     gr.Examples(
         examples=[
-            ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "alba"],
-            ["The quick brown fox jumps over the lazy dog.", "marius"],
-            ["Would you like some tea? It's freshly brewed.", "javert"]
         ],
-        inputs=[text_input, voice_select],
     )
     gr.HTML("""
@@ -225,22 +269,35 @@ with gr.Blocks() as demo:
         </div>
     """)
     # Event handlers
     generate_btn.click(
         fn=generate_speech,
-        inputs=[text_input, voice_select],
         outputs=audio_output
     )
     text_input.submit(
         fn=generate_speech,
-        inputs=[text_input, voice_select],
         outputs=audio_output
     )
     clear_btn.click(
-        fn=lambda: ("", "alba", None),
-        outputs=[text_input, voice_select, audio_output]
     )
 if __name__ == "__main__":

 import gradio as gr
+import numpy as np
 from pocket_tts import TTSModel
 # Load model once at startup
 VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
+def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
+    """Synthesize speech for ``text`` and return it in Gradio's audio format.
+
+    Args:
+        text: Text to speak. Empty or missing text produces no audio.
+        voice_mode: "Kyutai Voices" selects ``voice_dropdown``; any other
+            value selects ``voice_upload`` as the voice prompt.
+        voice_dropdown: Built-in voice name (the UI offers entries of
+            ``VOICES``).
+        voice_upload: Path of the uploaded reference audio (the UI's Audio
+            component uses ``type="filepath"``), or None if nothing was
+            uploaded.
+
+    Returns:
+        A ``(sample_rate, int16 samples)`` tuple, or None when there is no
+        text, no uploaded voice in cloning mode, or generation fails.
+    """
     if not text:
         return None
+    try:
+        if voice_mode == "Kyutai Voices":
+            voice_path = voice_dropdown
+        else:
+            if not voice_upload:
+                return None
+            voice_path = voice_upload
+        print(f"Generating with voice: {voice_path}")
+        voice_state = model.get_state_for_audio_prompt(voice_path)
+        audio = model.generate_audio(voice_state, text)
+        # Convert to 16-bit PCM to avoid Gradio warnings
+        audio_np = audio.cpu().numpy()
+        # Clip before scaling: a sample outside [-1, 1] would overflow the
+        # int16 range on the cast and come out as a loud click.
+        # NOTE(review): presumably the model emits floats nominally in
+        # [-1, 1] -- confirm against pocket_tts.
+        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)
+        return (model.sample_rate, audio_int16)
+    except Exception as e:
+        # Best-effort: log the failure and leave the audio output empty.
+        print(f"Error generating speech: {e}")
+        return None
 # Load custom theme with fallback
 try:
         padding: 20px;
     }
 }
+#voice-mode .wrap {
+    display: flex !important;
+    flex-direction: row !important;
+    width: 100% !important;
+}
+#voice-mode .wrap label {
+    flex: 1 !important;
+    justify-content: center !important;
+    text-align: center !important;
+}
 """
 with gr.Blocks() as demo:
                 lines=8,
                 elem_id="text-input"
             )
+            voice_mode = gr.Radio(
+                choices=["Kyutai Voices", "Voice Cloning"],
+                value="Kyutai Voices",
+                label="Voice Mode",
+                elem_id="voice-mode"
             )
+            with gr.Column(visible=True) as standard_voice_col:
+                voice_select = gr.Dropdown(
+                    choices=VOICES,
+                    value="alba",
+                    label="Select from Kyutai Voices",
+                    elem_id="voice-select"
+                )
+            with gr.Column(visible=False) as cloning_voice_col:
+                voice_upload = gr.Audio(
+                    label="Upload Voice for Cloning (WAV/MP3)",
+                    type="filepath",
+                    elem_id="voice-upload"
+                )
             with gr.Row():
                 clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                 generate_btn = gr.Button("⚡ Generate", variant="primary")
     gr.Examples(
         examples=[
+            ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None],
+            ["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None],
+            ["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None]
         ],
+        inputs=[text_input, voice_mode, voice_select, voice_upload],
     )
     gr.HTML("""
         </div>
     """)
+    # Visibility Toggling
+    def update_voice_ui(mode):
+        """Return visibility updates for the two voice-source columns.
+
+        The first update targets the built-in voice column and the second
+        the cloning column; exactly one of them is made visible.
+        """
+        show_builtin = mode == "Kyutai Voices"
+        return gr.update(visible=show_builtin), gr.update(visible=not show_builtin)
+    voice_mode.change(
+        fn=update_voice_ui,
+        inputs=[voice_mode],
+        outputs=[standard_voice_col, cloning_voice_col]
+    )
     # Event handlers
     generate_btn.click(
         fn=generate_speech,
+        inputs=[text_input, voice_mode, voice_select, voice_upload],
         outputs=audio_output
     )
     text_input.submit(
         fn=generate_speech,
+        inputs=[text_input, voice_mode, voice_select, voice_upload],
         outputs=audio_output
     )
     clear_btn.click(
+        fn=lambda: ("", "Kyutai Voices", "alba", None, None),
+        outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output]
     )
 if __name__ == "__main__":