added voice-cloning
Browse files- .gitignore +18 -0
- app.py +74 -17
.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
venv/
|
| 8 |
+
.venv/
|
| 9 |
+
env/
|
| 10 |
+
ENV/
|
| 11 |
+
|
| 12 |
+
# Environment variables
|
| 13 |
+
.env
|
| 14 |
+
|
| 15 |
+
# Distribution / packaging
|
| 16 |
+
dist/
|
| 17 |
+
build/
|
| 18 |
+
*.egg-info/
|
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
from pocket_tts import TTSModel
|
| 3 |
|
| 4 |
# Load model once at startup
|
|
@@ -8,13 +9,30 @@ print("Model loaded.")
|
|
| 8 |
|
| 9 |
VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
|
| 10 |
|
| 11 |
-
def generate_speech(text,
|
| 12 |
if not text:
|
| 13 |
return None
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Load custom theme with fallback
|
| 20 |
try:
|
|
@@ -131,6 +149,17 @@ footer {visibility: hidden}
|
|
| 131 |
padding: 20px;
|
| 132 |
}
|
| 133 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
"""
|
| 135 |
|
| 136 |
with gr.Blocks() as demo:
|
|
@@ -171,12 +200,27 @@ with gr.Blocks() as demo:
|
|
| 171 |
lines=8,
|
| 172 |
elem_id="text-input"
|
| 173 |
)
|
| 174 |
-
|
| 175 |
-
choices=
|
| 176 |
-
value="
|
| 177 |
-
label="
|
| 178 |
-
elem_id="voice-
|
| 179 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
clear_btn = gr.Button("🗑️ Clear", variant="secondary")
|
| 182 |
generate_btn = gr.Button("⚡ Generate", variant="primary")
|
|
@@ -197,11 +241,11 @@ with gr.Blocks() as demo:
|
|
| 197 |
|
| 198 |
gr.Examples(
|
| 199 |
examples=[
|
| 200 |
-
["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "alba"],
|
| 201 |
-
["The quick brown fox jumps over the lazy dog.", "marius"],
|
| 202 |
-
["Would you like some tea? It's freshly brewed.", "javert"]
|
| 203 |
],
|
| 204 |
-
inputs=[text_input, voice_select],
|
| 205 |
)
|
| 206 |
|
| 207 |
gr.HTML("""
|
|
@@ -225,22 +269,35 @@ with gr.Blocks() as demo:
|
|
| 225 |
</div>
|
| 226 |
""")
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
# Event handlers
|
| 229 |
generate_btn.click(
|
| 230 |
fn=generate_speech,
|
| 231 |
-
inputs=[text_input, voice_select],
|
| 232 |
outputs=audio_output
|
| 233 |
)
|
| 234 |
|
| 235 |
text_input.submit(
|
| 236 |
fn=generate_speech,
|
| 237 |
-
inputs=[text_input, voice_select],
|
| 238 |
outputs=audio_output
|
| 239 |
)
|
| 240 |
|
| 241 |
clear_btn.click(
|
| 242 |
-
fn=lambda: ("", "alba", None),
|
| 243 |
-
outputs=[text_input, voice_select, audio_output]
|
| 244 |
)
|
| 245 |
|
| 246 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import numpy as np
|
| 3 |
from pocket_tts import TTSModel
|
| 4 |
|
| 5 |
# Load model once at startup
|
|
|
|
| 9 |
|
| 10 |
VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
|
| 11 |
|
| 12 |
+
def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
    """Synthesize speech for ``text`` with a built-in or an uploaded voice.

    Args:
        text: Text to speak. Empty or whitespace-only text yields no audio.
        voice_mode: ``"Kyutai Voices"`` selects the built-in voice named by
            ``voice_dropdown``; any other value uses ``voice_upload``.
        voice_dropdown: Name of the built-in voice to use.
        voice_upload: Path of the uploaded reference audio, or a falsy value
            when nothing has been uploaded.

    Returns:
        A ``(model.sample_rate, samples)`` tuple where ``samples`` is an
        int16 NumPy array, or ``None`` when there is nothing to synthesize
        or when generation fails (the error is printed, not raised).
    """
    # Whitespace-only input has nothing to speak; skip the model entirely.
    if not text or not text.strip():
        return None

    try:
        if voice_mode == "Kyutai Voices":
            voice_path = voice_dropdown
        elif voice_upload:
            voice_path = voice_upload
        else:
            # Cloning mode without a reference clip: nothing to condition on.
            return None

        print(f"Generating with voice: {voice_path}")
        voice_state = model.get_state_for_audio_prompt(voice_path)
        audio = model.generate_audio(voice_state, text)

        # Convert to 16-bit PCM to avoid Gradio warnings.
        # NOTE(review): assumes the model emits float samples nominally in
        # [-1, 1] -- confirm. Clipping first makes out-of-range samples
        # saturate instead of wrapping around when cast to int16.
        audio_np = audio.cpu().numpy()
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        return (model.sample_rate, audio_int16)
    except Exception as e:
        # UI boundary: report the failure and return "no audio" rather than
        # propagating the exception to the caller.
        print(f"Error generating speech: {e}")
        return None
|
| 36 |
|
| 37 |
# Load custom theme with fallback
|
| 38 |
try:
|
|
|
|
| 149 |
padding: 20px;
|
| 150 |
}
|
| 151 |
}
|
| 152 |
+
#voice-mode .wrap {
|
| 153 |
+
display: flex !important;
|
| 154 |
+
flex-direction: row !important;
|
| 155 |
+
width: 100% !important;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
#voice-mode .wrap label {
|
| 159 |
+
flex: 1 !important;
|
| 160 |
+
justify-content: center !important;
|
| 161 |
+
text-align: center !important;
|
| 162 |
+
}
|
| 163 |
"""
|
| 164 |
|
| 165 |
with gr.Blocks() as demo:
|
|
|
|
| 200 |
lines=8,
|
| 201 |
elem_id="text-input"
|
| 202 |
)
|
| 203 |
+
voice_mode = gr.Radio(
|
| 204 |
+
choices=["Kyutai Voices", "Voice Cloning"],
|
| 205 |
+
value="Kyutai Voices",
|
| 206 |
+
label="Voice Mode",
|
| 207 |
+
elem_id="voice-mode"
|
| 208 |
)
|
| 209 |
+
|
| 210 |
+
with gr.Column(visible=True) as standard_voice_col:
|
| 211 |
+
voice_select = gr.Dropdown(
|
| 212 |
+
choices=VOICES,
|
| 213 |
+
value="alba",
|
| 214 |
+
label="Select from Kyutai Voices",
|
| 215 |
+
elem_id="voice-select"
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
with gr.Column(visible=False) as cloning_voice_col:
|
| 219 |
+
voice_upload = gr.Audio(
|
| 220 |
+
label="Upload Voice for Cloning (WAV/MP3)",
|
| 221 |
+
type="filepath",
|
| 222 |
+
elem_id="voice-upload"
|
| 223 |
+
)
|
| 224 |
with gr.Row():
|
| 225 |
clear_btn = gr.Button("🗑️ Clear", variant="secondary")
|
| 226 |
generate_btn = gr.Button("⚡ Generate", variant="primary")
|
|
|
|
| 241 |
|
| 242 |
gr.Examples(
|
| 243 |
examples=[
|
| 244 |
+
["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None],
|
| 245 |
+
["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None],
|
| 246 |
+
["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None]
|
| 247 |
],
|
| 248 |
+
inputs=[text_input, voice_mode, voice_select, voice_upload],
|
| 249 |
)
|
| 250 |
|
| 251 |
gr.HTML("""
|
|
|
|
| 269 |
</div>
|
| 270 |
""")
|
| 271 |
|
| 272 |
+
# Visibility Toggling
|
| 273 |
+
def update_voice_ui(mode):
    """Return visibility updates for the two voice-selection columns.

    The first update applies to the built-in voice column and the second to
    the voice-cloning column; exactly one of them is made visible.
    """
    show_builtin = mode == "Kyutai Voices"
    return gr.update(visible=show_builtin), gr.update(visible=not show_builtin)
|
| 278 |
+
|
| 279 |
+
voice_mode.change(
|
| 280 |
+
fn=update_voice_ui,
|
| 281 |
+
inputs=[voice_mode],
|
| 282 |
+
outputs=[standard_voice_col, cloning_voice_col]
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
# Event handlers
|
| 286 |
generate_btn.click(
|
| 287 |
fn=generate_speech,
|
| 288 |
+
inputs=[text_input, voice_mode, voice_select, voice_upload],
|
| 289 |
outputs=audio_output
|
| 290 |
)
|
| 291 |
|
| 292 |
text_input.submit(
|
| 293 |
fn=generate_speech,
|
| 294 |
+
inputs=[text_input, voice_mode, voice_select, voice_upload],
|
| 295 |
outputs=audio_output
|
| 296 |
)
|
| 297 |
|
| 298 |
clear_btn.click(
|
| 299 |
+
fn=lambda: ("", "Kyutai Voices", "alba", None, None),
|
| 300 |
+
outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output]
|
| 301 |
)
|
| 302 |
|
| 303 |
if __name__ == "__main__":
|