D3vShoaib commited on
Commit
ed16331
·
1 Parent(s): fa6c114

Optimized for CPU, inspired by hadadxyz/pocket-tts-hf-cpu-optimized

Browse files
Files changed (6) hide show
  1. .gitattributes +0 -35
  2. Dockerfile.txt +10 -0
  3. README.md +6 -6
  4. app.py +386 -50
  5. packages.txt +0 -1
  6. requirements.txt +0 -66
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+
6
+ FROM hadadrjt/pocket-tts:hf
7
+
8
+ WORKDIR /app
9
+
10
+ COPY app.py .
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Pocket TTS
3
  emoji: ⚡
 
 
4
  colorFrom: green
5
  colorTo: green
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
  pinned: true
10
- short_description: A 100M text-to-speech (TTS) by Kyutai-Labs
11
  ---
12
 
13
  # PocketTTS Gradio Interface
@@ -66,4 +66,4 @@ The app supports the following built-in voices:
66
 
67
  Built by [D3vShoaib](https://github.com/D3vShoaib).
68
 
69
- _Note: This is an independent demonstration of the PocketTTS model and is not officially affiliated with Kyutai Labs._
 
1
  ---
2
+ title: Pocket TTS (CPU)
3
  emoji: ⚡
4
+ short_description: A 100M-parameter text-to-speech (TTS) model by Kyutai-Labs
5
+ license: apache-2.0
6
  colorFrom: green
7
  colorTo: green
8
+ sdk: docker
9
+ app_port: 7860
 
10
  pinned: true
 
11
  ---
12
 
13
  # PocketTTS Gradio Interface
 
66
 
67
  Built by [D3vShoaib](https://github.com/D3vShoaib).
68
 
69
+ _Note: This is an independent demonstration of the PocketTTS model and is not officially affiliated with Kyutai Labs._
app.py CHANGED
@@ -1,56 +1,266 @@
1
  import gradio as gr
2
  import numpy as np
3
  import os
 
 
 
 
 
 
4
  from huggingface_hub import login
5
  from pocket_tts import TTSModel
6
 
 
 
 
 
7
  # HF Token for gated models in Spaces
8
  hf_token = os.getenv("HF_TOKEN")
9
  if hf_token:
10
  print("HF_TOKEN found, logging in...")
11
  login(token=hf_token)
12
 
13
- # Load model once at startup
14
- print("Loading PocketTTS model...")
15
- model = TTSModel.load_model()
16
- print("Model loaded.")
17
-
18
  VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
19
 
20
- import traceback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
23
- if not text:
24
- return None
25
-
26
  try:
27
- if voice_mode == "Kyutai Voices":
28
- voice_path = voice_dropdown
 
 
 
 
 
 
 
 
 
 
 
29
  else:
30
- if not voice_upload:
31
- raise gr.Error("Please upload an audio file for voice cloning.")
32
- voice_path = voice_upload
33
-
34
- print(f"Generating with voice: {voice_path}")
35
- try:
36
- voice_state = model.get_state_for_audio_prompt(voice_path)
37
- audio = model.generate_audio(voice_state, text)
38
- except Exception as e:
39
- full_error = traceback.format_exc()
40
- print(f"Error in model processing: {full_error}")
41
- raise gr.Error(f"Model error: {str(e)}")
42
-
43
- # Convert to 16-bit PCM to avoid Gradio warnings
44
- audio_np = audio.cpu().numpy()
45
- audio_int16 = (audio_np * 32767).astype(np.int16)
46
 
47
- return (model.sample_rate, audio_int16)
 
 
 
 
 
 
 
 
 
 
 
 
48
  except gr.Error:
49
  raise
50
  except Exception as e:
51
  full_error = traceback.format_exc()
52
  print(f"Unexpected error: {full_error}")
53
  raise gr.Error(f"An unexpected error occurred: {str(e)}")
 
 
 
 
54
 
55
 
56
  # Load custom theme with fallback
@@ -216,7 +426,8 @@ with gr.Blocks() as demo:
216
  text_input = gr.Textbox(
217
  label="Text to Speak",
218
  placeholder="Enter text here...",
219
- lines=8,
 
220
  elem_id="text-input"
221
  )
222
  voice_mode = gr.Radio(
@@ -240,9 +451,64 @@ with gr.Blocks() as demo:
240
  type="filepath",
241
  elem_id="voice-upload"
242
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  with gr.Row():
244
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
245
  generate_btn = gr.Button("⚡ Generate", variant="primary")
 
246
 
247
  with gr.Column(scale=1):
248
  audio_output = gr.Audio(
@@ -250,22 +516,19 @@ with gr.Blocks() as demo:
250
  autoplay=True,
251
  elem_id="audio-output"
252
  )
253
- gr.Markdown("""
254
- ### 🚀 Performance
255
- - **Latency**: ~200ms first chunk (local install)
256
- - **Speed**: 6x real-time
257
- - **Engine**: CPU Optimized
258
- - **Note**: Demo limited by Gradio hosting
259
- """)
260
-
261
- gr.Examples(
262
- examples=[
263
- ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None],
264
- ["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None],
265
- ["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None]
266
- ],
267
- inputs=[text_input, voice_mode, voice_select, voice_upload],
268
- )
269
 
270
  gr.HTML("""
271
  <div class="disclaimer">
@@ -301,22 +564,95 @@ with gr.Blocks() as demo:
301
  outputs=[standard_voice_col, cloning_voice_col]
302
  )
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  # Event handlers
305
  generate_btn.click(
 
 
 
306
  fn=generate_speech,
307
- inputs=[text_input, voice_mode, voice_select, voice_upload],
308
  outputs=audio_output
 
 
 
309
  )
310
 
311
  text_input.submit(
 
 
 
312
  fn=generate_speech,
313
- inputs=[text_input, voice_mode, voice_select, voice_upload],
314
  outputs=audio_output
 
 
 
 
 
 
 
 
 
315
  )
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  clear_btn.click(
318
- fn=lambda: ("", "Kyutai Voices", "alba", None, None),
319
- outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output]
 
 
 
 
 
 
 
 
 
 
 
 
320
  )
321
 
322
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import numpy as np
3
  import os
4
+ import time
5
+ import torch
6
+ import tempfile
7
+ import threading
8
+ import scipy.io.wavfile
9
+ import traceback
10
  from huggingface_hub import login
11
  from pocket_tts import TTSModel
12
 
13
+ # Configure PyTorch threading behavior for CPU optimization
14
+ torch.set_num_threads(1)
15
+ torch.set_num_interop_threads(1)
16
+
17
  # HF Token for gated models in Spaces
18
  hf_token = os.getenv("HF_TOKEN")
19
  if hf_token:
20
  print("HF_TOKEN found, logging in...")
21
  login(token=hf_token)
22
 
 
 
 
 
 
23
  VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
24
 
25
# Default configuration values for model loading and generation.
DEFAULT_VOICE = "alba"
DEFAULT_MODEL_VARIANT = "b6369a24"
DEFAULT_TEMPERATURE = 0.8
DEFAULT_LSD_DECODE_STEPS = 1
DEFAULT_EOS_THRESHOLD = -4.0
DEFAULT_NOISE_CLAMP = 0.0  # 0.0 means "disabled"; mapped to None when loading the model
DEFAULT_FRAMES_AFTER_EOS = 10
MAXIMUM_INPUT_LENGTH = 1000  # hard cap on input characters per request
TEMPORARY_FILE_LIFETIME_SECONDS = 7200  # 2 hours

# Serializes generation: only one synthesis may run at a time.
# Both flags below are read/written under this lock by generate_speech.
generation_state_lock = threading.Lock()
is_currently_generating = False
stop_generation_requested = False

# Maps generated temp WAV path -> creation timestamp; guarded by temporary_files_lock.
temporary_files_registry = {}
temporary_files_lock = threading.Lock()
42
+
43
+
44
class TextToSpeechManager:
    """
    Manages TTS model lifecycle and speech generation operations.

    Implements lazy loading (the model is only (re)loaded when the requested
    configuration differs from the cached one) and per-preset voice-state
    caching. Not thread-safe on its own; callers serialize requests via the
    module-level generation_state_lock.
    """

    def __init__(self):
        self.loaded_model = None          # cached TTSModel instance
        self.current_configuration = {}   # config dict the cached model was built with
        self.voice_state_cache = {}       # preset voice name -> prompt state

    def load_or_get_model(
        self,
        model_variant,
        temperature,
        lsd_decode_steps,
        noise_clamp,
        eos_threshold
    ):
        """Load a TTS model or return the cached instance if the configuration matches.

        Any argument passed as None falls back to the module default.
        Reloading invalidates the voice-state cache, since cached states are
        tied to the model instance that produced them.
        """
        processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
        processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
        processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS
        # A clamp of 0 (or None) means "disabled" and is passed through as None.
        # (Original converted noise_clamp to float twice; do it once.)
        clamp_value = float(noise_clamp) if noise_clamp is not None else 0.0
        processed_noise_clamp = clamp_value if clamp_value > 0 else None
        processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD

        requested_configuration = {
            "variant": processed_variant,
            "temp": processed_temperature,
            "lsd_decode_steps": processed_lsd_steps,
            "noise_clamp": processed_noise_clamp,
            "eos_threshold": processed_eos_threshold
        }

        if self.loaded_model is None or self.current_configuration != requested_configuration:
            print(f"Loading model with config: {requested_configuration}")
            self.loaded_model = TTSModel.load_model(**requested_configuration)
            self.current_configuration = requested_configuration
            self.voice_state_cache = {}  # states from the previous model are invalid
            print("Model loaded.")

        return self.loaded_model

    def get_voice_state_for_preset(self, voice_name):
        """Get or compute the prompt state for a preset voice, with caching.

        Unknown voice names silently fall back to DEFAULT_VOICE.
        """
        validated_voice = voice_name if voice_name in VOICES else DEFAULT_VOICE

        if validated_voice not in self.voice_state_cache:
            self.voice_state_cache[validated_voice] = self.loaded_model.get_state_for_audio_prompt(
                audio_conditioning=validated_voice,
                truncate=False
            )

        return self.voice_state_cache[validated_voice]

    def get_voice_state_for_clone(self, audio_file_path):
        """Compute a prompt state from an uploaded audio file (voice cloning).

        Not cached: uploads are transient, one-off files.
        """
        return self.loaded_model.get_state_for_audio_prompt(
            audio_conditioning=audio_file_path,
            truncate=False
        )

    def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
        """Generate speech audio from text using the given voice state.

        frames_after_eos is honored only when enable_custom_frames is truthy;
        otherwise None lets the model use its default post-EOS behavior.
        copy_state=True keeps the (possibly cached) voice state reusable.
        """
        processed_frames = int(frames_after_eos) if enable_custom_frames else None

        return self.loaded_model.generate_audio(
            model_state=voice_state,
            text_to_generate=text_content,
            frames_after_eos=processed_frames,
            copy_state=True
        )

    def save_audio_to_file(self, audio_tensor):
        """Save a generated audio tensor to a temporary WAV file.

        Returns the file path and registers it for delayed cleanup.
        """
        audio_numpy_data = audio_tensor.numpy()
        audio_sample_rate = self.loaded_model.sample_rate

        # Fix: the original kept the NamedTemporaryFile handle open forever
        # (fd leak) while writing to the same path, which also fails on
        # Windows where an open temp file cannot be reopened for writing.
        # Close the handle first, then write by path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
            output_path = output_file.name
        scipy.io.wavfile.write(output_path, audio_sample_rate, audio_numpy_data)

        with temporary_files_lock:
            temporary_files_registry[output_path] = time.time()

        return output_path
129
+
130
+
131
# Create the single, process-wide TTS manager instance.
tts_manager = TextToSpeechManager()

# Load the model eagerly at import time (with the default parameters) so the
# first user request does not pay the model-loading cost.
print("Loading PocketTTS model with default parameters...")
tts_manager.load_or_get_model(
    DEFAULT_MODEL_VARIANT,
    DEFAULT_TEMPERATURE,
    DEFAULT_LSD_DECODE_STEPS,
    DEFAULT_NOISE_CLAMP,
    DEFAULT_EOS_THRESHOLD
)
print("Model ready!")
144
+
145
def cleanup_expired_temporary_files():
    """Remove temporary files that have exceeded their lifetime.

    Registry entries are pruned while holding temporary_files_lock; the
    actual filesystem deletes happen outside the lock so slow disk I/O
    never blocks concurrent registrations. Best-effort: a file that cannot
    be removed now is still dropped from the registry (it lives in the OS
    temp directory and will be reclaimed eventually anyway).
    """
    current_timestamp = time.time()

    with temporary_files_lock:
        expired_files = [
            file_path
            for file_path, creation_timestamp in temporary_files_registry.items()
            if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS
        ]
        # Fix: the original deleted registry entries outside the lock,
        # racing with save_audio_to_file's registrations; prune here.
        for file_path in expired_files:
            del temporary_files_registry[file_path]

    for file_path in expired_files:
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
        except OSError:
            # Best effort only; never let cleanup break a generation request.
            pass
162
+
163
+
164
def validate_text_input(text_content):
    """Validate and normalize raw text input for speech generation.

    Returns (True, cleaned_text) when the input is usable, otherwise
    (False, message) where message is "" for empty/non-string input and an
    explanatory string when the length cap is exceeded.
    """
    if not isinstance(text_content, str):
        return False, ""

    cleaned_text = text_content.strip()
    if not cleaned_text:
        return False, ""

    if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
        return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."

    return True, cleaned_text
178
+
179
+
180
def request_generation_stop():
    """Flag the in-flight generation for cancellation.

    generate_speech polls stop_generation_requested between pipeline stages
    and bails out with None when it is set. Returns a Gradio update that
    disables the stop button so it cannot be clicked again.
    """
    global stop_generation_requested
    stop_generation_requested = True
    return gr.update(interactive=False)
185
+
186
+ # Speech generation function
187
def generate_speech(
    text,
    voice_mode,
    voice_dropdown,
    voice_upload,
    temperature,
    lsd_decode_steps,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """Run the complete text-to-speech workflow for a single request.

    Validates the input, (re)loads the model when parameters changed,
    resolves the voice prompt state, synthesizes audio, and writes it to a
    temporary WAV file whose path is returned. The module-level stop flag is
    polled between stages; when set, the function returns None early.
    Raises gr.Error for all user-facing problems.
    """
    global is_currently_generating, stop_generation_requested

    # Opportunistic housekeeping before doing any real work.
    cleanup_expired_temporary_files()

    text_ok, cleaned_or_message = validate_text_input(text)
    if not text_ok:
        # A non-empty second element carries a specific validation message.
        if cleaned_or_message:
            raise gr.Error(cleaned_or_message)
        raise gr.Error("Please enter valid text to generate speech.")

    if voice_mode == "Voice Cloning" and not voice_upload:
        raise gr.Error("Please upload an audio file for voice cloning.")

    # Admit at most one generation at a time.
    with generation_state_lock:
        if is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")
        is_currently_generating = True
        stop_generation_requested = False

    try:
        tts_manager.load_or_get_model(
            DEFAULT_MODEL_VARIANT,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )
        if stop_generation_requested:
            return None

        if voice_mode == "Voice Cloning":
            prompt_state = tts_manager.get_voice_state_for_clone(voice_upload)
        else:
            prompt_state = tts_manager.get_voice_state_for_preset(voice_dropdown)
        if stop_generation_requested:
            return None

        print(f"Generating with voice mode: {voice_mode}, temp: {temperature}, lsd_steps: {lsd_decode_steps}")

        synthesized_audio = tts_manager.generate_audio(
            cleaned_or_message,
            prompt_state,
            frames_after_eos,
            enable_custom_frames
        )
        if stop_generation_requested:
            return None

        return tts_manager.save_audio_to_file(synthesized_audio)
    except gr.Error:
        raise
    except Exception as e:
        full_error = traceback.format_exc()
        print(f"Unexpected error: {full_error}")
        raise gr.Error(f"An unexpected error occurred: {str(e)}")
    finally:
        # Always release the single-generation slot and reset the stop flag.
        with generation_state_lock:
            is_currently_generating = False
            stop_generation_requested = False
264
 
265
 
266
  # Load custom theme with fallback
 
426
  text_input = gr.Textbox(
427
  label="Text to Speak",
428
  placeholder="Enter text here...",
429
+ value="Hello! Welcome to Pocket TTS. This lightweight text to speech model runs entirely on your CPU. Try changing the voice or adjusting the generation parameters below.",
430
+ lines=9,
431
  elem_id="text-input"
432
  )
433
  voice_mode = gr.Radio(
 
451
  type="filepath",
452
  elem_id="voice-upload"
453
  )
454
+
455
+ # Generation Parameters Accordion
456
+ with gr.Accordion("⚙️ Generation Parameters", open=False):
457
+ with gr.Row():
458
+ temperature_slider = gr.Slider(
459
+ label="Temperature",
460
+ minimum=0.1,
461
+ maximum=2.0,
462
+ step=0.05,
463
+ value=DEFAULT_TEMPERATURE,
464
+ info="Higher values produce more expressive speech"
465
+ )
466
+ lsd_decode_steps_slider = gr.Slider(
467
+ label="LSD Decode Steps",
468
+ minimum=1,
469
+ maximum=20,
470
+ step=1,
471
+ value=DEFAULT_LSD_DECODE_STEPS,
472
+ info="More steps may improve quality but slower"
473
+ )
474
+
475
+ with gr.Row():
476
+ noise_clamp_slider = gr.Slider(
477
+ label="Noise Clamp",
478
+ minimum=0.0,
479
+ maximum=2.0,
480
+ step=0.05,
481
+ value=DEFAULT_NOISE_CLAMP,
482
+ info="Maximum noise sampling value (0 = disabled)"
483
+ )
484
+ eos_threshold_slider = gr.Slider(
485
+ label="End of Sequence Threshold",
486
+ minimum=-10.0,
487
+ maximum=0.0,
488
+ step=0.25,
489
+ value=DEFAULT_EOS_THRESHOLD,
490
+ info="Smaller values cause earlier completion"
491
+ )
492
+
493
+ with gr.Row():
494
+ enable_custom_frames_checkbox = gr.Checkbox(
495
+ label="Enable Custom Frames After EOS",
496
+ value=False,
497
+ info="Manually control post-EOS frame generation"
498
+ )
499
+ frames_after_eos_slider = gr.Slider(
500
+ label="Frames After EOS",
501
+ minimum=0,
502
+ maximum=100,
503
+ step=1,
504
+ value=DEFAULT_FRAMES_AFTER_EOS,
505
+ info="Additional frames after end-of-sequence (80ms per frame)"
506
+ )
507
+
508
  with gr.Row():
509
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
510
  generate_btn = gr.Button("⚡ Generate", variant="primary")
511
+ stop_btn = gr.Button("🔴 Stop", variant="stop", visible=False)
512
 
513
  with gr.Column(scale=1):
514
  audio_output = gr.Audio(
 
516
  autoplay=True,
517
  elem_id="audio-output"
518
  )
519
+ gr.Examples(
520
+ examples=[
521
+ ["On Tuesday, the seventeenth of October, two thousand twenty-five, at exactly six forty-five in the morning, the outdoor temperature dropped to twelve point eight degrees Celsius. The forecast predicts a high of twenty-two degrees by noon.", "alba"],
522
+ ["Welcome to Station Forty-Seven. Your train to Platform Nineteen B will arrive in approximately fifteen minutes. Please have your tickets ready for inspection.", "marius"],
523
+ ["You dare defy me? I have spent twenty long years hunting you across every shadow and every corner of this wretched kingdom. There is no escape. Justice will find you, and when it does, you will kneel before me and beg for mercy that will never come!", "javert"],
524
+ ["Flight Seven Ninety-Two to London Heathrow is now boarding at Gate Twenty-Three A. Final call for passengers Smith and Johnson. Departure is scheduled for fourteen thirty hours.", "jean"],
525
+ ["Our quarterly revenue reached four point seven million dollars, up eighteen percent from last year. The board meeting is scheduled for the twenty-fifth of November at two fifteen in the afternoon.", "fantine"],
526
+ ["The recipe calls for three hundred fifty grams of flour, two hundred milliliters of milk, and one point five teaspoons of vanilla extract. Bake at one hundred eighty degrees for forty-five minutes.", "cosette"],
527
+ ["Chapter Fourteen, Page Two Hundred Thirty-Seven. The mysterious traveler arrived at the inn precisely at midnight. He carried nothing but a worn leather satchel and spoke with an accent no one could place.", "eponine"],
528
+ ["Exercise routine: Run five kilometers in under thirty minutes. Complete three sets of fifteen push-ups. Rest for ninety seconds between each set. Cool down with ten minutes of stretching.", "azelma"]
529
+ ],
530
+ inputs=[text_input, voice_select],
531
+ )
 
 
 
532
 
533
  gr.HTML("""
534
  <div class="disclaimer">
 
564
  outputs=[standard_voice_col, cloning_voice_col]
565
  )
566
 
567
+ # Define generation inputs list
568
+ generation_inputs = [
569
+ text_input,
570
+ voice_mode,
571
+ voice_select,
572
+ voice_upload,
573
+ temperature_slider,
574
+ lsd_decode_steps_slider,
575
+ noise_clamp_slider,
576
+ eos_threshold_slider,
577
+ frames_after_eos_slider,
578
+ enable_custom_frames_checkbox
579
+ ]
580
+
581
+ # UI state management functions
582
+ def switch_to_generating_state():
583
+ return (
584
+ gr.update(visible=False), # Hide generate button
585
+ gr.update(visible=True, interactive=True) # Show stop button
586
+ )
587
+
588
+ def switch_to_idle_state():
589
+ return (
590
+ gr.update(visible=True), # Show generate button
591
+ gr.update(visible=False) # Hide stop button
592
+ )
593
+
594
  # Event handlers
595
  generate_btn.click(
596
+ fn=switch_to_generating_state,
597
+ outputs=[generate_btn, stop_btn]
598
+ ).then(
599
  fn=generate_speech,
600
+ inputs=generation_inputs,
601
  outputs=audio_output
602
+ ).then(
603
+ fn=switch_to_idle_state,
604
+ outputs=[generate_btn, stop_btn]
605
  )
606
 
607
  text_input.submit(
608
+ fn=switch_to_generating_state,
609
+ outputs=[generate_btn, stop_btn]
610
+ ).then(
611
  fn=generate_speech,
612
+ inputs=generation_inputs,
613
  outputs=audio_output
614
+ ).then(
615
+ fn=switch_to_idle_state,
616
+ outputs=[generate_btn, stop_btn]
617
+ )
618
+
619
+ # Stop button handler
620
+ stop_btn.click(
621
+ fn=request_generation_stop,
622
+ outputs=[stop_btn]
623
  )
624
 
625
+ # Clear button handler - also reset generation parameters
626
+ def perform_clear_action():
627
+ return (
628
+ "", # text_input
629
+ "Kyutai Voices", # voice_mode
630
+ "alba", # voice_select
631
+ None, # voice_upload
632
+ None, # audio_output
633
+ DEFAULT_TEMPERATURE, # temperature_slider
634
+ DEFAULT_LSD_DECODE_STEPS, # lsd_decode_steps_slider
635
+ DEFAULT_NOISE_CLAMP, # noise_clamp_slider
636
+ DEFAULT_EOS_THRESHOLD, # eos_threshold_slider
637
+ DEFAULT_FRAMES_AFTER_EOS, # frames_after_eos_slider
638
+ False # enable_custom_frames_checkbox
639
+ )
640
+
641
  clear_btn.click(
642
+ fn=perform_clear_action,
643
+ outputs=[
644
+ text_input,
645
+ voice_mode,
646
+ voice_select,
647
+ voice_upload,
648
+ audio_output,
649
+ temperature_slider,
650
+ lsd_decode_steps_slider,
651
+ noise_clamp_slider,
652
+ eos_threshold_slider,
653
+ frames_after_eos_slider,
654
+ enable_custom_frames_checkbox
655
+ ]
656
  )
657
 
658
  if __name__ == "__main__":
packages.txt DELETED
@@ -1 +0,0 @@
1
- ffmpeg
 
 
requirements.txt DELETED
@@ -1,66 +0,0 @@
1
- aiofiles==24.1.0
2
- annotated-doc==0.0.4
3
- annotated-types==0.7.0
4
- anyio==4.12.1
5
- beartype==0.22.9
6
- brotli==1.2.0
7
- certifi==2026.1.4
8
- charset-normalizer==3.4.4
9
- click==8.3.1
10
- colorama==0.4.6
11
- einops==0.8.1
12
- exceptiongroup==1.3.1
13
- fastapi==0.128.0
14
- ffmpy==1.0.0
15
- filelock==3.20.3
16
- fsspec==2026.1.0
17
- gradio==6.3.0
18
- gradio_client==2.0.3
19
- groovy==0.1.2
20
- h11==0.16.0
21
- hf-xet==1.2.0
22
- httpcore==1.0.9
23
- httpx==0.28.1
24
- huggingface_hub==1.3.1
25
- idna==3.11
26
- Jinja2==3.1.6
27
- markdown-it-py==4.0.0
28
- MarkupSafe==3.0.3
29
- mdurl==0.1.2
30
- mpmath==1.3.0
31
- networkx==3.4.2
32
- numpy==2.2.6
33
- orjson==3.11.5
34
- packaging==25.0
35
- pandas==2.3.3
36
- pillow==12.1.0
37
- pocket-tts==1.0.1
38
- pydantic==2.12.5
39
- pydantic_core==2.41.5
40
- pydub==0.25.1
41
- Pygments==2.19.2
42
- python-dateutil==2.9.0.post0
43
- python-multipart==0.0.21
44
- pytz==2025.2
45
- PyYAML==6.0.3
46
- requests==2.32.5
47
- rich==14.2.0
48
- safehttpx==0.1.7
49
- safetensors==0.7.0
50
- scipy==1.15.3
51
- semantic-version==2.10.0
52
- sentencepiece==0.2.1
53
- shellingham==1.5.4
54
- six==1.17.0
55
- starlette==0.50.0
56
- sympy==1.14.0
57
- tomlkit==0.13.3
58
- torch==2.9.1
59
- tqdm==4.67.1
60
- typer==0.21.1
61
- typer-slim==0.21.1
62
- typing-inspection==0.4.2
63
- typing_extensions==4.15.0
64
- tzdata==2025.3
65
- urllib3==2.6.3
66
- uvicorn==0.40.0