D3vShoaib committed on
Commit
da2a3d2
·
1 Parent(s): ba5e798

added voice-cloning

Browse files
Files changed (2) hide show
  1. .gitignore +18 -0
  2. app.py +74 -17
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environments
7
+ venv/
8
+ .venv/
9
+ env/
10
+ ENV/
11
+
12
+ # Environment variables
13
+ .env
14
+
15
+ # Distribution / packaging
16
+ dist/
17
+ build/
18
+ *.egg-info/
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from pocket_tts import TTSModel
3
 
4
  # Load model once at startup
@@ -8,13 +9,30 @@ print("Model loaded.")
8
 
9
  VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
10
 
11
- def generate_speech(text, voice):
12
  if not text:
13
  return None
14
 
15
- voice_state = model.get_state_for_audio_prompt(voice)
16
- audio = model.generate_audio(voice_state, text)
17
- return (model.sample_rate, audio.cpu().numpy())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Load custom theme with fallback
20
  try:
@@ -131,6 +149,17 @@ footer {visibility: hidden}
131
  padding: 20px;
132
  }
133
  }
 
 
 
 
 
 
 
 
 
 
 
134
  """
135
 
136
  with gr.Blocks() as demo:
@@ -171,12 +200,27 @@ with gr.Blocks() as demo:
171
  lines=8,
172
  elem_id="text-input"
173
  )
174
- voice_select = gr.Dropdown(
175
- choices=VOICES,
176
- value="alba",
177
- label="Select Voice",
178
- elem_id="voice-select"
179
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  with gr.Row():
181
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
182
  generate_btn = gr.Button("⚡ Generate", variant="primary")
@@ -197,11 +241,11 @@ with gr.Blocks() as demo:
197
 
198
  gr.Examples(
199
  examples=[
200
- ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "alba"],
201
- ["The quick brown fox jumps over the lazy dog.", "marius"],
202
- ["Would you like some tea? It's freshly brewed.", "javert"]
203
  ],
204
- inputs=[text_input, voice_select],
205
  )
206
 
207
  gr.HTML("""
@@ -225,22 +269,35 @@ with gr.Blocks() as demo:
225
  </div>
226
  """)
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  # Event handlers
229
  generate_btn.click(
230
  fn=generate_speech,
231
- inputs=[text_input, voice_select],
232
  outputs=audio_output
233
  )
234
 
235
  text_input.submit(
236
  fn=generate_speech,
237
- inputs=[text_input, voice_select],
238
  outputs=audio_output
239
  )
240
 
241
  clear_btn.click(
242
- fn=lambda: ("", "alba", None),
243
- outputs=[text_input, voice_select, audio_output]
244
  )
245
 
246
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ import numpy as np
3
  from pocket_tts import TTSModel
4
 
5
  # Load model once at startup
 
9
 
10
  VOICES = ['alba', 'marius', 'javert', 'jean', 'fantine', 'cosette', 'eponine', 'azelma']
11
 
12
def generate_speech(text, voice_mode, voice_dropdown, voice_upload):
    """Synthesize speech for ``text`` using a preset or an uploaded voice.

    Args:
        text: Text to speak. Empty/None text produces no audio.
        voice_mode: ``"Kyutai Voices"`` selects ``voice_dropdown``; any other
            value selects ``voice_upload``.
        voice_dropdown: Preset voice name passed to the model as the prompt.
        voice_upload: Uploaded reference audio passed to the model as the
            prompt; falsy when nothing has been uploaded.

    Returns:
        A ``(sample_rate, int16 samples)`` tuple, or ``None`` when there is
        no text, no uploaded voice in cloning mode, or generation fails.
    """
    if not text:
        return None

    if voice_mode == "Kyutai Voices":
        voice_path = voice_dropdown
    else:
        if not voice_upload:
            return None
        voice_path = voice_upload

    try:
        print(f"Generating with voice: {voice_path}")
        voice_state = model.get_state_for_audio_prompt(voice_path)
        audio = model.generate_audio(voice_state, text)

        # Convert to 16-bit PCM to avoid Gradio warnings.
        # Assumes the model emits float samples nominally in [-1, 1]
        # (implied by the 32767 scale factor) -- TODO confirm.
        audio_np = audio.cpu().numpy()
        # Clip before scaling: without it, samples slightly outside [-1, 1]
        # wrap around in the int16 cast instead of saturating.
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        return (model.sample_rate, audio_int16)
    except Exception as e:
        # UI boundary: report the failure on the console and return no audio.
        print(f"Error generating speech: {e}")
        return None
36
 
37
  # Load custom theme with fallback
38
  try:
 
149
  padding: 20px;
150
  }
151
  }
152
+ #voice-mode .wrap {
153
+ display: flex !important;
154
+ flex-direction: row !important;
155
+ width: 100% !important;
156
+ }
157
+
158
+ #voice-mode .wrap label {
159
+ flex: 1 !important;
160
+ justify-content: center !important;
161
+ text-align: center !important;
162
+ }
163
  """
164
 
165
  with gr.Blocks() as demo:
 
200
  lines=8,
201
  elem_id="text-input"
202
  )
203
+ voice_mode = gr.Radio(
204
+ choices=["Kyutai Voices", "Voice Cloning"],
205
+ value="Kyutai Voices",
206
+ label="Voice Mode",
207
+ elem_id="voice-mode"
208
  )
209
+
210
+ with gr.Column(visible=True) as standard_voice_col:
211
+ voice_select = gr.Dropdown(
212
+ choices=VOICES,
213
+ value="alba",
214
+ label="Select from Kyutai Voices",
215
+ elem_id="voice-select"
216
+ )
217
+
218
+ with gr.Column(visible=False) as cloning_voice_col:
219
+ voice_upload = gr.Audio(
220
+ label="Upload Voice for Cloning (WAV/MP3)",
221
+ type="filepath",
222
+ elem_id="voice-upload"
223
+ )
224
  with gr.Row():
225
  clear_btn = gr.Button("🗑️ Clear", variant="secondary")
226
  generate_btn = gr.Button("⚡ Generate", variant="primary")
 
241
 
242
  gr.Examples(
243
  examples=[
244
+ ["Hello! This is a test of the pocket-tts system. It's incredibly fast and runs right on your CPU.", "Kyutai Voices", "alba", None],
245
+ ["The quick brown fox jumps over the lazy dog.", "Kyutai Voices", "marius", None],
246
+ ["Would you like some tea? It's freshly brewed.", "Kyutai Voices", "javert", None]
247
  ],
248
+ inputs=[text_input, voice_mode, voice_select, voice_upload],
249
  )
250
 
251
  gr.HTML("""
 
269
  </div>
270
  """)
271
 
272
+ # Visibility Toggling
273
def update_voice_ui(mode):
    """Return visibility updates for the preset-voice and cloning columns.

    The first update targets the preset-voice column and the second the
    voice-cloning column; exactly one of the two is made visible.
    """
    show_presets = mode == "Kyutai Voices"
    return gr.update(visible=show_presets), gr.update(visible=not show_presets)
278
+
279
+ voice_mode.change(
280
+ fn=update_voice_ui,
281
+ inputs=[voice_mode],
282
+ outputs=[standard_voice_col, cloning_voice_col]
283
+ )
284
+
285
  # Event handlers
286
  generate_btn.click(
287
  fn=generate_speech,
288
+ inputs=[text_input, voice_mode, voice_select, voice_upload],
289
  outputs=audio_output
290
  )
291
 
292
  text_input.submit(
293
  fn=generate_speech,
294
+ inputs=[text_input, voice_mode, voice_select, voice_upload],
295
  outputs=audio_output
296
  )
297
 
298
  clear_btn.click(
299
+ fn=lambda: ("", "Kyutai Voices", "alba", None, None),
300
+ outputs=[text_input, voice_mode, voice_select, voice_upload, audio_output]
301
  )
302
 
303
  if __name__ == "__main__":