notrito committed on
Commit
31ff1bd
·
1 Parent(s): e588eb9

translation

Browse files
Files changed (2) hide show
  1. app.py +120 -119
  2. f5-tts_tests.ipynb +297 -0
app.py CHANGED
@@ -14,7 +14,7 @@ from f5_tts.infer.utils_infer import preprocess_ref_audio_text, convert_char_to_
14
 
15
  # Configuración
16
  MODEL_NAME = "F5-TTS"
17
- SUPPORTED_LANGUAGES = ["es", "en"]
18
  MAX_AUDIO_SIZE = 10 * 1024 * 1024 # 10MB
19
 
20
  # Variables globales para el modelo (se cargan una vez)
@@ -23,27 +23,27 @@ vocoder = None
23
  model_loaded = False
24
 
25
  def load_models():
26
- """Cargar F5-TTS y vocoder (solo una vez al iniciar)"""
27
  global model, vocoder, model_loaded
28
 
29
  if model_loaded:
30
  return True
31
 
32
  try:
33
- print("⏳ Cargando F5-TTS y vocoder...")
34
  print("=" * 50)
35
 
36
- # Cargar vocoder primero
37
- print("📥 Cargando vocoder Vocos...")
38
  vocoder = load_vocoder(
39
  vocoder_name="vocos",
40
  is_local=False,
41
  device="cpu"
42
  )
43
- print("✅ Vocoder cargado correctamente")
44
 
45
- # Configuración del modelo (copiado del código oficial)
46
- print("\n📥 Cargando modelo F5-TTS v1 Base...")
47
 
48
  ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors"))
49
  model_cfg = dict(
@@ -55,76 +55,76 @@ def load_models():
55
  conv_layers=4
56
  )
57
 
58
- # Cargar modelo usando la misma función que el código oficial
59
  model = load_model(
60
  DiT,
61
  model_cfg,
62
  ckpt_path
63
  )
64
- print("✅ Modelo F5-TTS cargado correctamente")
65
 
66
  model_loaded = True
67
  print("\n" + "=" * 50)
68
- print("✅ Todos los modelos cargados exitosamente")
69
  return True
70
 
71
  except Exception as e:
72
- print(f"\n❌ ERROR CRÍTICO cargando modelos:")
73
- print(f" Tipo: {type(e).__name__}")
74
- print(f" Mensaje: {str(e)}")
75
  import traceback
76
- print("\nStack trace completo:")
77
  traceback.print_exc()
78
  print("=" * 50)
79
  return False
80
 
81
  def validate_audio(audio_file):
82
- """Validar archivo de audio"""
83
  if audio_file is None:
84
- return False, "Por favor, sube un archivo de audio"
85
 
86
  try:
87
  file_size = os.path.getsize(audio_file)
88
  if file_size > MAX_AUDIO_SIZE:
89
- return False, f"Archivo muy grande. Máximo 10MB"
90
- return True, "Audio válido"
91
  except Exception as e:
92
- return False, f"Error validando audio: {e}"
93
 
94
  def generate_voice(reference_audio, ref_text, gen_text, language):
95
- """Generar voz con F5-TTS"""
96
 
97
- # Validar entrada
98
  is_valid, msg = validate_audio(reference_audio)
99
  if not is_valid:
100
  return None, f"❌ {msg}", ""
101
 
102
  if not ref_text or not ref_text.strip():
103
- return None, "❌ Debes escribir la transcripción del audio de referencia", ""
104
 
105
  if not gen_text or not gen_text.strip():
106
- return None, "❌ Debes escribir el texto a generar", ""
107
 
108
- # Verificar que los modelos estén cargados
109
  if not model_loaded:
110
  success = load_models()
111
  if not success:
112
- return None, "❌ Error cargando modelos. Intenta recargar la página.", ""
113
 
114
  try:
115
  start_time = time.time()
116
 
117
- print(f"🎤 Generando audio...")
118
  print(f" Ref text: {ref_text[:50]}...")
119
  print(f" Gen text: {gen_text[:50]}...")
120
 
121
- # Preprocesar audio de referencia
122
  ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
123
  reference_audio,
124
  ref_text
125
  )
126
 
127
- # Procesar con F5-TTS (igual que el código oficial)
128
  final_wave, final_sample_rate, combined_spectrogram = infer_process(
129
  ref_audio=ref_audio_processed,
130
  ref_text=ref_text_processed,
@@ -136,73 +136,73 @@ def generate_voice(reference_audio, ref_text, gen_text, language):
136
  end_time = time.time()
137
  processing_time = end_time - start_time
138
 
139
- # result debería ser el audio generado
140
  output_path = "generated_audio.wav"
141
 
142
- success_msg = f"✅ Audio generado exitosamente"
143
- time_msg = f"⏱️ Tiempo: {processing_time:.2f}s"
144
 
145
  return (final_sample_rate, final_wave), success_msg, time_msg
146
 
147
  except Exception as e:
148
- print(f"❌ Error en generación: {e}")
149
  import traceback
150
  traceback.print_exc()
151
  return None, f"❌ Error: {str(e)}", ""
152
 
153
  def generate_voice_with_steps(reference_audio, ref_text, gen_text, language):
154
- """Generar voz capturando pasos intermedios del denoising"""
155
 
156
- # Validar entrada
157
  is_valid, msg = validate_audio(reference_audio)
158
  if not is_valid:
159
  return None, None, f"❌ {msg}"
160
 
161
  if not ref_text or not ref_text.strip():
162
- return None, None, "❌ Debes escribir la transcripción del audio de referencia"
163
 
164
  if not gen_text or not gen_text.strip():
165
- return None, None, "❌ Debes escribir el texto a generar"
166
 
167
- # Verificar que los modelos estén cargados
168
  if not model_loaded:
169
  success = load_models()
170
  if not success:
171
- return None, None, "❌ Error cargando modelos"
172
 
173
  try:
174
- print("🔬 Generando con captura de pasos intermedios...")
175
 
176
- # Preprocesar
177
  ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
178
  reference_audio,
179
  ref_text
180
  )
181
 
182
- # Cargar y procesar audio
183
  audio, sr = torchaudio.load(ref_audio_processed)
184
  if audio.shape[0] > 1:
185
  audio = torch.mean(audio, dim=0, keepdim=True)
186
 
187
- # Resamplear si es necesario
188
  if sr != 24000:
189
  resampler = torchaudio.transforms.Resample(sr, 24000)
190
  audio = resampler(audio)
191
 
192
  audio = audio.to("cpu")
193
 
194
- # Preparar texto
195
  text_list = [ref_text_processed + gen_text]
196
  final_text_list = convert_char_to_pinyin(text_list)
197
 
198
- # Calcular duración
199
  ref_audio_len = audio.shape[-1] // 256 # hop_length
200
  ref_text_len = len(ref_text_processed.encode("utf-8"))
201
  gen_text_len = len(gen_text.encode("utf-8"))
202
  duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)
203
 
204
- # Generar CON trajectory
205
- print("Llamando a model.sample() con captura de trajectory...")
206
  with torch.inference_mode():
207
  generated_mel, trajectory = model.sample(
208
  cond=audio,
@@ -213,41 +213,41 @@ def generate_voice_with_steps(reference_audio, ref_text, gen_text, language):
213
  sway_sampling_coef=-1.0,
214
  )
215
 
216
- print(f"Trajectory capturado - Shape: {trajectory.shape}")
217
 
218
- # Extraer pasos específicos para mostrar
219
  steps_to_extract = [0, 8, 16, 24, 32]
220
  step_audios = []
221
 
222
  for step_idx in steps_to_extract:
223
- print(f"Procesando paso {step_idx}/32...")
224
  mel_at_step = trajectory[step_idx]
225
 
226
- # Recortar parte de referencia y permutar
227
  mel_generated = mel_at_step[:, ref_audio_len:, :]
228
  mel_generated = mel_generated.permute(0, 2, 1)
229
 
230
- # Convertir a audio con vocoder
231
  audio_at_step = vocoder.decode(mel_generated)
232
  audio_np = audio_at_step.squeeze().cpu().numpy()
233
 
234
  step_audios.append((24000, audio_np))
235
 
236
- # El último paso es el audio final
237
  final_audio = step_audios[-1]
238
 
239
- print("✅ Generación con pasos completada")
240
 
241
- # Retornar: audio final, lista de pasos, mensaje
242
- return final_audio, step_audios, f"✅ Generado con captura de {len(steps_to_extract)} pasos intermedios"
243
 
244
  except Exception as e:
245
- print(f"❌ Error en generación con pasos: {e}")
246
  import traceback
247
  traceback.print_exc()
248
- return None, None, f"❌ Error: {str(e)}"
249
-
250
  # Crear interfaz Gradio
 
251
  def create_interface():
252
  with gr.Blocks(
253
  title="F5-TTS Voice Cloning",
@@ -255,52 +255,54 @@ def create_interface():
255
  ) as demo:
256
 
257
  gr.Markdown("# 🎤 F5-TTS Voice Cloning")
258
- gr.Markdown("Clona cualquier voz con solo 5-30 segundos de audio de referencia")
259
-
 
 
260
  with gr.Tabs():
261
  # Tab 1: Generación básica
262
- with gr.Tab("Generación Básica"):
263
  with gr.Row():
264
  with gr.Column(scale=1):
265
- gr.Markdown("## 📁 Entrada")
266
 
267
  reference_audio = gr.Audio(
268
- label="Audio de Referencia (5-30 segundos)",
269
  type="filepath",
270
  sources=["upload", "microphone"]
271
  )
272
 
273
  ref_text = gr.Textbox(
274
- label="Transcripción del Audio de Referencia",
275
- placeholder="Escribe exactamente lo que dice el audio de referencia...",
276
  lines=2,
277
- info="Importante: Debe coincidir con lo que dice el audio"
278
  )
279
 
280
  gen_text = gr.Textbox(
281
- label="Texto a Generar",
282
- placeholder="Escribe el texto que quieres que diga con la voz clonada...",
283
  lines=3
284
  )
285
 
286
  language = gr.Dropdown(
287
  choices=SUPPORTED_LANGUAGES,
288
- value="es",
289
- label="Idioma",
290
- info="Idioma del texto a generar"
291
  )
292
 
293
- generate_btn = gr.Button("🚀 Generar Voz", variant="primary", size="lg")
294
 
295
  with gr.Row():
296
- status_msg = gr.Textbox(label="Estado", interactive=False, show_label=False)
297
 
298
  with gr.Row():
299
- time_msg = gr.Textbox(label="Tiempo de Procesamiento", interactive=False)
300
 
301
  with gr.Row():
302
- output_audio = gr.Audio(label="🔊 Audio Generado", type="filepath")
303
-
304
  generate_btn.click(
305
  fn=generate_voice,
306
  inputs=[reference_audio, ref_text, gen_text, language],
@@ -308,53 +310,53 @@ def create_interface():
308
  )
309
 
310
  # Tab 2: Visualización del proceso de denoising
311
- with gr.Tab("Visualización del Denoising"):
312
  gr.Markdown("""
313
- ## 🔬 Visualización del Proceso de Denoising
314
 
315
- Esta sección te permite ver cómo el modelo transforma ruido puro en audio limpio paso a paso.
316
- El modelo F5-TTS usa 32 pasos de "denoising" para generar el audio final.
317
  """)
318
-
319
  with gr.Row():
320
  with gr.Column(scale=1):
321
- gr.Markdown("### Entrada")
322
 
323
  ref_audio_steps = gr.Audio(
324
- label="Audio de Referencia",
325
  type="filepath",
326
  sources=["upload", "microphone"]
327
  )
328
 
329
  ref_text_steps = gr.Textbox(
330
- label="Transcripción",
331
  lines=2
332
  )
333
 
334
  gen_text_steps = gr.Textbox(
335
- label="Texto a Generar",
336
  lines=3
337
  )
338
 
339
  language_steps = gr.Dropdown(
340
  choices=SUPPORTED_LANGUAGES,
341
  value="es",
342
- label="Idioma"
343
  )
344
 
345
  generate_steps_btn = gr.Button(
346
- "🔬 Generar con Captura de Pasos",
347
  variant="primary"
348
  )
349
 
350
  with gr.Row():
351
- status_steps = gr.Textbox(label="Estado", interactive=False)
352
 
353
  with gr.Row():
354
- gr.Markdown("### Audio Final")
355
- final_audio_output = gr.Audio(label="Resultado Final", type="numpy")
356
 
357
- gr.Markdown("### Pasos Intermedios del Denoising")
358
 
359
  with gr.Row():
360
  step_slider = gr.Slider(
@@ -362,17 +364,17 @@ def create_interface():
362
  maximum=4,
363
  value=4,
364
  step=1,
365
- label="Seleccionar Paso",
366
- info="0=Ruido inicial, 1=Paso 8, 2=Paso 16, 3=Paso 24, 4=Paso 32 (final)"
367
  )
368
 
369
  with gr.Row():
370
  step_audio = gr.Audio(
371
- label="Audio en el Paso Seleccionado",
372
  type="numpy"
373
  )
374
 
375
- # Estado oculto para guardar todos los pasos
376
  all_steps_state = gr.State(value=None)
377
 
378
  def update_step_audio(step_index, all_steps):
@@ -380,12 +382,12 @@ def create_interface():
380
  return None
381
  return all_steps[int(step_index)]
382
 
383
- # Generar y guardar pasos
384
  def process_with_steps(ref_audio, ref_text, gen_text, lang):
385
  final, steps, status = generate_voice_with_steps(
386
  ref_audio, ref_text, gen_text, lang
387
  )
388
- # Solo devolver 4 valores si steps existe
389
  if steps:
390
  return final, steps, steps[-1], status
391
  else:
@@ -402,43 +404,42 @@ def create_interface():
402
  inputs=[step_slider, all_steps_state],
403
  outputs=[step_audio]
404
  )
405
-
406
  gr.Markdown("""
407
- ### 📊 Explicación de los Pasos
408
 
409
- - **Paso 0 (Ruido)**: Ruido aleatorio puro - el punto de partida
410
- - **Paso 8**: Primeras estructuras emergen, muy distorsionado
411
- - **Paso 16**: Se distinguen patrones de habla, aún con artefactos
412
- - **Paso 24**: Audio casi limpio, algunas imperfecciones
413
- - **Paso 32 (Final)**: Audio completamente limpio y natural
414
 
415
- Este proceso se llama "diffusion" - el modelo aprende a "limpiar" ruido gradualmente.
416
  """)
417
-
418
  gr.Markdown("""
419
- ## 💡 Consejos para Mejores Resultados
420
 
421
- - **Audio limpio:** Sin ruido de fondo, música o eco
422
- - **Duración:** 5-30 segundos es ideal
423
- - **Transcripción exacta:** La transcripción debe coincidir exactamente con el audio
424
- - **Habla clara:** Volumen constante y pronunciación clara
425
- - **Idioma:** El audio de referencia y el texto pueden estar en idiomas diferentes
426
 
427
- ## 🔧 Información Técnica
428
 
429
- - **Modelo:** F5-TTS (Flow Matching Text-to-Speech)
430
  - **Vocoder:** Vocos
431
- - **Dispositivo:** CPU (puede tardar ~30-60 segundos)
432
  """)
433
 
434
  return demo
435
 
436
  if __name__ == "__main__":
437
- # Pre-cargar modelos al iniciar (opcional, mejora primera experiencia)
438
- print("🚀 Iniciando F5-TTS Voice Cloning App")
439
  print("=" * 50)
440
 
441
- # Comentar la siguiente línea si quieres carga bajo demanda
442
  # load_models()
443
 
444
  demo = create_interface()
 
14
 
15
  # Configuración
16
  MODEL_NAME = "F5-TTS"
17
+ SUPPORTED_LANGUAGES = ["en", "es"]
18
  MAX_AUDIO_SIZE = 10 * 1024 * 1024 # 10MB
19
 
20
  # Variables globales para el modelo (se cargan una vez)
 
23
  model_loaded = False
24
 
25
  def load_models():
26
+ """Load F5-TTS and vocoder (only once at startup)"""
27
  global model, vocoder, model_loaded
28
 
29
  if model_loaded:
30
  return True
31
 
32
  try:
33
+ print("⏳ Loading F5-TTS and vocoder...")
34
  print("=" * 50)
35
 
36
+ # Load vocoder first
37
+ print("🔥 Loading Vocos vocoder...")
38
  vocoder = load_vocoder(
39
  vocoder_name="vocos",
40
  is_local=False,
41
  device="cpu"
42
  )
43
+ print("✅ Vocoder loaded successfully")
44
 
45
+ # Model configuration (copied from official code)
46
+ print("\n🔥 Loading F5-TTS v1 Base model...")
47
 
48
  ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors"))
49
  model_cfg = dict(
 
55
  conv_layers=4
56
  )
57
 
58
+ # Load model using the same function as the official code
59
  model = load_model(
60
  DiT,
61
  model_cfg,
62
  ckpt_path
63
  )
64
+ print("✅ F5-TTS model loaded successfully")
65
 
66
  model_loaded = True
67
  print("\n" + "=" * 50)
68
+ print("✅ All models loaded successfully")
69
  return True
70
 
71
  except Exception as e:
72
+ print(f"\n❌ CRITICAL ERROR loading models:")
73
+ print(f" Type: {type(e).__name__}")
74
+ print(f" Message: {str(e)}")
75
  import traceback
76
+ print("\nFull stack trace:")
77
  traceback.print_exc()
78
  print("=" * 50)
79
  return False
80
 
81
  def validate_audio(audio_file):
82
+ """Validate audio file"""
83
  if audio_file is None:
84
+ return False, "Please upload an audio file"
85
 
86
  try:
87
  file_size = os.path.getsize(audio_file)
88
  if file_size > MAX_AUDIO_SIZE:
89
+ return False, f"File too large. Maximum 10MB"
90
+ return True, "Valid audio"
91
  except Exception as e:
92
+ return False, f"Error validating audio: {e}"
93
 
94
  def generate_voice(reference_audio, ref_text, gen_text, language):
95
+ """Generate voice with F5-TTS"""
96
 
97
+ # Validate input
98
  is_valid, msg = validate_audio(reference_audio)
99
  if not is_valid:
100
  return None, f"❌ {msg}", ""
101
 
102
  if not ref_text or not ref_text.strip():
103
+ return None, "❌ You must write the transcription of the reference audio", ""
104
 
105
  if not gen_text or not gen_text.strip():
106
+ return None, "❌ You must write the text to generate", ""
107
 
108
+ # Check that models are loaded
109
  if not model_loaded:
110
  success = load_models()
111
  if not success:
112
+ return None, "❌ Error loading models. Try reloading the page.", ""
113
 
114
  try:
115
  start_time = time.time()
116
 
117
+ print(f"🎤 Generating audio...")
118
  print(f" Ref text: {ref_text[:50]}...")
119
  print(f" Gen text: {gen_text[:50]}...")
120
 
121
+ # Preprocess reference audio
122
  ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
123
  reference_audio,
124
  ref_text
125
  )
126
 
127
+ # Process with F5-TTS (same as official code)
128
  final_wave, final_sample_rate, combined_spectrogram = infer_process(
129
  ref_audio=ref_audio_processed,
130
  ref_text=ref_text_processed,
 
136
  end_time = time.time()
137
  processing_time = end_time - start_time
138
 
139
+ # result should be the generated audio
140
  output_path = "generated_audio.wav"
141
 
142
+ success_msg = f"✅ Audio generated successfully"
143
+ time_msg = f"⏱️ Time: {processing_time:.2f}s"
144
 
145
  return (final_sample_rate, final_wave), success_msg, time_msg
146
 
147
  except Exception as e:
148
+ print(f"❌ Error in generation: {e}")
149
  import traceback
150
  traceback.print_exc()
151
  return None, f"❌ Error: {str(e)}", ""
152
 
153
  def generate_voice_with_steps(reference_audio, ref_text, gen_text, language):
154
+ """Generate voice capturing intermediate denoising steps"""
155
 
156
+ # Validate input
157
  is_valid, msg = validate_audio(reference_audio)
158
  if not is_valid:
159
  return None, None, f"❌ {msg}"
160
 
161
  if not ref_text or not ref_text.strip():
162
+ return None, None, "❌ You must write the transcription of the reference audio"
163
 
164
  if not gen_text or not gen_text.strip():
165
+ return None, None, "❌ You must write the text to generate"
166
 
167
+ # Check that models are loaded
168
  if not model_loaded:
169
  success = load_models()
170
  if not success:
171
+ return None, None, "❌ Error loading models"
172
 
173
  try:
174
+ print("🔬 Generating with intermediate step capture...")
175
 
176
+ # Preprocess
177
  ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
178
  reference_audio,
179
  ref_text
180
  )
181
 
182
+ # Load and process audio
183
  audio, sr = torchaudio.load(ref_audio_processed)
184
  if audio.shape[0] > 1:
185
  audio = torch.mean(audio, dim=0, keepdim=True)
186
 
187
+ # Resample if necessary
188
  if sr != 24000:
189
  resampler = torchaudio.transforms.Resample(sr, 24000)
190
  audio = resampler(audio)
191
 
192
  audio = audio.to("cpu")
193
 
194
+ # Prepare text
195
  text_list = [ref_text_processed + gen_text]
196
  final_text_list = convert_char_to_pinyin(text_list)
197
 
198
+ # Calculate duration
199
  ref_audio_len = audio.shape[-1] // 256 # hop_length
200
  ref_text_len = len(ref_text_processed.encode("utf-8"))
201
  gen_text_len = len(gen_text.encode("utf-8"))
202
  duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)
203
 
204
+ # Generate WITH trajectory
205
+ print("Calling model.sample() with trajectory capture...")
206
  with torch.inference_mode():
207
  generated_mel, trajectory = model.sample(
208
  cond=audio,
 
213
  sway_sampling_coef=-1.0,
214
  )
215
 
216
+ print(f"Trajectory captured - Shape: {trajectory.shape}")
217
 
218
+ # Extract specific steps to display
219
  steps_to_extract = [0, 8, 16, 24, 32]
220
  step_audios = []
221
 
222
  for step_idx in steps_to_extract:
223
+ print(f"Processing step {step_idx}/32...")
224
  mel_at_step = trajectory[step_idx]
225
 
226
+ # Crop reference part and permute
227
  mel_generated = mel_at_step[:, ref_audio_len:, :]
228
  mel_generated = mel_generated.permute(0, 2, 1)
229
 
230
+ # Convert to audio with vocoder
231
  audio_at_step = vocoder.decode(mel_generated)
232
  audio_np = audio_at_step.squeeze().cpu().numpy()
233
 
234
  step_audios.append((24000, audio_np))
235
 
236
+ # The last step is the final audio
237
  final_audio = step_audios[-1]
238
 
239
+ print("✅ Generation with steps completed")
240
 
241
+ # Return: final audio, list of steps, message
242
+ return final_audio, step_audios, f"✅ Generated with capture of {len(steps_to_extract)} intermediate steps"
243
 
244
  except Exception as e:
245
+ print(f"❌ Error in generation with steps: {e}")
246
  import traceback
247
  traceback.print_exc()
248
+ return None, None, f"❌ Error: {str(e)}"
 
249
  # Crear interfaz Gradio
250
+
251
  def create_interface():
252
  with gr.Blocks(
253
  title="F5-TTS Voice Cloning",
 
255
  ) as demo:
256
 
257
  gr.Markdown("# 🎤 F5-TTS Voice Cloning")
258
+ gr.Markdown("Clone any voice with just 5-30 seconds of reference audio")
259
+ gr.Markdown("Developed by Noel Triguero. Model by SWivid")
260
+ gr.Markdown("---")
261
+
262
  with gr.Tabs():
263
  # Tab 1: Generación básica
264
+ with gr.Tab("Basic Generation"):
265
  with gr.Row():
266
  with gr.Column(scale=1):
267
+ gr.Markdown("## 📁 Input")
268
 
269
  reference_audio = gr.Audio(
270
+ label="Reference Audio (5-30 segundos)",
271
  type="filepath",
272
  sources=["upload", "microphone"]
273
  )
274
 
275
  ref_text = gr.Textbox(
276
+ label="Reference Audio Transcription",
277
+ placeholder="Write exactly what the reference audio says...",
278
  lines=2,
279
+ info="Important: Must match what the audio says"
280
  )
281
 
282
  gen_text = gr.Textbox(
283
+ label="Text to Generate",
284
+ placeholder="Write the text you want to say with the cloned voice...",
285
  lines=3
286
  )
287
 
288
  language = gr.Dropdown(
289
  choices=SUPPORTED_LANGUAGES,
290
+ value="en",
291
+ label="Language",
292
+ info="Language of the text to generate"
293
  )
294
 
295
+ generate_btn = gr.Button("🚀 Generate Voice", variant="primary", size="lg")
296
 
297
  with gr.Row():
298
+ status_msg = gr.Textbox(label="Status", interactive=False, show_label=False)
299
 
300
  with gr.Row():
301
+ time_msg = gr.Textbox(label="Processing Time", interactive=False)
302
 
303
  with gr.Row():
304
+ output_audio = gr.Audio(label="🔊 Generated Audio", type="filepath")
305
+
306
  generate_btn.click(
307
  fn=generate_voice,
308
  inputs=[reference_audio, ref_text, gen_text, language],
 
310
  )
311
 
312
  # Tab 2: Visualización del proceso de denoising
313
+ with gr.Tab("Denoising Visualization"):
314
  gr.Markdown("""
315
+ ## 🔬 Denoising Process Visualization
316
 
317
+ This section lets you see how the model transforms pure noise into clean audio step by step.
318
+ The F5-TTS model uses 32 "denoising" steps to generate the final audio.
319
  """)
320
+
321
  with gr.Row():
322
  with gr.Column(scale=1):
323
+ gr.Markdown("### Input")
324
 
325
  ref_audio_steps = gr.Audio(
326
+ label="Reference Audio",
327
  type="filepath",
328
  sources=["upload", "microphone"]
329
  )
330
 
331
  ref_text_steps = gr.Textbox(
332
+ label="Transcription",
333
  lines=2
334
  )
335
 
336
  gen_text_steps = gr.Textbox(
337
+ label="Text to Generate",
338
  lines=3
339
  )
340
 
341
  language_steps = gr.Dropdown(
342
  choices=SUPPORTED_LANGUAGES,
343
  value="es",
344
+ label="Language"
345
  )
346
 
347
  generate_steps_btn = gr.Button(
348
+ "🔬 Generate with Step Capture",
349
  variant="primary"
350
  )
351
 
352
  with gr.Row():
353
+ status_steps = gr.Textbox(label="Status", interactive=False)
354
 
355
  with gr.Row():
356
+ gr.Markdown("### Final Audio ")
357
+ final_audio_output = gr.Audio(label="Final Result", type="numpy")
358
 
359
+ gr.Markdown("### Intermediate Denoising Steps")
360
 
361
  with gr.Row():
362
  step_slider = gr.Slider(
 
364
  maximum=4,
365
  value=4,
366
  step=1,
367
+ label="Select Step",
368
+ info="0=Initial noise, 1=Step 8, 2=Step 16, 3=Step 24, 4=Step 32 (final)"
369
  )
370
 
371
  with gr.Row():
372
  step_audio = gr.Audio(
373
+ label="Audio at Selected Step",
374
  type="numpy"
375
  )
376
 
377
+ # Hidden state to store all steps
378
  all_steps_state = gr.State(value=None)
379
 
380
  def update_step_audio(step_index, all_steps):
 
382
  return None
383
  return all_steps[int(step_index)]
384
 
385
+ # Generate with steps and store all steps in state
386
  def process_with_steps(ref_audio, ref_text, gen_text, lang):
387
  final, steps, status = generate_voice_with_steps(
388
  ref_audio, ref_text, gen_text, lang
389
  )
390
+ # Only return the last step audio for the slider
391
  if steps:
392
  return final, steps, steps[-1], status
393
  else:
 
404
  inputs=[step_slider, all_steps_state],
405
  outputs=[step_audio]
406
  )
407
+
408
  gr.Markdown("""
409
+ ### 📊 Step Explanation
410
 
411
+ - **Step 0 (Noise)**: Pure random noise - the starting point
412
+ - **Step 8**: First structures emerge, very distorted
413
+ - **Step 16**: Speech patterns distinguishable, still with artifacts
414
+ - **Step 24**: Almost clean audio, some imperfections
415
+ - **Step 32 (Final)**: Completely clean and natural audio
416
 
417
+ This process is called "diffusion" - the model learns to "clean" noise gradually.
418
  """)
 
419
  gr.Markdown("""
420
+ ## 💡 Tips for Better Results
421
 
422
+ - **Clean audio:** No background noise, music or echo
423
+ - **Duration:** 5-30 seconds is ideal
424
+ - **Exact transcription:** The transcription must match the audio exactly
425
+ - **Clear speech:** Constant volume and clear pronunciation
426
+ - **Language:** Reference audio and text can be in different languages
427
 
428
+ ## 🔧 Technical Information
429
 
430
+ - **Model:** F5-TTS (Flow Matching Text-to-Speech)
431
  - **Vocoder:** Vocos
432
+ - **Device:** CPU (may take ~30-60 seconds)
433
  """)
434
 
435
  return demo
436
 
437
  if __name__ == "__main__":
438
+ # Pre-load models at startup (optional, improves first experience)
439
+ print("🚀 Starting F5-TTS Voice Cloning App")
440
  print("=" * 50)
441
 
442
+ # Comment the following line if you want on-demand loading
443
  # load_models()
444
 
445
  demo = create_interface()
f5-tts_tests.ipynb CHANGED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "id": "3b5f11be",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "✅ Python: /mnt/c/Users/noel_/Desktop/TTS_HF/voice-clone-comparison/.venv/bin/python\n",
14
+ "✅ PyTorch: 2.8.0+cu128\n",
15
+ "✅ F5-TTS importado\n",
16
+ "\n",
17
+ "🔍 ¿Usando venv?: True\n"
18
+ ]
19
+ }
20
+ ],
21
+ "source": [
22
+ "import sys\n",
23
+ "import torch\n",
24
+ "import f5_tts\n",
25
+ "\n",
26
+ "print(f\"✅ Python: {sys.executable}\")\n",
27
+ "print(f\"✅ PyTorch: {torch.__version__}\")\n",
28
+ "print(f\"✅ F5-TTS importado\")\n",
29
+ "print(f\"\\n🔍 ¿Usando venv?: {'.venv' in sys.executable}\")"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 6,
35
+ "id": "fb178159",
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "name": "stdout",
40
+ "output_type": "stream",
41
+ "text": [
42
+ "🔍 Buscando módulos internos:\n",
43
+ "----------------------------------------\n",
44
+ "✅ f5_tts.infer.utils_infer\n",
45
+ " └─ Funciones: AudioSegment, CFM, ThreadPoolExecutor, Vocos, chunk_text\n",
46
+ "❌ f5_tts.model.model\n",
47
+ "✅ f5_tts.model.cfm\n",
48
+ " └─ Funciones: CFM, Callable, MelSpec, default, exists\n",
49
+ "❌ f5_tts.infer.infer_process\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "# Intentar encontrar clases/funciones usables\n",
55
+ "submodules_v2 = [\n",
56
+ " 'f5_tts.infer.utils_infer',\n",
57
+ " 'f5_tts.model.model',\n",
58
+ " 'f5_tts.model.cfm',\n",
59
+ " 'f5_tts.infer.infer_process',\n",
60
+ "]\n",
61
+ "\n",
62
+ "print(\"🔍 Buscando módulos internos:\")\n",
63
+ "print(\"-\" * 40)\n",
64
+ "\n",
65
+ "for module_name in submodules_v2:\n",
66
+ " try:\n",
67
+ " mod = importlib.import_module(module_name)\n",
68
+ " print(f\"✅ {module_name}\")\n",
69
+ " \n",
70
+ " # Ver qué tiene dentro\n",
71
+ " funcs = [x for x in dir(mod) if not x.startswith('_') and callable(getattr(mod, x))]\n",
72
+ " if funcs:\n",
73
+ " print(f\" └─ Funciones: {', '.join(funcs[:5])}\")\n",
74
+ " \n",
75
+ " except Exception as e:\n",
76
+ " print(f\"❌ {module_name}\")"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 8,
82
+ "id": "14e9bbd7",
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "🔍 Todos los elementos de utils_infer:\n",
90
+ "----------------------------------------\n",
91
+ "\n",
92
+ "📚 FUNCIONES (17):\n",
93
+ " • chunk_text\n",
94
+ " • convert_char_to_pinyin\n",
95
+ " • files\n",
96
+ " • get_tokenizer\n",
97
+ " • hf_hub_download\n",
98
+ " • infer_batch_process\n",
99
+ " • infer_process\n",
100
+ " • initialize_asr_pipeline\n",
101
+ " • load_checkpoint\n",
102
+ " • load_model\n",
103
+ " • load_vocoder\n",
104
+ " • pipeline\n",
105
+ " • preprocess_ref_audio_text\n",
106
+ " • remove_silence_edges\n",
107
+ " • remove_silence_for_generated_wav\n",
108
+ " • save_spectrogram\n",
109
+ " • transcribe\n",
110
+ "\n",
111
+ "🏗️ CLASES (4):\n",
112
+ " • AudioSegment\n",
113
+ " • CFM\n",
114
+ " • ThreadPoolExecutor\n",
115
+ " • Vocos\n",
116
+ "\n",
117
+ "🔧 VARIABLES (29):\n",
118
+ " • asr_pipe (NoneType)\n",
119
+ " • cfg_strength (float)\n",
120
+ " • cross_fade_duration (float)\n",
121
+ " • device (str)\n",
122
+ " • fix_duration (NoneType)\n",
123
+ " • hashlib (module)\n",
124
+ " • hop_length (int)\n",
125
+ " • matplotlib (module)\n",
126
+ " • mel_spec_type (str)\n",
127
+ " • n_fft (int)\n"
128
+ ]
129
+ }
130
+ ],
131
+ "source": [
132
+ "from f5_tts.infer import utils_infer\n",
133
+ "\n",
134
+ "print(\"🔍 Todos los elementos de utils_infer:\")\n",
135
+ "print(\"-\" * 40)\n",
136
+ "\n",
137
+ "# Ver TODOS los no-privados\n",
138
+ "all_items = [x for x in dir(utils_infer) if not x.startswith('_')]\n",
139
+ "\n",
140
+ "# Categorizar por tipo\n",
141
+ "functions = []\n",
142
+ "classes = []\n",
143
+ "variables = []\n",
144
+ "\n",
145
+ "for item_name in all_items:\n",
146
+ " item = getattr(utils_infer, item_name)\n",
147
+ " item_type = type(item).__name__\n",
148
+ " \n",
149
+ " if item_type == 'function':\n",
150
+ " functions.append(item_name)\n",
151
+ " elif item_type == 'type':\n",
152
+ " classes.append(item_name)\n",
153
+ " else:\n",
154
+ " variables.append(f\"{item_name} ({item_type})\")\n",
155
+ "\n",
156
+ "print(f\"\\n📚 FUNCIONES ({len(functions)}):\")\n",
157
+ "for f in functions:\n",
158
+ " print(f\" • {f}\")\n",
159
+ "\n",
160
+ "print(f\"\\n🏗️ CLASES ({len(classes)}):\")\n",
161
+ "for c in classes:\n",
162
+ " print(f\" • {c}\")\n",
163
+ "\n",
164
+ "print(f\"\\n🔧 VARIABLES ({len(variables)}):\")\n",
165
+ "for v in variables[:10]: # Solo primeras 10\n",
166
+ " print(f\" • {v}\")"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 9,
172
+ "id": "f93a74b4",
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "📖 Documentación de infer_process:\n",
180
+ "==================================================\n",
181
+ "Help on function infer_process in module f5_tts.infer.utils_infer:\n",
182
+ "\n",
183
+ "infer_process(ref_audio, ref_text, gen_text, model_obj, vocoder, mel_spec_type='vocos', show_info=<built-in function print>, progress=<module 'tqdm' from '/mnt/c/Users/noel_/Desktop/TTS_HF/voice-clone-comparison/.venv/lib/python3.12/site-packages/tqdm/__init__.py'>, target_rms=0.1, cross_fade_duration=0.15, nfe_step=32, cfg_strength=2.0, sway_sampling_coef=-1.0, speed=1.0, fix_duration=None, device='cuda')\n",
184
+ "\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "from f5_tts.infer.utils_infer import infer_process, load_model, load_vocoder\n",
190
+ "\n",
191
+ "print(\"📖 Documentación de infer_process:\")\n",
192
+ "print(\"=\" * 50)\n",
193
+ "help(infer_process)"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 10,
199
+ "id": "3c06230a",
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "name": "stdout",
204
+ "output_type": "stream",
205
+ "text": [
206
+ "\n",
207
+ "📖 Documentación de load_model:\n",
208
+ "==================================================\n",
209
+ "Help on function load_model in module f5_tts.infer.utils_infer:\n",
210
+ "\n",
211
+ "load_model(model_cls, model_cfg, ckpt_path, mel_spec_type='vocos', vocab_file='', ode_method='euler', use_ema=True, device='cuda')\n",
212
+ "\n"
213
+ ]
214
+ }
215
+ ],
216
+ "source": [
217
+ "print(\"\\n📖 Documentación de load_model:\")\n",
218
+ "print(\"=\" * 50)\n",
219
+ "help(load_model)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 11,
225
+ "id": "5dee84d6",
226
+ "metadata": {},
227
+ "outputs": [
228
+ {
229
+ "name": "stdout",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "\n",
233
+ "📖 Documentación de load_vocoder:\n",
234
+ "==================================================\n",
235
+ "Help on function load_vocoder in module f5_tts.infer.utils_infer:\n",
236
+ "\n",
237
+ "load_vocoder(vocoder_name='vocos', is_local=False, local_path='', device='cuda', hf_cache_dir=None)\n",
238
+ " # load vocoder\n",
239
+ "\n"
240
+ ]
241
+ }
242
+ ],
243
+ "source": [
244
+ "print(\"\\n📖 Documentación de load_vocoder:\")\n",
245
+ "print(\"=\" * 50)\n",
246
+ "help(load_vocoder)"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "fc39776b",
253
+ "metadata": {},
254
+ "outputs": [
255
+ {
256
+ "name": "stdout",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "\n",
260
+ "📖 Documentación de load_model:\n",
261
+ "==================================================\n",
262
+ "Help on function load_model in module f5_tts.infer.utils_infer:\n",
263
+ "\n",
264
+ "load_model(model_cls, model_cfg, ckpt_path, mel_spec_type='vocos', vocab_file='', ode_method='euler', use_ema=True, device='cuda')\n",
265
+ "\n"
266
+ ]
267
+ }
268
+ ],
269
+ "source": [
270
+ "print(\"\\n📖 Documentación de load_model:\")\n",
271
+ "print(\"=\" * 50)\n",
272
+ "help(load_model)"
273
+ ]
274
+ }
275
+ ],
276
+ "metadata": {
277
+ "kernelspec": {
278
+ "display_name": ".venv",
279
+ "language": "python",
280
+ "name": "python3"
281
+ },
282
+ "language_info": {
283
+ "codemirror_mode": {
284
+ "name": "ipython",
285
+ "version": 3
286
+ },
287
+ "file_extension": ".py",
288
+ "mimetype": "text/x-python",
289
+ "name": "python",
290
+ "nbconvert_exporter": "python",
291
+ "pygments_lexer": "ipython3",
292
+ "version": "3.12.3"
293
+ }
294
+ },
295
+ "nbformat": 4,
296
+ "nbformat_minor": 5
297
+ }