Spaces:

notrito
/

voice-clone-models-comparison

Build error

App Files Files Community

notrito committed on Oct 3, 2025

Commit

31ff1bd

1 Parent(s): e588eb9

translation

Browse files

Files changed (2) hide show

app.py +120 -119
f5-tts_tests.ipynb +297 -0

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from f5_tts.infer.utils_infer import preprocess_ref_audio_text, convert_char_to_
 # Configuración
 MODEL_NAME = "F5-TTS"
-SUPPORTED_LANGUAGES = ["es", "en"]
 MAX_AUDIO_SIZE = 10 * 1024 * 1024  # 10MB
 # Variables globales para el modelo (se cargan una vez)
@@ -23,27 +23,27 @@ vocoder = None
 model_loaded = False
 def load_models():
-    """Cargar F5-TTS y vocoder (solo una vez al iniciar)"""
     global model, vocoder, model_loaded
     if model_loaded:
         return True
     try:
-        print("⏳ Cargando F5-TTS y vocoder...")
         print("=" * 50)
-        # Cargar vocoder primero
-        print("📥 Cargando vocoder Vocos...")
         vocoder = load_vocoder(
             vocoder_name="vocos",
             is_local=False,
             device="cpu"
         )
-        print("✅ Vocoder cargado correctamente")
-        # Configuración del modelo (copiado del código oficial)
-        print("\n📥 Cargando modelo F5-TTS v1 Base...")
         ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors"))
         model_cfg = dict(
@@ -55,76 +55,76 @@ def load_models():
             conv_layers=4
         )
-        # Cargar modelo usando la misma función que el código oficial
         model = load_model(
             DiT,
             model_cfg,
             ckpt_path
         )
-        print("✅ Modelo F5-TTS cargado correctamente")
         model_loaded = True
         print("\n" + "=" * 50)
-        print("✅ Todos los modelos cargados exitosamente")
         return True
     except Exception as e:
-        print(f"\n❌ ERROR CRÍTICO cargando modelos:")
-        print(f"   Tipo: {type(e).__name__}")
-        print(f"   Mensaje: {str(e)}")
         import traceback
-        print("\nStack trace completo:")
         traceback.print_exc()
         print("=" * 50)
         return False
 def validate_audio(audio_file):
-    """Validar archivo de audio"""
     if audio_file is None:
-        return False, "Por favor, sube un archivo de audio"
     try:
         file_size = os.path.getsize(audio_file)
         if file_size > MAX_AUDIO_SIZE:
-            return False, f"Archivo muy grande. Máximo 10MB"
-        return True, "Audio válido"
     except Exception as e:
-        return False, f"Error validando audio: {e}"
 def generate_voice(reference_audio, ref_text, gen_text, language):
-    """Generar voz con F5-TTS"""
-    # Validar entrada
     is_valid, msg = validate_audio(reference_audio)
     if not is_valid:
         return None, f"❌ {msg}", ""
     if not ref_text or not ref_text.strip():
-        return None, "❌ Debes escribir la transcripción del audio de referencia", ""
     if not gen_text or not gen_text.strip():
-        return None, "❌ Debes escribir el texto a generar", ""
-    # Verificar que los modelos estén cargados
     if not model_loaded:
         success = load_models()
         if not success:
-            return None, "❌ Error cargando modelos. Intenta recargar la página.", ""
     try:
         start_time = time.time()
-        print(f"🎤 Generando audio...")
         print(f"   Ref text: {ref_text[:50]}...")
         print(f"   Gen text: {gen_text[:50]}...")
-        # Preprocesar audio de referencia
         ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
             reference_audio,
             ref_text
         )
-        # Procesar con F5-TTS (igual que el código oficial)
         final_wave, final_sample_rate, combined_spectrogram = infer_process(
             ref_audio=ref_audio_processed,
             ref_text=ref_text_processed,
@@ -136,73 +136,73 @@ def generate_voice(reference_audio, ref_text, gen_text, language):
         end_time = time.time()
         processing_time = end_time - start_time
-        # result debería ser el audio generado
         output_path = "generated_audio.wav"
-        success_msg = f"✅ Audio generado exitosamente"
-        time_msg = f"⏱️ Tiempo: {processing_time:.2f}s"
         return (final_sample_rate, final_wave), success_msg, time_msg
     except Exception as e:
-        print(f"❌ Error en generación: {e}")
         import traceback
         traceback.print_exc()
         return None, f"❌ Error: {str(e)}", ""
 def generate_voice_with_steps(reference_audio, ref_text, gen_text, language):
-    """Generar voz capturando pasos intermedios del denoising"""
-    # Validar entrada
     is_valid, msg = validate_audio(reference_audio)
     if not is_valid:
         return None, None, f"❌ {msg}"
     if not ref_text or not ref_text.strip():
-        return None, None, "❌ Debes escribir la transcripción del audio de referencia"
     if not gen_text or not gen_text.strip():
-        return None, None, "❌ Debes escribir el texto a generar"
-    # Verificar que los modelos estén cargados
     if not model_loaded:
         success = load_models()
         if not success:
-            return None, None, "❌ Error cargando modelos"
     try:
-        print("🔬 Generando con captura de pasos intermedios...")
-        # Preprocesar
         ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
             reference_audio,
             ref_text
         )
-        # Cargar y procesar audio
         audio, sr = torchaudio.load(ref_audio_processed)
         if audio.shape[0] > 1:
             audio = torch.mean(audio, dim=0, keepdim=True)
-        # Resamplear si es necesario
         if sr != 24000:
             resampler = torchaudio.transforms.Resample(sr, 24000)
             audio = resampler(audio)
         audio = audio.to("cpu")
-        # Preparar texto
         text_list = [ref_text_processed + gen_text]
         final_text_list = convert_char_to_pinyin(text_list)
-        # Calcular duración
         ref_audio_len = audio.shape[-1] // 256  # hop_length
         ref_text_len = len(ref_text_processed.encode("utf-8"))
         gen_text_len = len(gen_text.encode("utf-8"))
         duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)
-        # Generar CON trajectory
-        print("Llamando a model.sample() con captura de trajectory...")
         with torch.inference_mode():
             generated_mel, trajectory = model.sample(
                 cond=audio,
@@ -213,41 +213,41 @@ def generate_voice_with_steps(reference_audio, ref_text, gen_text, language):
                 sway_sampling_coef=-1.0,
             )
-        print(f"Trajectory capturado - Shape: {trajectory.shape}")
-        # Extraer pasos específicos para mostrar
         steps_to_extract = [0, 8, 16, 24, 32]
         step_audios = []
         for step_idx in steps_to_extract:
-            print(f"Procesando paso {step_idx}/32...")
             mel_at_step = trajectory[step_idx]
-            # Recortar parte de referencia y permutar
             mel_generated = mel_at_step[:, ref_audio_len:, :]
             mel_generated = mel_generated.permute(0, 2, 1)
-            # Convertir a audio con vocoder
             audio_at_step = vocoder.decode(mel_generated)
             audio_np = audio_at_step.squeeze().cpu().numpy()
             step_audios.append((24000, audio_np))
-        # El último paso es el audio final
         final_audio = step_audios[-1]
-        print("✅ Generación con pasos completada")
-        # Retornar: audio final, lista de pasos, mensaje
-        return final_audio, step_audios, f"✅ Generado con captura de {len(steps_to_extract)} pasos intermedios"
     except Exception as e:
-        print(f"❌ Error en generación con pasos: {e}")
         import traceback
         traceback.print_exc()
-        return None, None, f"❌ Error: {str(e)}"
 # Crear interfaz Gradio
 def create_interface():
     with gr.Blocks(
         title="F5-TTS Voice Cloning",
@@ -255,52 +255,54 @@ def create_interface():
     ) as demo:
         gr.Markdown("# 🎤 F5-TTS Voice Cloning")
-        gr.Markdown("Clona cualquier voz con solo 5-30 segundos de audio de referencia")
         with gr.Tabs():
             # Tab 1: Generación básica
-            with gr.Tab("Generación Básica"):
                 with gr.Row():
                     with gr.Column(scale=1):
-                        gr.Markdown("## 📁 Entrada")
                         reference_audio = gr.Audio(
-                            label="Audio de Referencia (5-30 segundos)",
                             type="filepath",
                             sources=["upload", "microphone"]
                         )
                         ref_text = gr.Textbox(
-                            label="Transcripción del Audio de Referencia",
-                            placeholder="Escribe exactamente lo que dice el audio de referencia...",
                             lines=2,
-                            info="Importante: Debe coincidir con lo que dice el audio"
                         )
                         gen_text = gr.Textbox(
-                            label="Texto a Generar",
-                            placeholder="Escribe el texto que quieres que diga con la voz clonada...",
                             lines=3
                         )
                         language = gr.Dropdown(
                             choices=SUPPORTED_LANGUAGES,
-                            value="es",
-                            label="Idioma",
-                            info="Idioma del texto a generar"
                         )
-                        generate_btn = gr.Button("🚀 Generar Voz", variant="primary", size="lg")
                 with gr.Row():
-                    status_msg = gr.Textbox(label="Estado", interactive=False, show_label=False)
                 with gr.Row():
-                    time_msg = gr.Textbox(label="Tiempo de Procesamiento", interactive=False)
                 with gr.Row():
-                    output_audio = gr.Audio(label="🔊 Audio Generado", type="filepath")
                 generate_btn.click(
                     fn=generate_voice,
                     inputs=[reference_audio, ref_text, gen_text, language],
@@ -308,53 +310,53 @@ def create_interface():
                 )
             # Tab 2: Visualización del proceso de denoising
-            with gr.Tab("Visualización del Denoising"):
                 gr.Markdown("""
-                ## 🔬 Visualización del Proceso de Denoising
-                Esta sección te permite ver cómo el modelo transforma ruido puro en audio limpio paso a paso.
-                El modelo F5-TTS usa 32 pasos de "denoising" para generar el audio final.
                 """)
                 with gr.Row():
                     with gr.Column(scale=1):
-                        gr.Markdown("### Entrada")
                         ref_audio_steps = gr.Audio(
-                            label="Audio de Referencia",
                             type="filepath",
                             sources=["upload", "microphone"]
                         )
                         ref_text_steps = gr.Textbox(
-                            label="Transcripción",
                             lines=2
                         )
                         gen_text_steps = gr.Textbox(
-                            label="Texto a Generar",
                             lines=3
                         )
                         language_steps = gr.Dropdown(
                             choices=SUPPORTED_LANGUAGES,
                             value="es",
-                            label="Idioma"
                         )
                         generate_steps_btn = gr.Button(
-                            "🔬 Generar con Captura de Pasos",
                             variant="primary"
                         )
                 with gr.Row():
-                    status_steps = gr.Textbox(label="Estado", interactive=False)
                 with gr.Row():
-                    gr.Markdown("### Audio Final")
-                    final_audio_output = gr.Audio(label="Resultado Final", type="numpy")
-                gr.Markdown("### Pasos Intermedios del Denoising")
                 with gr.Row():
                     step_slider = gr.Slider(
@@ -362,17 +364,17 @@ def create_interface():
                         maximum=4,
                         value=4,
                         step=1,
-                        label="Seleccionar Paso",
-                        info="0=Ruido inicial, 1=Paso 8, 2=Paso 16, 3=Paso 24, 4=Paso 32 (final)"
                     )
                 with gr.Row():
                     step_audio = gr.Audio(
-                        label="Audio en el Paso Seleccionado",
                         type="numpy"
                     )
-                # Estado oculto para guardar todos los pasos
                 all_steps_state = gr.State(value=None)
                 def update_step_audio(step_index, all_steps):
@@ -380,12 +382,12 @@ def create_interface():
                         return None
                     return all_steps[int(step_index)]
-                # Generar y guardar pasos
                 def process_with_steps(ref_audio, ref_text, gen_text, lang):
                     final, steps, status = generate_voice_with_steps(
                         ref_audio, ref_text, gen_text, lang
                     )
-                    # Solo devolver 4 valores si steps existe
                     if steps:
                         return final, steps, steps[-1], status
                     else:
@@ -402,43 +404,42 @@ def create_interface():
                     inputs=[step_slider, all_steps_state],
                     outputs=[step_audio]
                 )
                 gr.Markdown("""
-                ### 📊 Explicación de los Pasos
-                - **Paso 0 (Ruido)**: Ruido aleatorio puro - el punto de partida
-                - **Paso 8**: Primeras estructuras emergen, muy distorsionado
-                - **Paso 16**: Se distinguen patrones de habla, aún con artefactos
-                - **Paso 24**: Audio casi limpio, algunas imperfecciones
-                - **Paso 32 (Final)**: Audio completamente limpio y natural
-                Este proceso se llama "diffusion" - el modelo aprende a "limpiar" ruido gradualmente.
                 """)
         gr.Markdown("""
-        ## 💡 Consejos para Mejores Resultados
-        - **Audio limpio:** Sin ruido de fondo, música o eco
-        - **Duración:** 5-30 segundos es ideal
-        - **Transcripción exacta:** La transcripción debe coincidir exactamente con el audio
-        - **Habla clara:** Volumen constante y pronunciación clara
-        - **Idioma:** El audio de referencia y el texto pueden estar en idiomas diferentes
-        ## 🔧 Información Técnica
-        - **Modelo:** F5-TTS (Flow Matching Text-to-Speech)
         - **Vocoder:** Vocos
-        - **Dispositivo:** CPU (puede tardar ~30-60 segundos)
         """)
     return demo
 if __name__ == "__main__":
-    # Pre-cargar modelos al iniciar (opcional, mejora primera experiencia)
-    print("🚀 Iniciando F5-TTS Voice Cloning App")
     print("=" * 50)
-    # Comentar la siguiente línea si quieres carga bajo demanda
     # load_models()
     demo = create_interface()

 # Configuración
 MODEL_NAME = "F5-TTS"
+SUPPORTED_LANGUAGES = ["en", "es"]
 MAX_AUDIO_SIZE = 10 * 1024 * 1024  # 10MB
 # Variables globales para el modelo (se cargan una vez)
 model_loaded = False
 def load_models():
+    """Load F5-TTS and vocoder (only once at startup)"""
     global model, vocoder, model_loaded
     if model_loaded:
         return True
     try:
+        print("⏳ Loading F5-TTS and vocoder...")
         print("=" * 50)
+        # Load vocoder first
+        print("🔥 Loading Vocos vocoder...")
         vocoder = load_vocoder(
             vocoder_name="vocos",
             is_local=False,
             device="cpu"
         )
+        print("✅ Vocoder loaded successfully")
+        # Model configuration (copied from official code)
+        print("\n🔥 Loading F5-TTS v1 Base model...")
         ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors"))
         model_cfg = dict(
             conv_layers=4
         )
+        # Load model using the same function as the official code
         model = load_model(
             DiT,
             model_cfg,
             ckpt_path
         )
+        print("✅ F5-TTS model loaded successfully")
         model_loaded = True
         print("\n" + "=" * 50)
+        print("✅ All models loaded successfully")
         return True
     except Exception as e:
+        print(f"\n❌ CRITICAL ERROR loading models:")
+        print(f"   Type: {type(e).__name__}")
+        print(f"   Message: {str(e)}")
         import traceback
+        print("\nFull stack trace:")
         traceback.print_exc()
         print("=" * 50)
         return False
 def validate_audio(audio_file):
+    """Validate audio file"""
     if audio_file is None:
+        return False, "Please upload an audio file"
     try:
         file_size = os.path.getsize(audio_file)
         if file_size > MAX_AUDIO_SIZE:
+            return False, f"File too large. Maximum 10MB"
+        return True, "Valid audio"
     except Exception as e:
+        return False, f"Error validating audio: {e}"
 def generate_voice(reference_audio, ref_text, gen_text, language):
+    """Generate voice with F5-TTS"""
+    # Validate input
     is_valid, msg = validate_audio(reference_audio)
     if not is_valid:
         return None, f"❌ {msg}", ""
     if not ref_text or not ref_text.strip():
+        return None, "❌ You must write the transcription of the reference audio", ""
     if not gen_text or not gen_text.strip():
+        return None, "❌ You must write the text to generate", ""
+    # Check that models are loaded
     if not model_loaded:
         success = load_models()
         if not success:
+            return None, "❌ Error loading models. Try reloading the page.", ""
     try:
         start_time = time.time()
+        print(f"🎤 Generating audio...")
         print(f"   Ref text: {ref_text[:50]}...")
         print(f"   Gen text: {gen_text[:50]}...")
+        # Preprocess reference audio
         ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
             reference_audio,
             ref_text
         )
+        # Process with F5-TTS (same as official code)
         final_wave, final_sample_rate, combined_spectrogram = infer_process(
             ref_audio=ref_audio_processed,
             ref_text=ref_text_processed,
         end_time = time.time()
         processing_time = end_time - start_time
+        # result should be the generated audio
         output_path = "generated_audio.wav"
+        success_msg = f"✅ Audio generated successfully"
+        time_msg = f"⏱️ Time: {processing_time:.2f}s"
         return (final_sample_rate, final_wave), success_msg, time_msg
     except Exception as e:
+        print(f"❌ Error in generation: {e}")
         import traceback
         traceback.print_exc()
         return None, f"❌ Error: {str(e)}", ""
 def generate_voice_with_steps(reference_audio, ref_text, gen_text, language):
+    """Generate voice capturing intermediate denoising steps"""
+    # Validate input
     is_valid, msg = validate_audio(reference_audio)
     if not is_valid:
         return None, None, f"❌ {msg}"
     if not ref_text or not ref_text.strip():
+        return None, None, "❌ You must write the transcription of the reference audio"
     if not gen_text or not gen_text.strip():
+        return None, None, "❌ You must write the text to generate"
+    # Check that models are loaded
     if not model_loaded:
         success = load_models()
         if not success:
+            return None, None, "❌ Error loading models"
     try:
+        print("🔬 Generating with intermediate step capture...")
+        # Preprocess
         ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
             reference_audio,
             ref_text
         )
+        # Load and process audio
         audio, sr = torchaudio.load(ref_audio_processed)
         if audio.shape[0] > 1:
             audio = torch.mean(audio, dim=0, keepdim=True)
+        # Resample if necessary
         if sr != 24000:
             resampler = torchaudio.transforms.Resample(sr, 24000)
             audio = resampler(audio)
         audio = audio.to("cpu")
+        # Prepare text
         text_list = [ref_text_processed + gen_text]
         final_text_list = convert_char_to_pinyin(text_list)
+        # Calculate duration
         ref_audio_len = audio.shape[-1] // 256  # hop_length
         ref_text_len = len(ref_text_processed.encode("utf-8"))
         gen_text_len = len(gen_text.encode("utf-8"))
         duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)
+        # Generate WITH trajectory
+        print("Calling model.sample() with trajectory capture...")
         with torch.inference_mode():
             generated_mel, trajectory = model.sample(
                 cond=audio,
                 sway_sampling_coef=-1.0,
             )
+        print(f"Trajectory captured - Shape: {trajectory.shape}")
+        # Extract specific steps to display
         steps_to_extract = [0, 8, 16, 24, 32]
         step_audios = []
         for step_idx in steps_to_extract:
+            print(f"Processing step {step_idx}/32...")
             mel_at_step = trajectory[step_idx]
+            # Crop reference part and permute
             mel_generated = mel_at_step[:, ref_audio_len:, :]
             mel_generated = mel_generated.permute(0, 2, 1)
+            # Convert to audio with vocoder
             audio_at_step = vocoder.decode(mel_generated)
             audio_np = audio_at_step.squeeze().cpu().numpy()
             step_audios.append((24000, audio_np))
+        # The last step is the final audio
         final_audio = step_audios[-1]
+        print("✅ Generation with steps completed")
+        # Return: final audio, list of steps, message
+        return final_audio, step_audios, f"✅ Generated with capture of {len(steps_to_extract)} intermediate steps"
     except Exception as e:
+        print(f"❌ Error in generation with steps: {e}")
         import traceback
         traceback.print_exc()
+        return None, None, f"❌ Error: {str(e)}"
 # Crear interfaz Gradio
 def create_interface():
     with gr.Blocks(
         title="F5-TTS Voice Cloning",
     ) as demo:
         gr.Markdown("# 🎤 F5-TTS Voice Cloning")
+        gr.Markdown("Clone any voice with just 5-30 seconds of reference audio")
+        gr.Markdown("Developed by Noel Triguero. Model by SWivid")
+        gr.Markdown("---")
         with gr.Tabs():
             # Tab 1: Generación básica
+            with gr.Tab("Basic Generation"):
                 with gr.Row():
                     with gr.Column(scale=1):
+                        gr.Markdown("## 📁 Input")
                         reference_audio = gr.Audio(
+                            label="Reference Audio (5-30 segundos)",
                             type="filepath",
                             sources=["upload", "microphone"]
                         )
                         ref_text = gr.Textbox(
+                            label="Reference Audio Transcription",
+                            placeholder="Write exactly what the reference audio says...",
                             lines=2,
+                            info="Important: Must match what the audio says"
                         )
                         gen_text = gr.Textbox(
+                            label="Text to Generate",
+                            placeholder="Write the text you want to say with the cloned voice...",
                             lines=3
                         )
                         language = gr.Dropdown(
                             choices=SUPPORTED_LANGUAGES,
+                            value="en",
+                            label="Language",
+                            info="Language of the text to generate"
                         )
+                        generate_btn = gr.Button("🚀 Generate Voice", variant="primary", size="lg")
                 with gr.Row():
+                    status_msg = gr.Textbox(label="Status", interactive=False, show_label=False)
                 with gr.Row():
+                    time_msg = gr.Textbox(label="Processing Time", interactive=False)
                 with gr.Row():
+                    output_audio = gr.Audio(label="🔊 Generated Audio", type="filepath")
                 generate_btn.click(
                     fn=generate_voice,
                     inputs=[reference_audio, ref_text, gen_text, language],
                 )
             # Tab 2: Visualización del proceso de denoising
+            with gr.Tab("Denoising Visualization"):
                 gr.Markdown("""
+                ## 🔬 Denoising Process Visualization
+                This section lets you see how the model transforms pure noise into clean audio step by step.
+                The F5-TTS model uses 32 "denoising" steps to generate the final audio.
                 """)
                 with gr.Row():
                     with gr.Column(scale=1):
+                        gr.Markdown("### Input")
                         ref_audio_steps = gr.Audio(
+                            label="Reference Audio",
                             type="filepath",
                             sources=["upload", "microphone"]
                         )
                         ref_text_steps = gr.Textbox(
+                            label="Transcription",
                             lines=2
                         )
                         gen_text_steps = gr.Textbox(
+                            label="Text to Generate",
                             lines=3
                         )
                         language_steps = gr.Dropdown(
                             choices=SUPPORTED_LANGUAGES,
                             value="es",
+                            label="Language"
                         )
                         generate_steps_btn = gr.Button(
+                            "🔬 Generate with Step Capture",
                             variant="primary"
                         )
                 with gr.Row():
+                    status_steps = gr.Textbox(label="Status", interactive=False)
                 with gr.Row():
+                    gr.Markdown("### Final Audio ")
+                    final_audio_output = gr.Audio(label="Final Result", type="numpy")
+                gr.Markdown("### Intermediate Denoising Steps")
                 with gr.Row():
                     step_slider = gr.Slider(
                         maximum=4,
                         value=4,
                         step=1,
+                        label="Select Step",
+                        info="0=Initial noise, 1=Step 8, 2=Step 16, 3=Step 24, 4=Step 32 (final)"
                     )
                 with gr.Row():
                     step_audio = gr.Audio(
+                        label="Audio at Selected Step",
                         type="numpy"
                     )
+                # Hidden state to store all steps
                 all_steps_state = gr.State(value=None)
                 def update_step_audio(step_index, all_steps):
                         return None
                     return all_steps[int(step_index)]
+                # Generate with steps and store all steps in state
                 def process_with_steps(ref_audio, ref_text, gen_text, lang):
                     final, steps, status = generate_voice_with_steps(
                         ref_audio, ref_text, gen_text, lang
                     )
+                    # Only return the last step audio for the slider
                     if steps:
                         return final, steps, steps[-1], status
                     else:
                     inputs=[step_slider, all_steps_state],
                     outputs=[step_audio]
                 )
                 gr.Markdown("""
+                ### 📊 Step Explanation
+                - **Step 0 (Noise)**: Pure random noise - the starting point
+                - **Step 8**: First structures emerge, very distorted
+                - **Step 16**: Speech patterns distinguishable, still with artifacts
+                - **Step 24**: Almost clean audio, some imperfections
+                - **Step 32 (Final)**: Completely clean and natural audio
+                This process is called "diffusion" - the model learns to "clean" noise gradually.
                 """)
         gr.Markdown("""
+        ## 💡 Tips for Better Results
+        - **Clean audio:** No background noise, music or echo
+        - **Duration:** 5-30 seconds is ideal
+        - **Exact transcription:** The transcription must match the audio exactly
+        - **Clear speech:** Constant volume and clear pronunciation
+        - **Language:** Reference audio and text can be in different languages
+        ## 🔧 Technical Information
+        - **Model:** F5-TTS (Flow Matching Text-to-Speech)
         - **Vocoder:** Vocos
+        - **Device:** CPU (may take ~30-60 seconds)
         """)
     return demo
 if __name__ == "__main__":
+    # Pre-load models at startup (optional, improves first experience)
+    print("🚀 Starting F5-TTS Voice Cloning App")
     print("=" * 50)
+    # Comment the following line if you want on-demand loading
     # load_models()
     demo = create_interface()

f5-tts_tests.ipynb CHANGED Viewed

	@@ -0,0 +1,297 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3b5f11be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ Python: /mnt/c/Users/noel_/Desktop/TTS_HF/voice-clone-comparison/.venv/bin/python\n",
+      "✅ PyTorch: 2.8.0+cu128\n",
+      "✅ F5-TTS importado\n",
+      "\n",
+      "🔍 ¿Usando venv?: True\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import torch\n",
+    "import f5_tts\n",
+    "\n",
+    "print(f\"✅ Python: {sys.executable}\")\n",
+    "print(f\"✅ PyTorch: {torch.__version__}\")\n",
+    "print(f\"✅ F5-TTS importado\")\n",
+    "print(f\"\\n🔍 ¿Usando venv?: {'.venv' in sys.executable}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "fb178159",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🔍 Buscando módulos internos:\n",
+      "----------------------------------------\n",
+      "✅ f5_tts.infer.utils_infer\n",
+      "   └─ Funciones: AudioSegment, CFM, ThreadPoolExecutor, Vocos, chunk_text\n",
+      "❌ f5_tts.model.model\n",
+      "✅ f5_tts.model.cfm\n",
+      "   └─ Funciones: CFM, Callable, MelSpec, default, exists\n",
+      "❌ f5_tts.infer.infer_process\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Intentar encontrar clases/funciones usables\n",
+    "submodules_v2 = [\n",
+    "    'f5_tts.infer.utils_infer',\n",
+    "    'f5_tts.model.model',\n",
+    "    'f5_tts.model.cfm',\n",
+    "    'f5_tts.infer.infer_process',\n",
+    "]\n",
+    "\n",
+    "print(\"🔍 Buscando módulos internos:\")\n",
+    "print(\"-\" * 40)\n",
+    "\n",
+    "for module_name in submodules_v2:\n",
+    "    try:\n",
+    "        mod = importlib.import_module(module_name)\n",
+    "        print(f\"✅ {module_name}\")\n",
+    "        \n",
+    "        # Ver qué tiene dentro\n",
+    "        funcs = [x for x in dir(mod) if not x.startswith('_') and callable(getattr(mod, x))]\n",
+    "        if funcs:\n",
+    "            print(f\"   └─ Funciones: {', '.join(funcs[:5])}\")\n",
+    "            \n",
+    "    except Exception as e:\n",
+    "        print(f\"❌ {module_name}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "14e9bbd7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "🔍 Todos los elementos de utils_infer:\n",
+      "----------------------------------------\n",
+      "\n",
+      "📚 FUNCIONES (17):\n",
+      "   • chunk_text\n",
+      "   • convert_char_to_pinyin\n",
+      "   • files\n",
+      "   • get_tokenizer\n",
+      "   • hf_hub_download\n",
+      "   • infer_batch_process\n",
+      "   • infer_process\n",
+      "   • initialize_asr_pipeline\n",
+      "   • load_checkpoint\n",
+      "   • load_model\n",
+      "   • load_vocoder\n",
+      "   • pipeline\n",
+      "   • preprocess_ref_audio_text\n",
+      "   • remove_silence_edges\n",
+      "   • remove_silence_for_generated_wav\n",
+      "   • save_spectrogram\n",
+      "   • transcribe\n",
+      "\n",
+      "🏗️ CLASES (4):\n",
+      "   • AudioSegment\n",
+      "   • CFM\n",
+      "   • ThreadPoolExecutor\n",
+      "   • Vocos\n",
+      "\n",
+      "🔧 VARIABLES (29):\n",
+      "   • asr_pipe (NoneType)\n",
+      "   • cfg_strength (float)\n",
+      "   • cross_fade_duration (float)\n",
+      "   • device (str)\n",
+      "   • fix_duration (NoneType)\n",
+      "   • hashlib (module)\n",
+      "   • hop_length (int)\n",
+      "   • matplotlib (module)\n",
+      "   • mel_spec_type (str)\n",
+      "   • n_fft (int)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from f5_tts.infer import utils_infer\n",
+    "\n",
+    "print(\"🔍 Todos los elementos de utils_infer:\")\n",
+    "print(\"-\" * 40)\n",
+    "\n",
+    "# Collect every public (non-underscore-prefixed) name\n",
+    "all_items = [x for x in dir(utils_infer) if not x.startswith('_')]\n",
+    "\n",
+    "# Bucket each name by the type of the object it refers to\n",
+    "functions = []\n",
+    "classes = []\n",
+    "variables = []\n",
+    "\n",
+    "for item_name in all_items:\n",
+    "    item = getattr(utils_infer, item_name)\n",
+    "    item_type = type(item).__name__\n",
+    "    \n",
+    "    if item_type == 'function':\n",
+    "        functions.append(item_name)\n",
+    "    elif item_type == 'type':\n",
+    "        classes.append(item_name)\n",
+    "    else:\n",
+    "        variables.append(f\"{item_name} ({item_type})\")\n",
+    "\n",
+    "print(f\"\\n📚 FUNCIONES ({len(functions)}):\")\n",
+    "for f in functions:\n",
+    "    print(f\"   • {f}\")\n",
+    "\n",
+    "print(f\"\\n🏗️ CLASES ({len(classes)}):\")\n",
+    "for c in classes:\n",
+    "    print(f\"   • {c}\")\n",
+    "\n",
+    "print(f\"\\n🔧 VARIABLES ({len(variables)}):\")\n",
+    "for v in variables[:10]:  # Only the first 10\n",
+    "    print(f\"   • {v}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f93a74b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "📖 Documentación de infer_process:\n",
+      "==================================================\n",
+      "Help on function infer_process in module f5_tts.infer.utils_infer:\n",
+      "\n",
+      "infer_process(ref_audio, ref_text, gen_text, model_obj, vocoder, mel_spec_type='vocos', show_info=<built-in function print>, progress=<module 'tqdm' from '/mnt/c/Users/noel_/Desktop/TTS_HF/voice-clone-comparison/.venv/lib/python3.12/site-packages/tqdm/__init__.py'>, target_rms=0.1, cross_fade_duration=0.15, nfe_step=32, cfg_strength=2.0, sway_sampling_coef=-1.0, speed=1.0, fix_duration=None, device='cuda')\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from f5_tts.infer.utils_infer import infer_process, load_model, load_vocoder\n",
+    "\n",
+    "print(\"📖 Documentación de infer_process:\")\n",
+    "print(\"=\" * 50)\n",
+    "help(infer_process)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "3c06230a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "📖 Documentación de load_model:\n",
+      "==================================================\n",
+      "Help on function load_model in module f5_tts.infer.utils_infer:\n",
+      "\n",
+      "load_model(model_cls, model_cfg, ckpt_path, mel_spec_type='vocos', vocab_file='', ode_method='euler', use_ema=True, device='cuda')\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\n📖 Documentación de load_model:\")\n",
+    "print(\"=\" * 50)\n",
+    "help(load_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "5dee84d6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "📖 Documentación de load_vocoder:\n",
+      "==================================================\n",
+      "Help on function load_vocoder in module f5_tts.infer.utils_infer:\n",
+      "\n",
+      "load_vocoder(vocoder_name='vocos', is_local=False, local_path='', device='cuda', hf_cache_dir=None)\n",
+      "    # load vocoder\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\n📖 Documentación de load_vocoder:\")\n",
+    "print(\"=\" * 50)\n",
+    "help(load_vocoder)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc39776b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "📖 Documentación de load_model:\n",
+      "==================================================\n",
+      "Help on function load_model in module f5_tts.infer.utils_infer:\n",
+      "\n",
+      "load_model(model_cls, model_cfg, ckpt_path, mel_spec_type='vocos', vocab_file='', ode_method='euler', use_ema=True, device='cuda')\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\n📖 Documentación de load_model:\")\n",
+    "print(\"=\" * 50)\n",
+    "help(load_model)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}