import gradio as gr import time import os from pathlib import Path import json from cached_path import cached_path from f5_tts.infer.utils_infer import load_model, load_vocoder from f5_tts.model import DiT from f5_tts.infer.utils_infer import infer_process, preprocess_ref_audio_text import torch import torchaudio from f5_tts.infer.utils_infer import preprocess_ref_audio_text, convert_char_to_pinyin # ConfiguraciΓ³n MODEL_NAME = "F5-TTS" SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "zh"] MAX_AUDIO_SIZE = 10 * 1024 * 1024 # 10MB # Variables globales para el modelo (se cargan una vez) model = None vocoder = None model_loaded = False def load_models(): """Load F5-TTS and vocoder (only once at startup)""" global model, vocoder, model_loaded if model_loaded: return True try: print("⏳ Loading F5-TTS and vocoder...") print("=" * 50) # Load vocoder first print("πŸ”₯ Loading Vocos vocoder...") vocoder = load_vocoder( vocoder_name="vocos", is_local=False, device="cpu" ) print("βœ… Vocoder loaded successfully") # Model configuration (copied from official code) print("\nπŸ”₯ Loading F5-TTS v1 Base model...") ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors")) model_cfg = dict( dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4 ) # Load model using the same function as the official code model = load_model( DiT, model_cfg, ckpt_path ) print("βœ… F5-TTS model loaded successfully") model_loaded = True print("\n" + "=" * 50) print("βœ… All models loaded successfully") return True except Exception as e: print(f"\n❌ CRITICAL ERROR loading models:") print(f" Type: {type(e).__name__}") print(f" Message: {str(e)}") import traceback print("\nFull stack trace:") traceback.print_exc() print("=" * 50) return False def validate_audio(audio_file): """Validate audio file""" if audio_file is None: return False, "Please upload an audio file" try: file_size = os.path.getsize(audio_file) if file_size > MAX_AUDIO_SIZE: return False, f"File too large. Maximum 10MB" return True, "Valid audio" except Exception as e: return False, f"Error validating audio: {e}" # def generate_voice(reference_audio, ref_text, gen_text): # """Generate voice with F5-TTS""" # # Validate input # is_valid, msg = validate_audio(reference_audio) # if not is_valid: # return None, f"❌ {msg}", "" # if not ref_text or not ref_text.strip(): # return None, "❌ You must write the transcription of the reference audio", "" # if not gen_text or not gen_text.strip(): # return None, "❌ You must write the text to generate", "" # # Check that models are loaded # if not model_loaded: # success = load_models() # if not success: # return None, "❌ Error loading models. Try reloading the page.", "" # try: # start_time = time.time() # print(f"🎀 Generating audio...") # print(f" Ref text: {ref_text[:50]}...") # print(f" Gen text: {gen_text[:50]}...") # # Preprocess reference audio # ref_audio_processed, ref_text_processed = preprocess_ref_audio_text( # reference_audio, # ref_text # ) # # Process with F5-TTS (same as official code) # final_wave, final_sample_rate, combined_spectrogram = infer_process( # ref_audio=ref_audio_processed, # ref_text=ref_text_processed, # gen_text=gen_text, # model_obj=model, # vocoder=vocoder, # device="cpu" # ) # end_time = time.time() # processing_time = end_time - start_time # # result should be the generated audio # output_path = "generated_audio.wav" # success_msg = f"βœ… Audio generated successfully" # time_msg = f"⏱️ Time: {processing_time:.2f}s" # return (final_sample_rate, final_wave), success_msg, time_msg # except Exception as e: # print(f"❌ Error in generation: {e}") # import traceback # traceback.print_exc() # return None, f"❌ Error: {str(e)}", "" def generate_voice_with_steps(reference_audio, ref_text, gen_text): """Generate voice capturing intermediate denoising steps""" # Validate input is_valid, msg = validate_audio(reference_audio) if not is_valid: return None, None, f"❌ {msg}" if not ref_text or not ref_text.strip(): return None, None, "❌ You must write the transcription of the reference audio" if not gen_text or not gen_text.strip(): return None, None, "❌ You must write the text to generate" # Check that models are loaded if not model_loaded: success = load_models() if not success: return None, None, "❌ Error loading models" try: print("πŸ”¬ Generating with intermediate step capture...") # Preprocess ref_audio_processed, ref_text_processed = preprocess_ref_audio_text( reference_audio, ref_text ) # Load and process audio audio, sr = torchaudio.load(ref_audio_processed) if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True) # Resample if necessary if sr != 24000: resampler = torchaudio.transforms.Resample(sr, 24000) audio = resampler(audio) audio = audio.to("cpu") # Prepare text text_list = [ref_text_processed + gen_text] final_text_list = convert_char_to_pinyin(text_list) # Calculate duration ref_audio_len = audio.shape[-1] // 256 # hop_length ref_text_len = len(ref_text_processed.encode("utf-8")) gen_text_len = len(gen_text.encode("utf-8")) duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len) # Generate WITH trajectory print("Calling model.sample() with trajectory capture...") with torch.inference_mode(): generated_mel, trajectory = model.sample( cond=audio, text=final_text_list, duration=duration, steps=32, cfg_strength=2.0, sway_sampling_coef=-1.0, ) print(f"Trajectory captured - Shape: {trajectory.shape}") # Extract specific steps to display steps_to_extract = [0, 12, 20, 26, 32] step_audios = [] for step_idx in steps_to_extract: print(f"Processing step {step_idx}/32...") mel_at_step = trajectory[step_idx] # Crop reference part and permute mel_generated = mel_at_step[:, ref_audio_len:, :] mel_generated = mel_generated.permute(0, 2, 1) # Convert to audio with vocoder audio_at_step = vocoder.decode(mel_generated) audio_np = audio_at_step.squeeze().cpu().numpy() step_audios.append((24000, audio_np)) # The last step is the final audio final_audio = step_audios[-1] print("βœ… Generation with steps completed") # Return: final audio, list of steps, message return final_audio, step_audios, f"βœ… Generated with capture of {len(steps_to_extract)} intermediate steps" except Exception as e: print(f"❌ Error in generation with steps: {e}") import traceback traceback.print_exc() return None, None, f"❌ Error: {str(e)}" # Crear interfaz Gradio def create_interface(): with gr.Blocks( title="F5-TTS Voice Cloning", theme=gr.themes.Soft() ) as demo: gr.Markdown("# 🎀 F5-TTS Voice Cloning and πŸ”¬ Denoising Process Visualization") gr.Markdown("Clone any voice with just 5-30 seconds of reference audio and see how noise transforms into speech step by step.") gr.Markdown("Developed by Noel Triguero. Model by SWivid") gr.Markdown("---") gr.Markdown(""" ## See how the model transforms pure noise into clean audio step by step. The F5-TTS model uses 32 "denoising" steps to generate the final audio. """) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Input") ref_audio_steps = gr.Audio( label="Reference Audio", type="filepath", sources=["upload", "microphone"] ) with gr.Row(): ref_text_steps = gr.Textbox( label="Transcription", lines=2, scale=1 ) gen_text_steps = gr.Textbox( label="Text to Generate", lines=3, scale=1 ) with gr.Row(): generate_steps_btn = gr.Button( "πŸ”¬ Generate with Step Capture", variant="primary" ) with gr.Row(): status_steps = gr.Textbox(label="Status", interactive=False) gr.Markdown("### Intermediate Denoising Steps") with gr.Row(): step_slider = gr.Slider( minimum=0, maximum=4, value=4, step=1, label="Select Step", info="0=Initial noise, 1=Step 12, 2=Step 20, 3=Step 26, 4=Step 32 (final)\n (First 10 steps are noise for humans)" ) with gr.Row(): step_audio = gr.Audio( label="Audio at Selected Step", type="numpy" ) # Hiden state to store all steps all_steps_state = gr.State(value=None) def update_step_audio(step_index, all_steps): if all_steps is None: return None return all_steps[int(step_index)] # Generate with steps and store all steps in state def process_with_steps(ref_audio, ref_text, gen_text): final, steps, status = generate_voice_with_steps( ref_audio, ref_text, gen_text ) # Only return the last step audio for the slider if steps: return steps, steps[-1], status else: return None, None, status generate_steps_btn.click( fn=process_with_steps, inputs=[ref_audio_steps, ref_text_steps, gen_text_steps], outputs=[all_steps_state, step_audio, status_steps] ) step_slider.change( fn=update_step_audio, inputs=[step_slider, all_steps_state], outputs=[step_audio] ) gr.Markdown("
") # Espacio arriba gr.Markdown(""" --- ## πŸ’‘ Tips for Better Results **Clean audio:** No background noise, music or echo **Duration:** 5-30 seconds is ideal **Exact transcription:** The transcription must match the audio exactly **Clear speech:** Constant volume and clear pronunciation **Language:** Reference audio and text should be in english or chinese --- ## πŸ”§ Technical Information **Model:** F5-TTS (Flow Matching Text-to-Speech) **Vocoder:** Vocos **Device:** CPU (may take a while...) --- """) return demo if __name__ == "__main__": # Pre-load models at startup (optional, improves first experience) print("πŸš€ Starting F5-TTS Voice Cloning App") print("=" * 50) # Comment the following line if you want on-demand loading # load_models() demo = create_interface() demo.launch()