# Web-page residue (not code), kept as comments so the module parses:
# notrito's picture
# bug
# 7da6710
import gradio as gr
import time
import os
from pathlib import Path
import json
from cached_path import cached_path
from f5_tts.infer.utils_infer import load_model, load_vocoder
from f5_tts.model import DiT
from f5_tts.infer.utils_infer import infer_process, preprocess_ref_audio_text
import torch
import torchaudio
from f5_tts.infer.utils_infer import preprocess_ref_audio_text, convert_char_to_pinyin
# Configuration
MODEL_NAME = "F5-TTS"
SUPPORTED_LANGUAGES = [
    "en",
    "es",
    "fr",
    "de",
    "it",
    "zh",
]
MAX_AUDIO_SIZE = 10 * 1024 ** 2  # upload size limit in bytes (10MB)

# Module-level handles for the networks; they are filled in once by
# load_models() and reused by every later request.
model = vocoder = None
model_loaded = False
def load_models():
    """Load the Vocos vocoder and the F5-TTS model into the module globals.

    The work is done at most once per process: when ``model_loaded`` is
    already set the function returns immediately. Both networks are placed
    on the CPU explicitly so that they sit on the same device as each other
    and as the CPU tensors the generation code passes to them.

    Returns:
        bool: ``True`` when the models are available (freshly loaded or
        loaded by an earlier call), ``False`` when loading failed. Failures
        are printed together with the stack trace instead of being raised.
    """
    global model, vocoder, model_loaded
    if model_loaded:
        return True
    try:
        print("⏳ Loading F5-TTS and vocoder...")
        print("=" * 50)
        # Load vocoder first
        print("🔥 Loading Vocos vocoder...")
        loaded_vocoder = load_vocoder(
            vocoder_name="vocos",
            is_local=False,
            device="cpu",
        )
        print("✅ Vocoder loaded successfully")
        # Model configuration (copied from official code)
        print("\n🔥 Loading F5-TTS v1 Base model...")
        ckpt_path = str(
            cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors")
        )
        model_cfg = dict(
            dim=1024,
            depth=22,
            heads=16,
            ff_mult=2,
            text_dim=512,
            conv_layers=4,
        )
        # Pin the model to the CPU as well: without an explicit device the
        # loader picks its own default, which may differ from the vocoder's.
        loaded_model = load_model(
            DiT,
            model_cfg,
            ckpt_path,
            device="cpu",
        )
        print("✅ F5-TTS model loaded successfully")
        # Publish the globals only after both loads succeeded, so a failure
        # half-way through never leaves a partially initialised state.
        vocoder = loaded_vocoder
        model = loaded_model
        model_loaded = True
        print("\n" + "=" * 50)
        print("✅ All models loaded successfully")
        return True
    except Exception as e:
        # Top-level boundary: report the problem and let the caller decide
        # what to show to the user.
        print("\n❌ CRITICAL ERROR loading models:")
        print(f" Type: {type(e).__name__}")
        print(f" Message: {str(e)}")
        import traceback
        print("\nFull stack trace:")
        traceback.print_exc()
        print("=" * 50)
        return False
def validate_audio(audio_file):
    """Check that an uploaded audio file is readable, non-empty and not too big.

    Args:
        audio_file: Path of the audio file on disk, or ``None`` when nothing
            was uploaded.

    Returns:
        tuple[bool, str]: ``(True, "Valid audio")`` when the file passes all
        checks, otherwise ``(False, reason)`` where ``reason`` is a
        user-facing explanation.
    """
    if audio_file is None:
        return False, "Please upload an audio file"
    # Only the size lookup can fail, so it is the only statement guarded.
    try:
        file_size = os.path.getsize(audio_file)
    except (OSError, TypeError, ValueError) as e:
        return False, f"Error validating audio: {e}"
    if file_size == 0:
        # A zero-byte file cannot contain audio; reject it here with a clear
        # message rather than failing later while decoding.
        return False, "Audio file is empty"
    if file_size > MAX_AUDIO_SIZE:
        # Derive the displayed limit from the constant so the message stays
        # correct if the limit is ever changed.
        limit_mb = MAX_AUDIO_SIZE / (1024 * 1024)
        return False, f"File too large. Maximum {limit_mb:g}MB"
    return True, "Valid audio"
# def generate_voice(reference_audio, ref_text, gen_text):
# """Generate voice with F5-TTS"""
# # Validate input
# is_valid, msg = validate_audio(reference_audio)
# if not is_valid:
# return None, f"❌ {msg}", ""
# if not ref_text or not ref_text.strip():
# return None, "❌ You must write the transcription of the reference audio", ""
# if not gen_text or not gen_text.strip():
# return None, "❌ You must write the text to generate", ""
# # Check that models are loaded
# if not model_loaded:
# success = load_models()
# if not success:
# return None, "❌ Error loading models. Try reloading the page.", ""
# try:
# start_time = time.time()
# print(f"🎤 Generating audio...")
# print(f" Ref text: {ref_text[:50]}...")
# print(f" Gen text: {gen_text[:50]}...")
# # Preprocess reference audio
# ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
# reference_audio,
# ref_text
# )
# # Process with F5-TTS (same as official code)
# final_wave, final_sample_rate, combined_spectrogram = infer_process(
# ref_audio=ref_audio_processed,
# ref_text=ref_text_processed,
# gen_text=gen_text,
# model_obj=model,
# vocoder=vocoder,
# device="cpu"
# )
# end_time = time.time()
# processing_time = end_time - start_time
# # result should be the generated audio
# output_path = "generated_audio.wav"
# success_msg = f"✅ Audio generated successfully"
# time_msg = f"⏱️ Time: {processing_time:.2f}s"
# return (final_sample_rate, final_wave), success_msg, time_msg
# except Exception as e:
# print(f"❌ Error in generation: {e}")
# import traceback
# traceback.print_exc()
# return None, f"❌ Error: {str(e)}", ""
def generate_voice_with_steps(reference_audio, ref_text, gen_text):
    """Generate speech and keep audio snapshots of intermediate denoising steps.

    The reference audio is validated, preprocessed, down-mixed to mono and
    resampled to 24 kHz. The model is then sampled once and the returned
    trajectory is decoded with the vocoder at a fixed set of steps.

    Args:
        reference_audio: Path of the reference audio file.
        ref_text: Transcription of the reference audio.
        gen_text: Text to synthesise.

    Returns:
        tuple: ``(final_audio, step_audios, message)``. On success
        ``step_audios`` is a list of ``(sample_rate, waveform)`` pairs (one
        per extracted step, waveform as a NumPy array) and ``final_audio``
        is the last of them. On any failure the first two items are ``None``
        and ``message`` describes the problem; exceptions are not raised.
    """
    sample_rate = 24000
    hop_length = 256
    nfe_steps = 32
    steps_to_extract = [0, 12, 20, 26, 32]

    # Validate input
    is_valid, msg = validate_audio(reference_audio)
    if not is_valid:
        return None, None, f"❌ {msg}"
    if not ref_text or not ref_text.strip():
        return None, None, "❌ You must write the transcription of the reference audio"
    if not gen_text or not gen_text.strip():
        return None, None, "❌ You must write the text to generate"
    # Check that models are loaded
    if not model_loaded:
        success = load_models()
        if not success:
            return None, None, "❌ Error loading models"
    try:
        print("🔬 Generating with intermediate step capture...")
        # Preprocess
        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
            reference_audio,
            ref_text,
        )
        # Load the audio and down-mix multi-channel input to mono
        audio, sr = torchaudio.load(ref_audio_processed)
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)
        # Resample if necessary
        if sr != sample_rate:
            audio = torchaudio.transforms.Resample(sr, sample_rate)(audio)
        audio = audio.to("cpu")
        # Prepare text: reference transcription followed by the new text
        final_text_list = convert_char_to_pinyin([ref_text_processed + gen_text])
        # Estimate the total duration in frames: the reference length plus
        # the new text scaled by the reference's frames-per-byte ratio.
        ref_audio_len = audio.shape[-1] // hop_length
        ref_text_len = len(ref_text_processed.encode("utf-8"))
        gen_text_len = len(gen_text.encode("utf-8"))
        duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)
        # Generate WITH trajectory
        print("Calling model.sample() with trajectory capture...")
        step_audios = []
        # Sampling and vocoder decoding share one inference-mode scope so the
        # trajectory tensors are never consumed with autograd enabled.
        with torch.inference_mode():
            _, trajectory = model.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_steps,
                cfg_strength=2.0,
                sway_sampling_coef=-1.0,
            )
            print(f"Trajectory captured - Shape: {trajectory.shape}")
            # Clamp the requested indices so a trajectory shorter than
            # expected yields its last entry instead of an IndexError.
            last_index = len(trajectory) - 1
            for step_idx in steps_to_extract:
                print(f"Processing step {step_idx}/{nfe_steps}...")
                mel_at_step = trajectory[min(step_idx, last_index)]
                # Drop the frames that belong to the reference audio, then
                # swap the last two axes for the vocoder.
                # NOTE(review): assumes (batch, frames, mel channels) — confirm.
                mel_generated = mel_at_step[:, ref_audio_len:, :].permute(0, 2, 1)
                # No-op for float32 tensors; guards against a reduced-precision
                # trajectory reaching the vocoder.
                mel_generated = mel_generated.to(torch.float32)
                # Convert to audio with vocoder
                audio_at_step = vocoder.decode(mel_generated)
                audio_np = audio_at_step.squeeze().cpu().numpy()
                step_audios.append((sample_rate, audio_np))
        # The last step is the final audio
        final_audio = step_audios[-1]
        print("✅ Generation with steps completed")
        # Return: final audio, list of steps, message
        return (
            final_audio,
            step_audios,
            f"✅ Generated with capture of {len(steps_to_extract)} intermediate steps",
        )
    except Exception as e:
        # UI boundary: report the failure as a status message.
        print(f"❌ Error in generation with steps: {e}")
        import traceback
        traceback.print_exc()
        return None, None, f"❌ Error: {str(e)}"
# Build the Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for step-by-step voice generation.

    The page has inputs for the reference audio, its transcription and the
    text to generate, a status box, and a slider plus audio player for
    listening to the audio at the captured denoising steps. All captured
    steps are kept in a ``gr.State`` so that moving the slider does not
    trigger a new generation.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    # Index of the last captured step; also the slider's maximum and default.
    final_step_index = 4

    with gr.Blocks(
        title="F5-TTS Voice Cloning",
        theme=gr.themes.Soft()
    ) as demo:
        gr.Markdown("# 🎤 F5-TTS Voice Cloning and 🔬 Denoising Process Visualization")
        gr.Markdown("Clone any voice with just 5-30 seconds of reference audio and see how noise transforms into speech step by step.")
        gr.Markdown("Developed by Noel Triguero. Model by SWivid")
        gr.Markdown("---")
        gr.Markdown("""
##
See how the model transforms pure noise into clean audio step by step.
The F5-TTS model uses 32 "denoising" steps to generate the final audio.
""")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Input")
                ref_audio_steps = gr.Audio(
                    label="Reference Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                with gr.Row():
                    ref_text_steps = gr.Textbox(
                        label="Transcription",
                        lines=2,
                        scale=1
                    )
                    gen_text_steps = gr.Textbox(
                        label="Text to Generate",
                        lines=3,
                        scale=1
                    )
                with gr.Row():
                    generate_steps_btn = gr.Button(
                        "🔬 Generate with Step Capture",
                        variant="primary"
                    )
                with gr.Row():
                    status_steps = gr.Textbox(label="Status", interactive=False)
                gr.Markdown("### Intermediate Denoising Steps")
                with gr.Row():
                    step_slider = gr.Slider(
                        minimum=0,
                        maximum=final_step_index,
                        value=final_step_index,
                        step=1,
                        label="Select Step",
                        info="0=Initial noise, 1=Step 12, 2=Step 20, 3=Step 26, 4=Step 32 (final)\n (First 10 steps are noise for humans)"
                    )
                with gr.Row():
                    step_audio = gr.Audio(
                        label="Audio at Selected Step",
                        type="numpy"
                    )
        # Hidden state to store all steps
        all_steps_state = gr.State(value=None)

        def update_step_audio(step_index, all_steps):
            """Return the stored audio for the slider position, if any."""
            if not all_steps:
                return None
            # Clamp so a shorter-than-expected list never raises IndexError.
            position = max(0, min(int(step_index), len(all_steps) - 1))
            return all_steps[position]

        def process_with_steps(ref_audio, ref_text, gen_text):
            """Run a generation and store every captured step in the state.

            The slider is moved back to the final step in the same update so
            that its position always matches the audio in the player.
            """
            _, steps, status = generate_voice_with_steps(
                ref_audio, ref_text, gen_text
            )
            if steps:
                return steps, steps[-1], status, final_step_index
            return None, None, status, final_step_index

        generate_steps_btn.click(
            fn=process_with_steps,
            inputs=[ref_audio_steps, ref_text_steps, gen_text_steps],
            outputs=[all_steps_state, step_audio, status_steps, step_slider]
        )
        step_slider.change(
            fn=update_step_audio,
            inputs=[step_slider, all_steps_state],
            outputs=[step_audio]
        )
        gr.Markdown("<br>")  # Spacer above the tips section
        gr.Markdown("""
---
## 💡 Tips for Better Results
**Clean audio:** No background noise, music or echo
**Duration:** 5-30 seconds is ideal
**Exact transcription:** The transcription must match the audio exactly
**Clear speech:** Constant volume and clear pronunciation
**Language:** Reference audio and text should be in english or chinese
---
## 🔧 Technical Information
**Model:** F5-TTS (Flow Matching Text-to-Speech)
**Vocoder:** Vocos
**Device:** CPU (may take a while...)
---
""")
    return demo
if __name__ == "__main__":
    # Startup banner: title line followed by a separator rule.
    print("🚀 Starting F5-TTS Voice Cloning App", "=" * 50, sep="\n")
    # Models are loaded on demand by the first generation request.
    # Uncomment the call below to load them at startup instead, which
    # improves the first experience.
    # load_models()
    demo = create_interface()
    demo.launch()