import os

import torch
import torchaudio
import gradio as gr
from cached_path import cached_path

from f5_tts.infer.utils_infer import (
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    convert_char_to_pinyin,
)
from f5_tts.model import DiT

MODEL_NAME = "F5-TTS"
SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "zh"]
MAX_AUDIO_SIZE = 10 * 1024 * 1024  # 10 MB upload limit

# Module-level state so the models are loaded only once per process.
model = None
vocoder = None
model_loaded = False


def load_models():
    """Load F5-TTS and vocoder (only once at startup)"""
    global model, vocoder, model_loaded

    if model_loaded:
        return True

    try:
        print("⏳ Loading F5-TTS and vocoder...")
        print("=" * 50)

        print("🔥 Loading Vocos vocoder...")
        vocoder = load_vocoder(
            vocoder_name="vocos",
            is_local=False,
            device="cpu",
        )
        print("✅ Vocoder loaded successfully")

        print("\n🔥 Loading F5-TTS v1 Base model...")
        ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors"))
        # DiT hyperparameters for the F5-TTS v1 Base checkpoint.
        model_cfg = dict(
            dim=1024,
            depth=22,
            heads=16,
            ff_mult=2,
            text_dim=512,
            conv_layers=4,
        )

        model = load_model(
            DiT,
            model_cfg,
            ckpt_path,
            device="cpu",  # keep the model on CPU to match the vocoder
        )
        print("✅ F5-TTS model loaded successfully")

        model_loaded = True
        print("\n" + "=" * 50)
        print("✅ All models loaded successfully")
        return True

    except Exception as e:
        print("\n❌ CRITICAL ERROR loading models:")
        print(f"   Type: {type(e).__name__}")
        print(f"   Message: {e}")
        import traceback

        print("\nFull stack trace:")
        traceback.print_exc()
        print("=" * 50)
        return False


def validate_audio(audio_file):
    """Validate the uploaded reference audio file"""
    if audio_file is None:
        return False, "Please upload an audio file"

    try:
        file_size = os.path.getsize(audio_file)
        if file_size > MAX_AUDIO_SIZE:
            return False, f"File too large. Maximum {MAX_AUDIO_SIZE // (1024 * 1024)}MB"
        return True, "Valid audio"
    except Exception as e:
        return False, f"Error validating audio: {e}"


def generate_voice_with_steps(reference_audio, ref_text, gen_text):
    """Generate voice capturing intermediate denoising steps"""
    is_valid, msg = validate_audio(reference_audio)
    if not is_valid:
        return None, None, f"❌ {msg}"

    if not ref_text or not ref_text.strip():
        return None, None, "❌ You must provide the transcription of the reference audio"

    if not gen_text or not gen_text.strip():
        return None, None, "❌ You must provide the text to generate"

    if not model_loaded:
        success = load_models()
        if not success:
            return None, None, "❌ Error loading models"

    try:
        print("🔬 Generating with intermediate step capture...")

        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
            reference_audio,
            ref_text,
        )

        # Load the reference audio, downmix to mono, and resample to the
        # 24 kHz rate the model expects.
        audio, sr = torchaudio.load(ref_audio_processed)
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        if sr != 24000:
            resampler = torchaudio.transforms.Resample(sr, 24000)
            audio = resampler(audio)

        audio = audio.to("cpu")

        # The model conditions on reference text + target text as one sequence.
        text_list = [ref_text_processed + gen_text]
        final_text_list = convert_char_to_pinyin(text_list)

        # Estimate the output length in mel frames (hop length 256), scaling
        # the reference length by the byte-length ratio of the two texts.
        # E.g. 5 s of reference audio is 24000 * 5 / 256 ≈ 468 frames; if
        # gen_text is twice as long as ref_text, ~936 frames are appended.
        ref_audio_len = audio.shape[-1] // 256
        ref_text_len = len(ref_text_processed.encode("utf-8"))
        gen_text_len = len(gen_text.encode("utf-8"))
        duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)

        print("Calling model.sample() with trajectory capture...")
        with torch.inference_mode():
            generated_mel, trajectory = model.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=32,
                cfg_strength=2.0,
                sway_sampling_coef=-1.0,
            )

        print(f"Trajectory captured - Shape: {trajectory.shape}")

        # The trajectory holds steps + 1 states (index 0 is the initial
        # noise); decode a handful of them to audio.
        steps_to_extract = [0, 12, 20, 26, 32]
        step_audios = []

        for step_idx in steps_to_extract:
            print(f"Processing step {step_idx}/32...")
            mel_at_step = trajectory[step_idx]

            # Drop the reference portion and reorder to (batch, mel, frames)
            # for the vocoder.
            mel_generated = mel_at_step[:, ref_audio_len:, :]
            mel_generated = mel_generated.permute(0, 2, 1)

            audio_at_step = vocoder.decode(mel_generated)
            audio_np = audio_at_step.squeeze().cpu().numpy()

            step_audios.append((24000, audio_np))

        # The last extracted step (32) is the fully denoised result.
        final_audio = step_audios[-1]

        print("✅ Generation with steps completed")

        return final_audio, step_audios, f"✅ Generated with capture of {len(steps_to_extract)} intermediate steps"

    except Exception as e:
        print(f"❌ Error in generation with steps: {e}")
        import traceback

        traceback.print_exc()
        return None, None, f"❌ Error: {str(e)}"
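

# Optional helper (not wired into the UI): persist each captured step as a
# WAV file for offline listening. A minimal sketch assuming the
# (sample_rate, numpy_array) tuples produced by generate_voice_with_steps.
def save_step_audios(step_audios, out_dir="denoising_steps"):
    """Write each intermediate step to out_dir/step_<i>.wav."""
    os.makedirs(out_dir, exist_ok=True)
    for i, (sr, audio_np) in enumerate(step_audios):
        wav = torch.from_numpy(audio_np).unsqueeze(0)  # shape: (1, num_samples)
        torchaudio.save(os.path.join(out_dir, f"step_{i}.wav"), wav, sr)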


def create_interface():
    with gr.Blocks(
        title="F5-TTS Voice Cloning",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("# 🎤 F5-TTS Voice Cloning and 🔬 Denoising Process Visualization")
        gr.Markdown("Clone any voice with just 5-30 seconds of reference audio and see how noise transforms into speech step by step.")
        gr.Markdown("Developed by Noel Triguero. Model by SWivid.")
        gr.Markdown("---")

        gr.Markdown("""
## 🔬 Denoising Visualization
See how the model transforms pure noise into clean audio step by step.
The F5-TTS model uses 32 "denoising" steps to generate the final audio.
""")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Input")

                ref_audio_steps = gr.Audio(
                    label="Reference Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                with gr.Row():
                    ref_text_steps = gr.Textbox(
                        label="Transcription",
                        lines=2,
                        scale=1
                    )

                gen_text_steps = gr.Textbox(
                    label="Text to Generate",
                    lines=3,
                    scale=1
                )

                with gr.Row():
                    generate_steps_btn = gr.Button(
                        "🔬 Generate with Step Capture",
                        variant="primary"
                    )

        with gr.Row():
            status_steps = gr.Textbox(label="Status", interactive=False)

        gr.Markdown("### Intermediate Denoising Steps")

        with gr.Row():
            step_slider = gr.Slider(
                minimum=0,
                maximum=4,
                value=4,
                step=1,
                label="Select Step",
                info="0=Initial noise, 1=Step 12, 2=Step 20, 3=Step 26, 4=Step 32 (final). The first ~10 steps still sound like noise to human ears."
            )

        with gr.Row():
            step_audio = gr.Audio(
                label="Audio at Selected Step",
                type="numpy"
            )

        # Holds the (sample_rate, waveform) tuples for all captured steps.
        all_steps_state = gr.State(value=None)

        def update_step_audio(step_index, all_steps):
            if all_steps is None:
                return None
            return all_steps[int(step_index)]

        def process_with_steps(ref_audio, ref_text, gen_text):
            final, steps, status = generate_voice_with_steps(
                ref_audio, ref_text, gen_text
            )

            if steps:
                return steps, steps[-1], status
            else:
                return None, None, status

        generate_steps_btn.click(
            fn=process_with_steps,
            inputs=[ref_audio_steps, ref_text_steps, gen_text_steps],
            outputs=[all_steps_state, step_audio, status_steps]
        )

        step_slider.change(
            fn=update_step_audio,
            inputs=[step_slider, all_steps_state],
            outputs=[step_audio]
        )

        gr.Markdown("<br>")

        gr.Markdown("""
---
## 💡 Tips for Better Results

- **Clean audio:** no background noise, music, or echo
- **Duration:** 5-30 seconds is ideal
- **Exact transcription:** the transcription must match the audio exactly
- **Clear speech:** steady volume and clear pronunciation
- **Language:** reference audio and text should be in English or Chinese

---
## 🔧 Technical Information

- **Model:** F5-TTS (Flow Matching Text-to-Speech)
- **Vocoder:** Vocos
- **Device:** CPU (generation may take a while)

---
""")

    return demo


if __name__ == "__main__":
    print("🚀 Starting F5-TTS Voice Cloning App")
    print("=" * 50)

    # Load the models up front (as the load_models docstring intends) so the
    # first request doesn't pay the startup cost.
    load_models()

    demo = create_interface()
    # Queue requests so long CPU generations don't hit connection timeouts.
    demo.queue()
    demo.launch()
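
# Suggested install (package names assumed; pin versions as needed):
#   pip install gradio torch torchaudio f5-tts cached-path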