import os

import torch
import torchaudio
import gradio as gr
from cached_path import cached_path

from f5_tts.infer.utils_infer import (
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
    convert_char_to_pinyin,
)
from f5_tts.model import DiT

MODEL_NAME = "F5-TTS"
SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "zh"]
MAX_AUDIO_SIZE = 10 * 1024 * 1024  # 10 MB upload limit

# Module-level state so the models are loaded only once per process.
model = None
vocoder = None
model_loaded = False


def load_models():
    """Load F5-TTS and vocoder (only once at startup)"""
    global model, vocoder, model_loaded

    if model_loaded:
        return True

    try:
        print("⏳ Loading F5-TTS and vocoder...")
        print("=" * 50)

        print("🔥 Loading Vocos vocoder...")
        vocoder = load_vocoder(
            vocoder_name="vocos",
            is_local=False,
            device="cpu",
        )
        print("✅ Vocoder loaded successfully")

        print("\n🔥 Loading F5-TTS v1 Base model...")
        ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors"))
        # DiT hyperparameters for the F5-TTS v1 Base checkpoint.
        model_cfg = dict(
            dim=1024,
            depth=22,
            heads=16,
            ff_mult=2,
            text_dim=512,
            conv_layers=4,
        )

        model = load_model(
            DiT,
            model_cfg,
            ckpt_path,
            device="cpu",  # keep the model on CPU to match the vocoder
        )
        print("✅ F5-TTS model loaded successfully")

        model_loaded = True
        print("\n" + "=" * 50)
        print("✅ All models loaded successfully")
        return True

    except Exception as e:
        print("\n❌ CRITICAL ERROR loading models:")
        print(f"   Type: {type(e).__name__}")
        print(f"   Message: {e}")
        import traceback

        print("\nFull stack trace:")
        traceback.print_exc()
        print("=" * 50)
        return False


def validate_audio(audio_file):
    """Validate the uploaded reference audio file"""
    if audio_file is None:
        return False, "Please upload an audio file"

    try:
        file_size = os.path.getsize(audio_file)
        if file_size > MAX_AUDIO_SIZE:
            return False, f"File too large. Maximum {MAX_AUDIO_SIZE // (1024 * 1024)}MB"
        return True, "Valid audio"
    except Exception as e:
        return False, f"Error validating audio: {e}"


def generate_voice_with_steps(reference_audio, ref_text, gen_text):
    """Generate voice capturing intermediate denoising steps"""
    is_valid, msg = validate_audio(reference_audio)
    if not is_valid:
        return None, None, f"❌ {msg}"

    if not ref_text or not ref_text.strip():
        return None, None, "❌ You must provide the transcription of the reference audio"

    if not gen_text or not gen_text.strip():
        return None, None, "❌ You must provide the text to generate"

    if not model_loaded:
        success = load_models()
        if not success:
            return None, None, "❌ Error loading models"

    try:
        print("🔬 Generating with intermediate step capture...")

        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
            reference_audio,
            ref_text,
        )

        # Load the reference audio, downmix to mono, and resample to the
        # 24 kHz rate the model expects.
        audio, sr = torchaudio.load(ref_audio_processed)
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        if sr != 24000:
            resampler = torchaudio.transforms.Resample(sr, 24000)
            audio = resampler(audio)

        audio = audio.to("cpu")

        # The model conditions on reference text + target text as one sequence.
        text_list = [ref_text_processed + gen_text]
        final_text_list = convert_char_to_pinyin(text_list)

        # Estimate the output length in mel frames (hop length 256), scaling
        # the reference length by the byte-length ratio of the two texts.
        # E.g. 5 s of reference audio is 24000 * 5 / 256 ≈ 468 frames; if
        # gen_text is twice as long as ref_text, ~936 frames are appended.
        ref_audio_len = audio.shape[-1] // 256
        ref_text_len = len(ref_text_processed.encode("utf-8"))
        gen_text_len = len(gen_text.encode("utf-8"))
        duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len)

        print("Calling model.sample() with trajectory capture...")
        with torch.inference_mode():
            generated_mel, trajectory = model.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=32,
                cfg_strength=2.0,
                sway_sampling_coef=-1.0,
            )

        print(f"Trajectory captured - Shape: {trajectory.shape}")

        # The trajectory holds steps + 1 states (index 0 is the initial
        # noise); decode a handful of them to audio.
        steps_to_extract = [0, 12, 20, 26, 32]
        step_audios = []

        for step_idx in steps_to_extract:
            print(f"Processing step {step_idx}/32...")
            mel_at_step = trajectory[step_idx]

            # Drop the reference portion and reorder to (batch, mel, frames)
            # for the vocoder.
            mel_generated = mel_at_step[:, ref_audio_len:, :]
            mel_generated = mel_generated.permute(0, 2, 1)

            audio_at_step = vocoder.decode(mel_generated)
            audio_np = audio_at_step.squeeze().cpu().numpy()

            step_audios.append((24000, audio_np))

        # The last extracted step (32) is the fully denoised result.
        final_audio = step_audios[-1]

        print("✅ Generation with steps completed")

        return final_audio, step_audios, f"✅ Generated with capture of {len(steps_to_extract)} intermediate steps"

    except Exception as e:
        print(f"❌ Error in generation with steps: {e}")
        import traceback

        traceback.print_exc()
        return None, None, f"❌ Error: {str(e)}"
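

# Optional helper (not wired into the UI): persist each captured step as a
# WAV file for offline listening. A minimal sketch assuming the
# (sample_rate, numpy_array) tuples produced by generate_voice_with_steps.
def save_step_audios(step_audios, out_dir="denoising_steps"):
    """Write each intermediate step to out_dir/step_<i>.wav."""
    os.makedirs(out_dir, exist_ok=True)
    for i, (sr, audio_np) in enumerate(step_audios):
        wav = torch.from_numpy(audio_np).unsqueeze(0)  # shape: (1, num_samples)
        torchaudio.save(os.path.join(out_dir, f"step_{i}.wav"), wav, sr)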


def create_interface():
    with gr.Blocks(
        title="F5-TTS Voice Cloning",
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown("# 🎤 F5-TTS Voice Cloning and 🔬 Denoising Process Visualization")
        gr.Markdown("Clone any voice with just 5-30 seconds of reference audio and see how noise transforms into speech step by step.")
        gr.Markdown("Developed by Noel Triguero. Model by SWivid.")
        gr.Markdown("---")

        gr.Markdown("""
## 🔬 Denoising Visualization
See how the model transforms pure noise into clean audio step by step.
The F5-TTS model uses 32 "denoising" steps to generate the final audio.
""")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Input")

                ref_audio_steps = gr.Audio(
                    label="Reference Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                with gr.Row():
                    ref_text_steps = gr.Textbox(
                        label="Transcription",
                        lines=2,
                        scale=1
                    )

                gen_text_steps = gr.Textbox(
                    label="Text to Generate",
                    lines=3,
                    scale=1
                )

                with gr.Row():
                    generate_steps_btn = gr.Button(
                        "🔬 Generate with Step Capture",
                        variant="primary"
                    )

        with gr.Row():
            status_steps = gr.Textbox(label="Status", interactive=False)

        gr.Markdown("### Intermediate Denoising Steps")

        with gr.Row():
            step_slider = gr.Slider(
                minimum=0,
                maximum=4,
                value=4,
                step=1,
                label="Select Step",
                info="0=Initial noise, 1=Step 12, 2=Step 20, 3=Step 26, 4=Step 32 (final). The first ~10 steps still sound like noise to human ears."
            )

        with gr.Row():
            step_audio = gr.Audio(
                label="Audio at Selected Step",
                type="numpy"
            )

        # Holds the (sample_rate, waveform) tuples for all captured steps.
        all_steps_state = gr.State(value=None)

        def update_step_audio(step_index, all_steps):
            if all_steps is None:
                return None
            return all_steps[int(step_index)]

        def process_with_steps(ref_audio, ref_text, gen_text):
            final, steps, status = generate_voice_with_steps(
                ref_audio, ref_text, gen_text
            )

            if steps:
                return steps, steps[-1], status
            else:
                return None, None, status

        generate_steps_btn.click(
            fn=process_with_steps,
            inputs=[ref_audio_steps, ref_text_steps, gen_text_steps],
            outputs=[all_steps_state, step_audio, status_steps]
        )

        step_slider.change(
            fn=update_step_audio,
            inputs=[step_slider, all_steps_state],
            outputs=[step_audio]
        )

        gr.Markdown("<br>")

        gr.Markdown("""
---
## 💡 Tips for Better Results

- **Clean audio:** no background noise, music, or echo
- **Duration:** 5-30 seconds is ideal
- **Exact transcription:** the transcription must match the audio exactly
- **Clear speech:** steady volume and clear pronunciation
- **Language:** reference audio and text should be in English or Chinese

---
## 🔧 Technical Information

- **Model:** F5-TTS (Flow Matching Text-to-Speech)
- **Vocoder:** Vocos
- **Device:** CPU (generation may take a while)

---
""")

    return demo


if __name__ == "__main__":
    print("🚀 Starting F5-TTS Voice Cloning App")
    print("=" * 50)

    # Load the models up front (as the load_models docstring intends) so the
    # first request doesn't pay the startup cost.
    load_models()

    demo = create_interface()
    # Queue requests so long CPU generations don't hit connection timeouts.
    demo.queue()
    demo.launch()
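
# Suggested install (package names assumed; pin versions as needed):
#   pip install gradio torch torchaudio f5-tts cached-path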