voiceclone-dev / app.py
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
import gc
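
# Silence library warnings and pre-accept the Coqui CPML license so the first
# XTTS download does not block on an interactive prompt.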
warnings.filterwarnings("ignore")
os.environ["COQUI_TOS_AGREED"] = "1"
print("πŸš€ Starting OPTIMIZED Voice Cloning Studio...")
@contextmanager
def patch_torch_load():
    original_load = torch.load
    def patched_load(f, *args, **kwargs):
        kwargs['weights_only'] = False
        return original_load(f, *args, **kwargs)
    torch.load = patched_load
    try:
        yield
    finally:
        torch.load = original_load

# OPTIMIZATION 1: Hardware Detection and Setup
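# cudnn.benchmark autotunes convolution kernels for fixed input shapes, and
# TF32 trades a little matmul precision for throughput on Ampere+ GPUs.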
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cuda":
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
print(f"πŸ”₯ Device: {DEVICE}")
TTS_MODEL = None
WHISPER_MODEL = None
MODEL_STATUS = "Not Loaded"
SPEAKER_EMBEDDINGS_CACHE = {}

def load_xtts_optimized():
    global TTS_MODEL, MODEL_STATUS
    if TTS_MODEL is not None:
        return True
    try:
        with patch_torch_load():
            from TTS.api import TTS
            print("📦 Loading XTTS with optimizations...")
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
                gpu=(DEVICE == "cuda")
            )
        MODEL_STATUS = "XTTS-v2 Optimized"
        print("✅ XTTS loaded with optimizations!")
        return True
    except Exception as e:
        print(f"❌ XTTS loading failed: {e}")
        MODEL_STATUS = f"Failed: {str(e)}"
        return False

def load_whisper_optimized():
    global WHISPER_MODEL
    if WHISPER_MODEL is not None:
        return True
    try:
        import whisper
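        # "base" (~74M parameters) favors speed; swap in "small" or "medium"
        # if transcription accuracy matters more than latency.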
        WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
        print("✅ Whisper loaded (base model for speed)!")
        return True
    except Exception as e:
        print(f"❌ Whisper failed: {e}")
        return False

def optimize_audio_input(audio_path, max_duration=15):
    """Limit audio length for faster processing."""
    try:
        import librosa
        import soundfile as sf
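        # 22050 Hz mono matches the sample rate XTTS expects for
        # reference/conditioning audio.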
        audio, sr = librosa.load(audio_path, sr=22050)
        # Limit duration for speed
        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"🔄 Audio trimmed to {max_duration}s for speed")
        # Save the trimmed audio; os.path.splitext also handles non-.wav uploads
        base, _ = os.path.splitext(audio_path)
        optimized_path = base + "_opt.wav"
        sf.write(optimized_path, audio, sr)
        return optimized_path
    except Exception as e:
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path

def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
    """OPTIMIZED voice cloning with performance improvements."""
    try:
        print(f"🎭 OPTIMIZED Voice cloning: {language}")
        if not reference_audio or not input_audio:
            return None, "❌ Upload both audio files!"
        # Load models
        if not load_xtts_optimized():
            return None, f"❌ XTTS failed: {MODEL_STATUS}"
        load_whisper_optimized()
        # Optimize input audio for speed
        ref_optimized = optimize_audio_input(reference_audio, max_duration=15)
        input_optimized = optimize_audio_input(input_audio, max_duration=20)
        # Fast transcription with limits; falls back to a stock sentence on failure
        extracted_text = "Voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        # Whisper expects bare ISO codes ("zh"), while the
                        # XTTS dropdown uses "zh-cn"
                        language=language.split("-")[0] if language != "auto" else None
                    )
                text = result.get("text", "").strip()[:300]  # Limit text length
                if text and len(text) > 10:
                    extracted_text = text
                    print(f"✅ Extracted: '{extracted_text[:50]}...'")
            except Exception as e:
                print(f"⚠️ Transcription error: {e}")
        # Generate output
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        print("🚀 Generating optimized voice clone...")
        with patch_torch_load(), torch.no_grad():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=ref_optimized,
                language=language,
                file_path=output_path,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=5.0
            )
        # Memory cleanup
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        # Verify output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
📝 Text: '{extracted_text[:100]}...'
🎭 Device: {DEVICE}
🔧 Status: {MODEL_STATUS}
📊 Size: {os.path.getsize(output_path)/1024:.1f} KB
🚀 Optimizations: Limited audio, FP16, Memory cleanup"""
            print("✅ Optimized voice cloning completed!")
            return output_path, success_msg
        else:
            return None, "❌ Output file is empty!"
    except Exception as e:
        error_msg = f"❌ Optimized cloning error: {str(e)}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface
interface = gr.Interface(
    fn=voice_to_voice_clone_optimized,
    inputs=[
        gr.Audio(
            label="🎤 Reference Audio (Voice to Clone - Max 15s recommended)",
            type="filepath",
            sources=["upload"]
        ),
        gr.Audio(
            label="🎵 Input Audio (Content - Max 20s for speed)",
            type="filepath",
            sources=["upload"]
        ),
        gr.Dropdown(
            # XTTS-v2 language codes (note Chinese is "zh-cn", not "zh")
            choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh-cn", "ja", "ko"],
            value="en",
            label="🌍 Language"
        )
    ],
    outputs=[
        gr.Audio(label="🎉 Optimized Cloned Voice"),
        gr.Textbox(label="📊 Performance Stats", lines=8)
    ],
    title="🚀 HIGH-SPEED Voice Cloning Studio",
    description="⚡ Optimized XTTS-v2 with performance tuning. Use 10-20 second clips for the fastest results (roughly 30-120 seconds of processing).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    api_name="voice_to_voice_clone"
)
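
# A minimal client-side sketch for calling this endpoint remotely (assumes
# gradio_client is installed; the Space URL is a placeholder):
#
#     from gradio_client import Client, handle_file
#     client = Client("<your-space-url>")
#     audio_path, stats = client.predict(
#         handle_file("reference.wav"),
#         handle_file("content.wav"),
#         "en",
#         api_name="/voice_to_voice_clone",
#     )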

if __name__ == "__main__":
    print("🌐 Launching OPTIMIZED Voice Cloning Studio...")
    # Queue configuration: Gradio 4 moved queueing from launch(enable_queue=...)
    # to interface.queue(...)
    interface.queue(
        max_size=5,                   # Limit queue size to prevent overload
        api_open=True,                # Allow API access
        default_concurrency_limit=1   # One request at a time for stability
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False  # Disable debug for speed
    )