voiceclone-dev / app.py
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
import gc
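
# Silence library warnings and pre-accept the Coqui CPML license so the first
# XTTS download does not block on an interactive prompt.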
warnings.filterwarnings("ignore")
os.environ["COQUI_TOS_AGREED"] = "1"
print("πŸš€ Starting OPTIMIZED Voice Cloning Studio...")
@contextmanager
def patch_torch_load():
    original_load = torch.load
    def patched_load(f, *args, **kwargs):
        kwargs['weights_only'] = False
        return original_load(f, *args, **kwargs)
    torch.load = patched_load
    try:
        yield
    finally:
        torch.load = original_load

# OPTIMIZATION 1: Hardware Detection and Setup
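# cudnn.benchmark autotunes convolution kernels for fixed input shapes, and
# TF32 trades a little matmul precision for throughput on Ampere+ GPUs.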
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cuda":
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
print(f"πŸ”₯ Device: {DEVICE}")
TTS_MODEL = None
WHISPER_MODEL = None
MODEL_STATUS = "Not Loaded"
SPEAKER_EMBEDDINGS_CACHE = {}

def load_xtts_optimized():
    global TTS_MODEL, MODEL_STATUS
    if TTS_MODEL is not None:
        return True
    try:
        with patch_torch_load():
            from TTS.api import TTS
            print("📦 Loading XTTS with optimizations...")
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
                gpu=(DEVICE == "cuda")
            )
        MODEL_STATUS = "XTTS-v2 Optimized"
        print("✅ XTTS loaded with optimizations!")
        return True
    except Exception as e:
        print(f"❌ XTTS loading failed: {e}")
        MODEL_STATUS = f"Failed: {str(e)}"
        return False

def load_whisper_optimized():
    global WHISPER_MODEL
    if WHISPER_MODEL is not None:
        return True
    try:
        import whisper
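        # "base" (~74M parameters) favors speed; swap in "small" or "medium"
        # if transcription accuracy matters more than latency.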
        WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
        print("✅ Whisper loaded (base model for speed)!")
        return True
    except Exception as e:
        print(f"❌ Whisper failed: {e}")
        return False

def optimize_audio_input(audio_path, max_duration=15):
    """Limit audio length for faster processing."""
    try:
        import librosa
        import soundfile as sf
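        # 22050 Hz mono matches the sample rate XTTS expects for
        # reference/conditioning audio.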
        audio, sr = librosa.load(audio_path, sr=22050)
        # Limit duration for speed
        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"🔄 Audio trimmed to {max_duration}s for speed")
        # Save the trimmed audio; os.path.splitext also handles non-.wav uploads
        base, _ = os.path.splitext(audio_path)
        optimized_path = base + "_opt.wav"
        sf.write(optimized_path, audio, sr)
        return optimized_path
    except Exception as e:
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path

def voice_to_voice_clone_optimized(reference_audio, input_audio, language="en"):
    """OPTIMIZED voice cloning with performance improvements."""
    try:
        print(f"🎭 OPTIMIZED Voice cloning: {language}")
        if not reference_audio or not input_audio:
            return None, "❌ Upload both audio files!"
        # Load models
        if not load_xtts_optimized():
            return None, f"❌ XTTS failed: {MODEL_STATUS}"
        load_whisper_optimized()
        # Optimize input audio for speed
        ref_optimized = optimize_audio_input(reference_audio, max_duration=15)
        input_optimized = optimize_audio_input(input_audio, max_duration=20)
        # Fast transcription with limits; falls back to a stock sentence on failure
        extracted_text = "Voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        # Whisper expects bare ISO codes ("zh"), while the
                        # XTTS dropdown uses "zh-cn"
                        language=language.split("-")[0] if language != "auto" else None
                    )
                text = result.get("text", "").strip()[:300]  # Limit text length
                if text and len(text) > 10:
                    extracted_text = text
                    print(f"✅ Extracted: '{extracted_text[:50]}...'")
            except Exception as e:
                print(f"⚠️ Transcription error: {e}")
        # Generate output
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        print("🚀 Generating optimized voice clone...")
        with patch_torch_load(), torch.no_grad():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=ref_optimized,
                language=language,
                file_path=output_path,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=5.0
            )
        # Memory cleanup
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        # Verify output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            success_msg = f"""✅ OPTIMIZED CLONING SUCCESS! ⚡
📝 Text: '{extracted_text[:100]}...'
🎭 Device: {DEVICE}
🔧 Status: {MODEL_STATUS}
📊 Size: {os.path.getsize(output_path)/1024:.1f} KB
🚀 Optimizations: Limited audio, FP16, Memory cleanup"""
            print("✅ Optimized voice cloning completed!")
            return output_path, success_msg
        else:
            return None, "❌ Output file is empty!"
    except Exception as e:
        error_msg = f"❌ Optimized cloning error: {str(e)}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface
interface = gr.Interface(
    fn=voice_to_voice_clone_optimized,
    inputs=[
        gr.Audio(
            label="🎤 Reference Audio (Voice to Clone - Max 15s recommended)",
            type="filepath",
            sources=["upload"]
        ),
        gr.Audio(
            label="🎵 Input Audio (Content - Max 20s for speed)",
            type="filepath",
            sources=["upload"]
        ),
        gr.Dropdown(
            # XTTS-v2 language codes (note Chinese is "zh-cn", not "zh")
            choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh-cn", "ja", "ko"],
            value="en",
            label="🌍 Language"
        )
    ],
    outputs=[
        gr.Audio(label="🎉 Optimized Cloned Voice"),
        gr.Textbox(label="📊 Performance Stats", lines=8)
    ],
    title="🚀 HIGH-SPEED Voice Cloning Studio",
    description="⚡ Optimized XTTS-v2 with performance tuning. Use 10-20 second clips for the fastest results (roughly 30-120 seconds of processing).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    api_name="voice_to_voice_clone"
)
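
# A minimal client-side sketch for calling this endpoint remotely (assumes
# gradio_client is installed; the Space URL is a placeholder):
#
#     from gradio_client import Client, handle_file
#     client = Client("<your-space-url>")
#     audio_path, stats = client.predict(
#         handle_file("reference.wav"),
#         handle_file("content.wav"),
#         "en",
#         api_name="/voice_to_voice_clone",
#     )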

if __name__ == "__main__":
    print("🌐 Launching OPTIMIZED Voice Cloning Studio...")
    # Queue configuration: Gradio 4 moved queueing from launch(enable_queue=...)
    # to interface.queue(...)
    interface.queue(
        max_size=5,                   # Limit queue size to prevent overload
        api_open=True,                # Allow API access
        default_concurrency_limit=1   # One request at a time for stability
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False  # Disable debug for speed
    )