# voiceclone-dev / app.py
# crackuser's picture
# Update app.py
# 71d678c verified
# raw
# history blame
# 5.65 kB
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
# Suppress every warning category process-wide; the ML/audio stack is noisy.
warnings.filterwarnings("ignore")
# Presumably read by the Coqui TTS library to skip its interactive
# terms-of-service prompt -- confirm against the TTS package.
os.environ["COQUI_TOS_AGREED"] = "1"
print("🚀 Starting Voice Cloning Studio...")
@contextmanager
def patch_torch_load():
    """Temporarily swap ``torch.load`` for a variant forcing ``weights_only=False``.

    While the ``with`` body runs, every ``torch.load`` call is forwarded to
    the loader that was installed on entry, with the ``weights_only`` keyword
    overwritten to ``False`` no matter what the caller passed. The previous
    loader is reinstated on exit, including when the body raises.

    NOTE(security): ``weights_only=False`` allows arbitrary pickled objects
    to be deserialized, so only wrap loads of trusted checkpoints. The swap
    rebinds the attribute on the ``torch`` module itself, so it is visible
    process-wide for the duration of the block.
    """
    saved_loader = torch.load

    def patched_load(f, *args, **kwargs):
        # Later dict entries win, so any caller-supplied weights_only is overridden.
        forced_kwargs = {**kwargs, "weights_only": False}
        return saved_loader(f, *args, **forced_kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        torch.load = saved_loader
# Inference device: "cuda" when torch reports CUDA as available, else "cpu".
DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
# Lazily populated model handles; each stays None until its loader succeeds.
TTS_MODEL = None
WHISPER_MODEL = None
# Human-readable XTTS load state, embedded in the status text returned to callers.
MODEL_STATUS: str = "Not Loaded"
def load_xtts_manual():
    """Load the XTTS-v2 model into the module-level ``TTS_MODEL`` on first use.

    Subsequent calls are no-ops once ``TTS_MODEL`` is set. ``MODEL_STATUS`` is
    updated to reflect the outcome of a load attempt.

    Returns:
        True if the model is already loaded or was loaded successfully,
        False if importing or constructing it raised.
    """
    global TTS_MODEL, MODEL_STATUS
    if TTS_MODEL is not None:
        return True
    try:
        # Import and construct under the torch.load patch so any checkpoint
        # loading they trigger runs with weights_only=False.
        with patch_torch_load():
            from TTS.api import TTS

            print("📦 Loading XTTS...")
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=True,
                gpu=(DEVICE == "cuda"),
            )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("✅ XTTS loaded!")
            return True
    except Exception as e:
        # Top-level boundary: report the failure through MODEL_STATUS instead
        # of crashing the request handler.
        print(f"❌ XTTS loading failed: {e}")
        MODEL_STATUS = f"Manual Failed: {str(e)}"
        return False
def load_whisper():
    """Load the Whisper "base" model into ``WHISPER_MODEL`` on first use.

    Returns:
        True if the model is already loaded or was loaded successfully,
        False if importing or loading it raised (the error is only logged).
    """
    global WHISPER_MODEL
    if WHISPER_MODEL is not None:
        return True
    try:
        # Imported lazily so the app can start even when whisper is missing.
        import whisper

        WHISPER_MODEL = whisper.load_model("base")
        print("✅ Whisper loaded!")
        return True
    except Exception as e:
        # Transcription is optional; callers fall back when this returns False.
        print(f"❌ Whisper failed: {e}")
        return False
def _transcribe_or_default(input_audio, default="Voice cloning demonstration."):
    """Return Whisper's transcript of *input_audio*, or *default* if unavailable."""
    if not WHISPER_MODEL:
        return default
    try:
        result = WHISPER_MODEL.transcribe(input_audio)
        text = result.get("text", "").strip()
    except Exception as e:
        # Best-effort: a transcription failure must not abort the cloning request.
        print(f"⚠️ Whisper error: {e}")
        return default
    # Very short transcripts are treated as noise and replaced by the default.
    if text and len(text) > 3:
        print(f"✅ Extracted: '{text[:100]}...'")
        return text
    return default


def _remove_quietly(path):
    """Delete *path*, ignoring filesystem errors (cleanup is best-effort)."""
    try:
        os.remove(path)
    except OSError:
        pass


def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """Re-speak the content of *input_audio* in the voice of *reference_audio*.

    Main voice cloning function, called by both the UI and the API. The input
    audio is transcribed with Whisper (falling back to a fixed demo sentence
    when Whisper is unavailable or fails), then synthesized with XTTS using
    the reference audio as the speaker sample.

    Args:
        reference_audio: File path of the voice to clone.
        input_audio: File path of the audio whose spoken content is reused.
        language: Language code passed through to the TTS model.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        generated WAV file on success and ``None`` on any failure; the status
        message describes the outcome. Exceptions are caught and reported in
        the message rather than raised.
    """
    # Tracks a temp file this call still owns; cleared once handed to the caller.
    pending_output = None
    try:
        print(f"🎭 Voice cloning request: {language}")
        print(f"📁 Reference: {reference_audio}")
        print(f"📁 Input: {input_audio}")

        if not reference_audio or not input_audio:
            return None, "❌ Please upload both reference and input audio files!"

        # Load XTTS model
        if not load_xtts_manual():
            return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"

        # Load Whisper for transcription (optional; failure is tolerated)
        load_whisper()
        extracted_text = _transcribe_or_default(input_audio)

        # Reserve an output path; delete=False because the caller reads the
        # file after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            pending_output = tmp_file.name

        print("🔄 Generating voice clone...")
        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=pending_output,
            )

        # Verify output
        if os.path.exists(pending_output) and os.path.getsize(pending_output) > 0:
            output_size = os.path.getsize(pending_output)
            success_message = (
                "✅ VOICE-TO-VOICE CLONING SUCCESS!\n"
                f"📝 Content: '{extracted_text[:150]}...'\n"
                f"🎭 Device: {DEVICE}\n"
                f"🔧 Status: {MODEL_STATUS}\n"
                f"📊 Output size: {output_size} bytes\n"
            )
            print("✅ Voice cloning completed successfully!")
            # Ownership of the file passes to the caller; skip cleanup below.
            output_path, pending_output = pending_output, None
            return output_path, success_message
        return None, "❌ Generated audio file is empty!"
    except Exception as e:
        error_msg = f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
        print(error_msg)
        return None, error_msg
    finally:
        # On every failure path the temp file would otherwise be leaked.
        if pending_output is not None:
            _remove_quietly(pending_output)
# FIXED: Use gr.Interface instead of gr.Blocks for proper API exposure
# Inputs map positionally onto voice_to_voice_clone(reference_audio,
# input_audio, language); outputs map onto its (audio path, status) tuple.
interface = gr.Interface(
    fn=voice_to_voice_clone,
    inputs=[
        gr.Audio(
            label="🎤 Reference Audio (Voice to Clone)",
            type="filepath",
            sources=["upload"],
        ),
        gr.Audio(
            label="🎵 Input Audio (Content to Transform)",
            type="filepath",
            sources=["upload"],
        ),
        # NOTE(review): the XTTS-v2 model documents a smaller set of supported
        # languages than is offered here; unsupported codes presumably fail at
        # synthesis time and surface as an error status -- confirm and trim.
        gr.Dropdown(
            choices=[
                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
                "cs", "ar", "zh", "ja", "ko", "hi", "uk", "vi", "ro", "el",
                "he", "fi", "hu", "sv", "ca", "id", "ms", "bg", "sk", "da",
                "no", "lt", "hr", "sr", "sl", "et", "lv", "fil", "bn", "ta",
                "te", "ur", "fa", "th",
            ],
            value="en",
            label="🌍 Language",
        ),
    ],
    outputs=[
        gr.Audio(label="🎉 Cloned Voice Result"),
        gr.Textbox(label="📋 Status", lines=8),
    ],
    title="🎭 REAL Voice Cloning Studio",
    description="Transform any voice into any other voice using XTTS-v2 and Whisper AI models. Upload reference audio and input audio to get started.",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    api_name="voice_to_voice_clone",  # CRITICAL: This creates the API endpoint
)
if __name__ == "__main__":
    print("🌐 Launching Voice Cloning Studio...")
    interface.launch(
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=7860,
        share=False,
        show_api=True,  # Shows API documentation
        debug=True,
    )