# voiceclone-dev / app.py
# crackuser's picture
# Update app.py
# 75fb8ef verified
# raw
# history blame
# 4.84 kB
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
# Process-wide setup, executed once at import time.
# Install a blanket "ignore" filter so no Python warnings are shown.
warnings.filterwarnings(action="ignore")
# Must be in the environment before the TTS package is imported further down.
# NOTE(review): presumably pre-accepts the Coqui terms-of-service prompt - confirm.
os.environ.update({"COQUI_TOS_AGREED": "1"})
# Startup banner written to stdout.
print("πŸš€ Starting Voice Cloning Studio...")
@contextmanager
def patch_torch_load():
    """Temporarily replace ``torch.load`` with a wrapper forcing ``weights_only=False``.

    While the ``with`` body runs, every call to ``torch.load`` (from any
    module, since the attribute on the ``torch`` module itself is swapped) is
    forwarded to the previous ``torch.load`` with ``weights_only`` set to
    ``False``, overriding whatever the caller passed. The previous function is
    put back when the block exits, including when it exits with an exception.

    SECURITY NOTE: ``weights_only=False`` makes ``torch.load`` use full pickle
    deserialisation, which can execute arbitrary code embedded in a
    checkpoint. Only use this around checkpoints from a trusted source.

    NOTE(review): the swap is process-wide and not synchronised; overlapping
    use from concurrent threads can restore the wrong function.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Deliberately overrides an explicit weights_only=True from the caller.
        kwargs["weights_only"] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Always restore, even if the body raised.
        torch.load = original_load
# Shared runtime state read and updated by the loader functions and the
# request handler defined below.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Model handles stay None until the corresponding loader succeeds.
TTS_MODEL = WHISPER_MODEL = None
# Human-readable XTTS load state, echoed in the status messages.
MODEL_STATUS = "Not Loaded"
def load_xtts_manual():
    """Load the XTTS-v2 model into ``TTS_MODEL`` on first use.

    Subsequent calls return immediately once ``TTS_MODEL`` is set. The import
    of ``TTS.api`` and the model construction both run inside
    ``patch_torch_load()`` so that checkpoint loading happens with
    ``weights_only=False``.

    Returns:
        ``True`` if the model is available (already loaded or loaded now),
        ``False`` if importing or constructing it raised. On failure the
        error text is stored in ``MODEL_STATUS`` and ``TTS_MODEL`` stays
        ``None``; the exception is not propagated.
    """
    global TTS_MODEL, MODEL_STATUS

    if TTS_MODEL is not None:
        return True

    try:
        with patch_torch_load():
            # Imported lazily so the app can start without loading TTS.
            from TTS.api import TTS

            print("πŸ“¦ Loading XTTS...")
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=True,
                gpu=(DEVICE == "cuda"),
            )
        MODEL_STATUS = "XTTS-v2 Ready"
        print("βœ… XTTS loaded!")
        return True
    except Exception as e:
        # Best-effort loader: report the failure instead of raising so the
        # caller can show it in the UI.
        print(f"❌ XTTS loading failed: {e}")
        MODEL_STATUS = f"Manual Failed: {str(e)}"
        return False
def load_whisper():
    """Load the Whisper ``"base"`` model into ``WHISPER_MODEL`` on first use.

    Subsequent calls return immediately once ``WHISPER_MODEL`` is set.

    Returns:
        ``True`` if the model is available (already loaded or loaded now),
        ``False`` if importing ``whisper`` or loading the model raised. The
        exception is printed, not propagated, and ``WHISPER_MODEL`` stays
        ``None``.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is not None:
        return True

    try:
        # Imported lazily so a missing whisper package only disables
        # transcription instead of breaking the whole app at import time.
        import whisper

        WHISPER_MODEL = whisper.load_model("base")
        print("βœ… Whisper loaded!")
        return True
    except Exception as e:
        print(f"❌ Whisper failed: {e}")
        return False
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """Re-speak the content of ``input_audio`` in the voice of ``reference_audio``.

    The input recording is transcribed with Whisper when that model could be
    loaded; the resulting text is then synthesised by the XTTS model with
    ``reference_audio`` as the speaker sample. If transcription is
    unavailable, fails, or yields three characters or fewer, a fixed
    placeholder sentence is synthesised instead.

    Args:
        reference_audio: Path of the recording whose voice should be cloned.
        input_audio: Path of the recording whose spoken content is reused.
        language: Language code forwarded to the XTTS synthesis call.

    Returns:
        A ``(output_path, status)`` tuple. ``output_path`` is the path of the
        generated ``.wav`` file on success and ``None`` on any failure;
        ``status`` is a human-readable message. Exceptions derived from
        ``Exception`` are reported through ``status`` instead of being raised.
    """
    # Fallback text used whenever no usable transcription is obtained.
    extracted_text = "Voice cloning demonstration."
    output_path = None
    keep_output = False
    try:
        if not reference_audio or not input_audio:
            return None, "❌ Please upload both reference and input audio files!"
        if not load_xtts_manual():
            return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"

        # Whisper is optional: the return value is ignored on purpose and the
        # global handle is checked instead.
        load_whisper()
        if WHISPER_MODEL:
            try:
                result = WHISPER_MODEL.transcribe(input_audio)
                text = result.get("text", "").strip()
                if len(text) > 3:
                    extracted_text = text
                    print(f"βœ… Extracted: '{extracted_text[:100]}...'")
            except Exception as e:
                # Best-effort: a transcription failure falls back to the
                # placeholder text rather than aborting the request.
                print(f"⚠️ Whisper error: {e}")

        # Reserve a unique output path; the file has to outlive this function
        # on success, hence delete=False.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path,
            )

        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            status = (
                "βœ… VOICE-TO-VOICE CLONING SUCCESS!\n"
                f"πŸ“ Content: '{extracted_text[:150]}...'\n"
                f"🎭 Device: {DEVICE}\n"
                f"πŸ”§ Status: {MODEL_STATUS}\n"
            )
            keep_output = True
            return output_path, status
        return None, "❌ Generated audio file is empty!"
    except Exception as e:
        return None, f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
    finally:
        # The temp file is only handed to the caller on success; on every
        # failure path remove it so failed requests do not pile up files.
        if output_path is not None and not keep_output:
            try:
                os.remove(output_path)
            except OSError:
                # Already gone or not removable; nothing more to do here.
                pass
# Gradio Interface
# One row with two columns: the left column collects the two recordings, the
# language and the trigger button; the right column shows the generated audio
# and the status text returned by voice_to_voice_clone.
with gr.Blocks(title="Voice Cloning Studio") as demo:
    # Page header (string content kept exactly as in the original).
    gr.HTML(
        "\n"
        '<div style="text-align: center; padding: 25px;">\n'
        "<h1>🎭 REAL Voice Cloning Studio</h1>\n"
        "<p>Status: Models load on first use</p>\n"
        "</div>\n"
    )
    with gr.Row():
        with gr.Column():
            # type="filepath" so the handler receives paths, which is what it
            # forwards to the transcription and synthesis calls.
            reference_audio = gr.Audio(
                label="🎀 Reference Audio (Voice to Clone)",
                type="filepath",
                sources=["upload", "microphone"],
            )
            input_audio = gr.Audio(
                label="🎡 Input Audio (Content to Transform)",
                type="filepath",
                sources=["upload", "microphone"],
            )
            # (display name, value) pairs; the value is passed as `language`.
            language = gr.Dropdown(
                choices=[
                    ("English", "en"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                ],
                value="en",
                label="Language",
            )
            clone_btn = gr.Button("Clone Voice", variant="primary", size="lg")
        with gr.Column():
            output_audio = gr.Audio(label="Cloned Voice Result")
            status_output = gr.Textbox(
                label="Status",
                lines=12,
                interactive=False,
            )
    # Wire the button: inputs map positionally onto the handler's parameters,
    # and its (path, status) return value maps onto the two outputs.
    clone_btn.click(
        fn=voice_to_voice_clone,
        inputs=[reference_audio, input_audio, language],
        outputs=[output_audio, status_output],
        show_progress=True,
    )
# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()