# voiceclone-dev / app.py
# Source: Hugging Face Space "voiceclone-dev" by crackuser,
# commit 962aa9c ("Update app.py"), file size 10.3 kB.
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
warnings.filterwarnings("ignore")
# CRITICAL: Coqui Terms of Service
os.environ["COQUI_TOS_AGREED"] = "1"
print("๐Ÿš€ Starting Voice-to-Voice Cloning Studio...")
# PyTorch 2.6 Compatibility Fix
@contextmanager
def patch_torch_load():
    """Temporarily make ``torch.load`` default to ``weights_only=False``.

    PyTorch 2.6 flipped the default of ``torch.load(..., weights_only=...)``
    to True, which breaks loading XTTS checkpoints that contain pickled
    config objects.  Inside this context, calls that do not pass
    ``weights_only`` fall back to the legacy False behaviour.

    Yields:
        None. The original ``torch.load`` is restored on exit, even if the
        body raises.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Fix: only *default* the flag — the previous version overwrote an
        # explicit weights_only=True passed by the caller.
        kwargs.setdefault('weights_only', False)
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Always restore so the monkey-patch cannot leak past the context.
        torch.load = original_load
# Pick the compute device once at import time: prefer CUDA when available.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"🚀 Using device: {DEVICE}")

# Lazily-populated global model handles; load_voice_cloning_models() fills
# these in on first use and records a human-readable state in MODEL_STATUS.
TTS_MODEL = None
WHISPER_MODEL = None
MODEL_STATUS = "Not Loaded"
def load_voice_cloning_models():
    """Lazily load the XTTS (voice cloning) and Whisper (ASR) models.

    Populates the module-level TTS_MODEL / WHISPER_MODEL singletons and
    keeps MODEL_STATUS in sync with the outcome.  Safe to call repeatedly:
    already-loaded models are reused.

    Returns:
        bool: True when both models are ready, False if either failed.
    """
    global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS

    if TTS_MODEL is not None and WHISPER_MODEL is not None:
        return True

    print("🔄 Loading voice cloning models...")

    # Load XTTS for voice cloning
    if TTS_MODEL is None:
        try:
            with patch_torch_load():
                # Imported lazily: COQUI_TOS_AGREED must be set first, and
                # the import itself is expensive.
                from TTS.api import TTS
                print("📦 Loading XTTS for voice cloning...")
                TTS_MODEL = TTS(
                    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                    progress_bar=True,
                    gpu=(DEVICE == "cuda")
                )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("✅ XTTS voice cloning model loaded!")
        except Exception as e:
            print(f"❌ XTTS loading failed: {e}")
            MODEL_STATUS = f"XTTS Failed: {str(e)}"
            return False

    # Load Whisper for speech-to-text
    if WHISPER_MODEL is None:
        try:
            import whisper
            print("📦 Loading Whisper for speech recognition...")
            WHISPER_MODEL = whisper.load_model("base")
            print("✅ Whisper loaded!")
        except Exception as e:
            print(f"❌ Whisper loading failed: {e}")
            # Fix: record the failure (the XTTS branch does); previously a
            # Whisper failure left a stale "XTTS-v2 Ready" status behind.
            MODEL_STATUS = f"Whisper Failed: {str(e)}"
            return False

    return True
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """REAL Voice-to-Voice Cloning: speak INPUT content in the REFERENCE voice.

    Pipeline:
      1. Transcribe ``input_audio`` with Whisper to recover its spoken text.
      2. Synthesize that text with XTTS, conditioned on ``reference_audio``.

    Args:
        reference_audio: Filepath of the voice to clone (6+ seconds ideal).
        input_audio: Filepath of the audio whose spoken content is reused.
        language: XTTS language code for synthesis (default "en").

    Returns:
        tuple: (path to generated WAV or None on failure,
        human-readable status message for the UI).
    """
    output_path = None
    try:
        # Input validation — fail fast with actionable messages.
        if not reference_audio:
            return None, "❌ Please upload REFERENCE AUDIO (voice to clone)!"
        if not input_audio:
            return None, "❌ Please upload INPUT AUDIO (content to transform)!"

        print("🎤 Starting Voice-to-Voice Cloning Process...")

        # Models are loaded lazily; surface the recorded status on failure.
        if not load_voice_cloning_models():
            return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}\n\nTry restarting the space."

        # STEP 1: Extract text from input audio using Whisper.
        print("📝 Step 1: Extracting text from input audio...")
        fallback_text = "Voice cloning demonstration using the uploaded audio content."
        try:
            result = WHISPER_MODEL.transcribe(input_audio)
            extracted_text = result.get("text", "").strip()
            # Very short/empty transcripts are unusable for TTS; use a stub.
            if not extracted_text or len(extracted_text) < 3:
                extracted_text = fallback_text
            print(f"✅ Extracted text: '{extracted_text[:100]}...'")
        except Exception as e:
            # Best-effort: a failed transcription degrades to the stub text
            # rather than aborting the whole cloning run.
            print(f"⚠️ Whisper extraction failed: {e}")
            extracted_text = fallback_text

        # STEP 2: Generate new audio using reference voice + extracted text.
        print("🎭 Step 2: Generating speech with reference voice...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        # Use XTTS for voice cloning (torch.load patched for 2.6 compat).
        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path
            )

        # Verify output before handing the path to the UI.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"✅ VOICE-TO-VOICE CLONING SUCCESS!\n\n🎤 **Process Completed:**\n• Extracted content: '{extracted_text[:150]}...'\n• Applied reference voice characteristics\n• Generated NEW audio with cloned voice\n\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🎭 This is REAL voice cloning - same content, different voice!"

        _remove_quietly(output_path)  # fix: don't leak the empty temp file
        return None, "❌ Generated audio file is empty!"
    except Exception as e:
        _remove_quietly(output_path)  # fix: clean up partial output on error
        return None, f"❌ Voice-to-Voice Cloning Error: {str(e)}\n\nModel Status: {MODEL_STATUS}"


def _remove_quietly(path):
    """Delete *path* if set and present, ignoring filesystem errors."""
    if path:
        try:
            os.remove(path)
        except OSError:
            pass
# Warm up the models at import time so the first request is fast; any
# failure here must never crash the app — we fall back to lazy loading.
print("🔄 Initializing voice cloning models...")
try:
    startup_success = load_voice_cloning_models()
except Exception as e:
    startup_success = False
    startup_msg = f"⚠️ Startup issue: {str(e)}"
    startup_color = "#f8d7da"
else:
    if startup_success:
        startup_msg = f"✅ {MODEL_STATUS} - Voice Cloning Ready!"
        startup_color = "#d4edda"
    else:
        startup_msg = f"⚠️ Models will load on first use - {MODEL_STATUS}"
        startup_color = "#fff3cd"
print(f"Startup status: {startup_msg}")
# Create Gradio Interface
with gr.Blocks(
    title="🎭 Voice-to-Voice Cloning Studio",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
) as demo:
    # --- Static page header ----------------------------------------------
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1 style="color: #2E86AB;">🎭 Voice-to-Voice Cloning Studio</h1>
        <p style="color: #666; font-size: 18px;">REAL Voice-to-Voice Cloning - Transform Any Voice!</p>
        <p style="color: #888; font-size: 14px;">Extract content from input audio → Generate with reference voice</p>
    </div>
    """)

    # --- Status banner, colored by the startup result computed above -----
    gr.HTML(f"""
    <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
        <strong>🤖 System Status:</strong> {startup_msg}
    </div>
    """)

    # --- "How it works" explainer ----------------------------------------
    gr.HTML("""
    <div style="padding: 20px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
        <h4 style="color: #1e40af; margin-bottom: 15px;">🎤 How Voice-to-Voice Cloning Works:</h4>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
            <div>
                <h5>📥 Inputs Required:</h5>
                <ul style="margin: 5px 0; padding-left: 20px;">
                    <li><strong>Reference Audio:</strong> Voice to clone (6+ seconds)</li>
                    <li><strong>Input Audio:</strong> Content to transform</li>
                </ul>
            </div>
            <div>
                <h5>⚙️ Process:</h5>
                <ul style="margin: 5px 0; padding-left: 20px;">
                    <li>Extract text from input audio</li>
                    <li>Generate new speech with reference voice</li>
                </ul>
            </div>
        </div>
        <h5>🎯 Result: Same content, different voice (REAL voice cloning!)</h5>
    </div>
    """)

    # --- Main interface: inputs on the left, results on the right --------
    with gr.Row():
        with gr.Column():
            # Voice to clone; type="filepath" so the handler gets a path.
            reference_audio = gr.Audio(
                label="🎤 Reference Audio (Voice to Clone)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            # Spoken content that will be re-voiced in the reference voice.
            input_audio = gr.Audio(
                label="🎵 Input Audio (Content to Transform)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            # (label, value) pairs — the value is the XTTS language code
            # passed straight through to voice_to_voice_clone().
            language = gr.Dropdown(
                choices=[
                    ("🇺🇸 English", "en"),
                    ("🇪🇸 Spanish", "es"),
                    ("🇫🇷 French", "fr"),
                    ("🇩🇪 German", "de"),
                    ("🇮🇹 Italian", "it"),
                    ("🇧🇷 Portuguese", "pt"),
                    ("🇨🇳 Chinese", "zh"),
                    ("🇯🇵 Japanese", "ja")
                ],
                value="en",
                label="Language"
            )
            clone_btn = gr.Button(
                "🎭 Clone Voice (Voice-to-Voice)",
                variant="primary",
                size="lg"
            )
        with gr.Column():
            output_audio = gr.Audio(label="🎉 Cloned Voice Result")
            status_output = gr.Textbox(
                label="Processing Status & Details",
                lines=12,
                interactive=False
            )

    # --- Collapsible usage examples --------------------------------------
    with gr.Accordion("💡 Example Usage", open=False):
        gr.Markdown("""
        ### 🎯 Perfect Use Cases:
        - **Voice Acting**: Transform your voice to sound like someone else
        - **Content Creation**: Make podcasts in different voices
        - **Language Learning**: Hear text in your target accent
        - **Accessibility**: Convert speech to preferred voice characteristics
        ### 📋 Step-by-Step:
        1. **Upload Reference Audio**: 6+ seconds of the voice you want to clone
        2. **Upload Input Audio**: Speech content you want to transform
        3. **Select Language**: Choose the language of the content
        4. **Click Clone Voice**: Wait for processing (30-60 seconds)
        5. **Download Result**: New audio with same content, different voice!
        ### 🔍 Example:
        - **Reference**: Morgan Freeman speaking
        - **Input**: Your voice saying "Hello world"
        - **Result**: "Hello world" in Morgan Freeman's voice style
        """)

    # Event handler: run the full clone pipeline on button click; results
    # land in the audio player and the status textbox.
    clone_btn.click(
        fn=voice_to_voice_clone,
        inputs=[reference_audio, input_audio, language],
        outputs=[output_audio, status_output],
        show_progress=True
    )
if __name__ == "__main__":
    # Script entry point (e.g. Hugging Face Space): start the Gradio server.
    demo.launch()