Spaces:

crackuser
/

voiceclone-dev

Sleeping

File size: 9,805 Bytes

f758d08
2c8d218
9a34a5d
19173b4
 
b3986a9
962aa9c
3ad5343
5f03eaa
 
bba9fab
b3986a9
27e1662
ba703e9
962aa9c
 
75fb8ef
962aa9c
75fb8ef
962aa9c
 
75fb8ef
962aa9c
 
 
 
b44fd2c
5f03eaa
825c475
3ad5343
 
5f03eaa
4857e6a
6465ea7
75fb8ef
95bd2d0
3ad5343
75fb8ef
 
7d67cb5
75fb8ef
 
 
5f03eaa
75fb8ef
 
3e9e2ab
75fb8ef
 
5f03eaa
 
75fb8ef
 
 
5f03eaa
75fb8ef
1879a3e
3ad5343
75fb8ef
 
 
1879a3e
75fb8ef
3ad5343
5f03eaa
75fb8ef
1879a3e
75fb8ef
 
7d67cb5
ba703e9
5280410
5f03eaa
 
 
 
3e9e2ab
3ad5343
 
 
5f03eaa
3ad5343
5f03eaa
3ad5343
5f03eaa
3ad5343
 
 
 
 
 
5f03eaa
ba703e9
3ad5343
5f03eaa
 
 
ba703e9
5f03eaa
 
 
 
 
 
 
 
 
 
 
 
 
ba703e9
5f03eaa
 
 
 
 
ba703e9
5f03eaa
 
 
 
 
 
ba703e9
 
5f03eaa
 
 
 
ba703e9
5f03eaa
 
71d678c
5f03eaa
ba703e9
5f03eaa
 
ba703e9
5f03eaa
 
 
ba703e9
5f03eaa
ba703e9
5f03eaa
 
ba703e9
71d678c
3ad5343
 
ba703e9
5f03eaa
3ad5343
71d678c
5f03eaa
 
 
ba703e9
71d678c
5f03eaa
 
1879a3e
 
5f03eaa
3ad5343
 
3e9e2ab
 
3ad5343
 
5f03eaa
 
ba703e9
5f03eaa
1879a3e
5f03eaa
 
 
 
71d678c
962aa9c
 
71d678c
5f03eaa
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad5343
ba703e9
3ad5343
 
 
 
ba703e9
3ad5343
5f03eaa
 
 
 
ba703e9
 
 
 
5f03eaa
 
ba703e9
 
5f03eaa
3ad5343
af41746
ba703e9
71d678c
e6e0279
5f03eaa
71d678c
5f03eaa
 
71d678c
962aa9c
ba703e9
71d678c
ba703e9
71d678c
 
5f03eaa
ba703e9
71d678c
 
5f03eaa
ba703e9
71d678c
 
3ad5343
71d678c
 
 
 
 
5f03eaa
 
ba703e9
5f03eaa
 
 
ba703e9
5f03eaa
71d678c
ba703e9
 
71d678c
 
ba703e9
71d678c
a1bb412
75fb8ef
ba703e9
3e9e2ab
ba703e9
3ad5343
ba703e9
5f03eaa
 
3ad5343
71d678c
 
 
3ad5343
ba703e9
71d678c

import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
import gc
import librosa
import soundfile as sf

# Process-wide setup that runs once at import time.
# Suppress every warning so library chatter stays out of the console log.
warnings.filterwarnings(action="ignore")
# NOTE(review): presumably read by Coqui TTS to skip its interactive
# terms-of-service prompt — confirm against the TTS package.
os.environ.update(COQUI_TOS_AGREED="1")
print("🚀 Starting FINAL CORRECTED Voice Cloning Studio...")

@contextmanager
def patch_torch_load():
    """Temporarily swap ``torch.load`` for a wrapper that forces ``weights_only=False``.

    While the ``with`` body runs, every call to ``torch.load`` is forwarded to
    the real function with ``weights_only`` set to ``False``, overriding any
    value the caller supplied. The real ``torch.load`` is put back when the
    body finishes, whether it returned normally or raised.

    NOTE(review): per the PyTorch documentation, ``weights_only=False`` allows
    full pickle deserialization — only use this around trusted checkpoints.
    NOTE(review): this rebinds a module attribute, so the override is visible
    to any other code calling ``torch.load`` while the context is active.
    """
    real_load = torch.load

    def _load_with_full_pickle(f, *args, **kwargs):
        # The forced flag is listed last so it wins over a caller-provided one.
        forced_kwargs = {**kwargs, 'weights_only': False}
        return real_load(f, *args, **forced_kwargs)

    torch.load = _load_with_full_pickle
    try:
        yield
    finally:
        # Always restore the genuine loader, even on error.
        torch.load = real_load

# Hardware setup: use the GPU when torch reports CUDA, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"🔥 Device: {DEVICE}")

# Global model state. Both handles start empty and are filled in by the
# load_* functions; MODEL_STATUS is the human-readable XTTS load state.
TTS_MODEL, WHISPER_MODEL = None, None
MODEL_STATUS = "Not Loaded"

def load_xtts_optimized():
    """Load the XTTS-v2 model into the ``TTS_MODEL`` global on first use.

    Later calls are no-ops once a model is cached. ``MODEL_STATUS`` is updated
    to reflect success or the failure message.

    Returns:
        True if a model is available (cached or freshly loaded), False if
        loading raised any exception.
    """
    global TTS_MODEL, MODEL_STATUS

    # Fast path: a previous call already populated the cache.
    if TTS_MODEL is not None:
        return True

    try:
        # The import and construction both run with torch.load patched to
        # force weights_only=False for the duration of the load.
        with patch_torch_load():
            from TTS.api import TTS
            print("📦 Loading XTTS...")
            wants_gpu = DEVICE == "cuda"
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
                gpu=wants_gpu,
            )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("✅ XTTS loaded successfully!")
    except Exception as load_error:
        print(f"❌ XTTS loading failed: {load_error}")
        MODEL_STATUS = f"XTTS Failed: {load_error}"
        return False

    return True

def load_whisper_optimized():
    """Load the Whisper "base" model into the ``WHISPER_MODEL`` global on first use.

    Returns:
        True if a model is available (cached or freshly loaded), False if the
        import or the load raised any exception.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            # Imported lazily so the module is only touched when a load is
            # actually attempted.
            import whisper
            WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
            print("✅ Whisper loaded!")
        except Exception as load_error:
            print(f"❌ Whisper failed: {load_error}")
            return False

    return True

def optimize_audio_input(audio_path, max_duration=25, target_sr=22050):
    """Resample and trim an audio file, writing the result next to the source.

    The file is loaded through librosa at ``target_sr``, cut to at most
    ``max_duration`` seconds, and saved with soundfile as
    ``<original stem>_opt.wav``.

    The output name is derived with ``os.path.splitext`` so that it always
    differs from the input path, whatever the input extension is. Building it
    by replacing ``.wav``/``.mp3`` substrings would leave other extensions
    (``.flac``, ``.WAV``, ...) unchanged and overwrite the source, and would
    also rewrite matching text inside directory names.

    Args:
        audio_path: Path of the audio file to process.
        max_duration: Maximum length of the output, in seconds.
        target_sr: Sample rate passed to ``librosa.load`` and used for the
            written file.

    Returns:
        The path of the optimized file on success. On any failure (missing
        file, decode error, write error) the original ``audio_path`` is
        returned unchanged so callers can fall back to the raw input.
    """
    try:
        if not os.path.exists(audio_path):
            print(f"⚠️ Audio file not found: {audio_path}")
            return audio_path

        audio, sr = librosa.load(audio_path, sr=target_sr)
        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"🔄 Audio trimmed to {max_duration}s")

        # Only the final extension is swapped; the stem (including any dots in
        # parent directories) is kept verbatim.
        stem, _extension = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"
        sf.write(optimized_path, audio, sr)
        print(f"✅ Audio optimized: {optimized_path}")
        return optimized_path

    except Exception as e:
        # Deliberate best-effort: optimization is optional, so report the
        # problem and hand back the untouched input.
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path

def safe_file_path(file_input, input_name="audio"):
    """Extract an existing filesystem path from the various upload formats.

    Accepted inputs:
      * ``None`` -> ``None``.
      * A ``str`` or ``os.PathLike`` (e.g. ``pathlib.Path``) -> that path as a
        string, if it exists.
      * An object with a ``name`` attribute (e.g. an open file) -> ``name``,
        if it is a path that exists.
      * A dict-like object with ``get`` -> the first of its ``'name'`` /
        ``'path'`` entries that exists on disk.

    Args:
        file_input: The raw value received for an audio input.
        input_name: Label used only in diagnostic messages.

    Returns:
        The resolved path as ``str``, or ``None`` when no existing path could
        be determined (a diagnostic is printed in that case).
    """
    try:
        if file_input is None:
            return None

        # Path-like values are handled before the attribute probing below:
        # pathlib.Path has a ``name`` attribute too, but it is only the
        # basename and must not be mistaken for the full path.
        if isinstance(file_input, (str, os.PathLike)):
            file_path = os.fsdecode(file_input)
            if os.path.exists(file_path):
                return file_path
            print(f"⚠️ File path doesn't exist: {file_input}")
            return None

        # Gather every place a path may be stored, in priority order.
        candidates = []
        if hasattr(file_input, 'name'):
            candidates.append(file_input.name)
        if hasattr(file_input, 'get'):
            candidates.append(file_input.get('name'))
            candidates.append(file_input.get('path'))

        for candidate in candidates:
            # Skip non-path values (None, ints such as raw file descriptors)
            # instead of returning them as if they were paths.
            if not isinstance(candidate, (str, os.PathLike)):
                continue
            candidate_path = os.fsdecode(candidate)
            if candidate_path and os.path.exists(candidate_path):
                return candidate_path

        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
        return None

    except Exception as e:
        # Best-effort helper: never let a malformed upload object crash the
        # request; report it and signal "no path".
        print(f"❌ Error processing {input_name}: {e}")
        return None

# The UI dropdown and Whisper use the bare code "zh", while XTTS-v2 lists
# Mandarin as "zh-cn"; translate before calling the TTS model.
_XTTS_LANGUAGE_ALIASES = {"zh": "zh-cn"}


def _remove_quietly(path):
    """Best-effort delete of a temporary file; a missing file is not an error."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as cleanup_error:
        print(f"⚠️ Could not remove temporary file {path}: {cleanup_error}")


def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
    """Re-speak the content of ``input_audio`` in the voice of ``reference_audio``.

    Pipeline: resolve both uploads to paths, validate them, load XTTS (and
    Whisper if possible), trim/resample both clips, transcribe the input clip
    with Whisper, then synthesize the transcript with XTTS using the reference
    clip as the speaker sample.

    Args:
        reference_audio: Upload holding the voice to imitate (any format
            ``safe_file_path`` understands).
        input_audio: Upload holding the speech whose words are reused.
        language: Language code for transcription and synthesis. ``"auto"``
            lets Whisper detect the language; ``None``/empty falls back to
            ``"en"``.

    Returns:
        ``(output_path, message)`` on success, where ``output_path`` is a
        temporary ``.wav`` file the caller owns; ``(None, message)`` on any
        failure. No exception escapes this function.
    """
    output_path = None       # temp file that receives the synthesized audio
    intermediates = []       # *_opt.wav files created by optimize_audio_input
    keep_output = False      # set only when output_path is handed to the caller
    inference_started = False

    try:
        # A cleared dropdown can deliver None; treat it as the default.
        language = language or "en"

        print(f"🎭 Voice cloning request: {language}")
        print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")

        # Extract file paths safely
        reference_path = safe_file_path(reference_audio, "reference")
        input_path = safe_file_path(input_audio, "input")

        if not reference_path:
            return None, "❌ Could not process reference audio file."

        if not input_path:
            return None, "❌ Could not process input audio file."

        print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")

        # Validate files: must exist and be larger than a trivially small stub
        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
            return None, "❌ Reference audio file is invalid."

        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
            return None, "❌ Input audio file is invalid."

        # Load models. XTTS is mandatory; Whisper is optional (a fixed demo
        # sentence is used when it is unavailable).
        if not load_xtts_optimized():
            return None, f"❌ XTTS model failed: {MODEL_STATUS}"

        load_whisper_optimized()

        # Optimize audio files
        print("🔄 Optimizing audio files...")
        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
        input_optimized = optimize_audio_input(input_path, max_duration=25)

        # optimize_audio_input returns the source path itself when it could
        # not (or did not need to) write a new file; only files that differ
        # from the upload are ours to delete afterwards.
        intermediates = [
            derived
            for derived, source in (
                (ref_optimized, reference_path),
                (input_optimized, input_path),
            )
            if derived != source
        ]

        inference_started = True

        # Transcribe input audio
        extracted_text = "This is a voice cloning demonstration."
        tts_language = language
        if WHISPER_MODEL:
            try:
                print("🎤 Transcribing audio...")
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        language=language if language != 'auto' else None
                    )
                text = result.get("text", "").strip()
                if text and len(text) > 5:
                    extracted_text = text[:400]
                    if language == 'auto':
                        # NOTE(review): relies on Whisper reporting the
                        # detected language under the "language" key.
                        tts_language = result.get("language") or tts_language
                print(f"✅ Transcribed: '{extracted_text[:50]}...'")
            except Exception as e:
                print(f"⚠️ Transcription warning: {e}")

        # "auto" is not a language XTTS can synthesize. If detection produced
        # nothing usable, the text is the English fallback sentence.
        if tts_language == 'auto':
            tts_language = "en"
        tts_language = _XTTS_LANGUAGE_ALIASES.get(tts_language, tts_language)

        # Generate cloned voice
        print("🚀 Generating cloned voice...")

        # Reserve a named temp file; it must outlive this function on success
        # because the path is returned to the caller.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        try:
            with patch_torch_load(), torch.no_grad():
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=tts_language,
                    file_path=output_path,
                    temperature=0.7,
                    length_penalty=1.0,
                    repetition_penalty=5.0
                )
        except Exception as tts_error:
            print(f"❌ TTS generation error: {tts_error}")
            return None, f"❌ Voice generation failed: {str(tts_error)}"

        # Validate and return output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            file_size_kb = os.path.getsize(output_path) / 1024

            success_message = f"""✅ VOICE CLONING SUCCESS! 🎉

📝 Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
🎭 Device: {DEVICE} | Model: {MODEL_STATUS}
📊 Output: {file_size_kb:.1f} KB | Language: {language.upper()}
🔧 Optimizations Applied Successfully"""

            print("✅ Voice cloning completed successfully!")

            # The output is returned as a file path; mark it so the cleanup
            # below leaves it in place.
            keep_output = True
            return output_path, success_message

        return None, "❌ Voice cloning failed - output file is empty."

    except Exception as e:
        error_msg = f"❌ Voice cloning error: {str(e)}"
        print(error_msg)
        import traceback
        print("Full traceback:", traceback.format_exc())
        return None, error_msg

    finally:
        # Runs on every exit path: drop intermediates, drop the output file
        # unless it was returned, and release model scratch memory.
        for leftover in intermediates:
            _remove_quietly(leftover)
        if not keep_output:
            _remove_quietly(output_path)
        if inference_started:
            try:
                if DEVICE == "cuda":
                    torch.cuda.empty_cache()
                gc.collect()
            except Exception as cleanup_error:
                print(f"⚠️ Memory cleanup warning: {cleanup_error}")

# CRITICAL: Use gr.Interface (not Blocks) for better API compatibility.
# The components are built first, in the same order as they appear in the UI,
# and then handed to the Interface constructor.

# Inputs: two audio uploads plus the language selector. Both audio components
# use type="filepath" (CRITICAL for API compatibility).
_interface_inputs = [
    gr.Audio(label="🎤 Reference Audio (Voice to Clone)", type="filepath"),
    gr.Audio(label="🎵 Input Audio (Content to Transform)", type="filepath"),
    gr.Dropdown(
        choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
        value="en",
        label="🌍 Language",
    ),
]

# Outputs: the synthesized clip (type="filepath", CRITICAL for proper return)
# and a multi-line status report.
_interface_outputs = [
    gr.Audio(label="🎉 Cloned Voice Result", type="filepath"),
    gr.Textbox(label="📋 Processing Status", lines=8),
]

interface = gr.Interface(
    fn=voice_to_voice_clone_final,
    inputs=_interface_inputs,
    outputs=_interface_outputs,
    title="🎭 AI Voice Cloning Studio - FINAL",
    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    # CRITICAL: API endpoint name
    api_name="voice_to_voice_clone",
)

if __name__ == "__main__":
    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")

    # Queue configuration: a short queue (reduced for stability), an open
    # API, and a default concurrency limit of one.
    queue_options = dict(
        max_size=2,
        api_open=True,
        default_concurrency_limit=1,
    )

    # Server configuration: listen on all interfaces at port 7860, no share
    # link, API page shown, debug disabled for production.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False,
    )

    interface.queue(**queue_options).launch(**launch_options)