"""Gradio app that re-speaks an input recording in a reference speaker's voice.

Pipeline: Whisper transcribes the input recording, then XTTS-v2 synthesizes
that transcript using the reference recording as the speaker sample.
"""
import gc
import os
import tempfile
import traceback
import warnings
from contextlib import contextmanager

import gradio as gr
import librosa
import soundfile as sf
import torch
import torchaudio  # noqa: F401  (unused in this file; kept from the original imports)

warnings.filterwarnings("ignore")
os.environ["COQUI_TOS_AGREED"] = "1"

print("🚀 Starting FINAL CORRECTED Voice Cloning Studio...")


@contextmanager
def patch_torch_load():
    """Temporarily make every ``torch.load`` call run with ``weights_only=False``.

    ``torch.load`` is replaced by a wrapper for the duration of the ``with``
    block and restored afterwards, even if the body raises.

    SECURITY NOTE: ``weights_only=False`` allows full pickle deserialization,
    which can execute arbitrary code embedded in a checkpoint. Only use this
    around loads of checkpoints from a trusted source.

    The patch mutates the global ``torch.load`` attribute, so it affects any
    code that calls ``torch.load`` while the context is active.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Override whatever the caller asked for.
        kwargs["weights_only"] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        torch.load = original_load


# Hardware setup
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔥 Device: {DEVICE}")

# Global model variables (populated lazily on the first request)
TTS_MODEL = None
WHISPER_MODEL = None
MODEL_STATUS = "Not Loaded"


def load_xtts_optimized():
    """Load the XTTS-v2 model into ``TTS_MODEL`` if it is not loaded yet.

    Updates ``MODEL_STATUS`` with either a ready marker or the failure reason.

    Returns:
        True if the model is available (already loaded or loaded now),
        False if loading raised.
    """
    global TTS_MODEL, MODEL_STATUS

    if TTS_MODEL is not None:
        return True

    try:
        with patch_torch_load():
            # Imported lazily so the app can start before the model is needed.
            from TTS.api import TTS

            print("📦 Loading XTTS...")
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=False,
                gpu=(DEVICE == "cuda"),
            )
        MODEL_STATUS = "XTTS-v2 Ready"
        print("✅ XTTS loaded successfully!")
        return True
    except Exception as e:
        print(f"❌ XTTS loading failed: {e}")
        MODEL_STATUS = f"XTTS Failed: {str(e)}"
        return False


def load_whisper_optimized():
    """Load the Whisper ``base`` model into ``WHISPER_MODEL`` if needed.

    Returns:
        True if the model is available, False if loading raised. Callers may
        ignore a False result; transcription is then skipped.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is not None:
        return True

    try:
        import whisper

        WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
        print("✅ Whisper loaded!")
        return True
    except Exception as e:
        print(f"❌ Whisper failed: {e}")
        return False


def optimize_audio_input(audio_path, max_duration=25):
    """Write a 22.05 kHz, length-capped WAV copy of *audio_path*.

    Args:
        audio_path: Path of the audio file to normalize.
        max_duration: Maximum length, in seconds, of the written copy; longer
            audio is truncated from the end.

    Returns:
        Path of the ``<stem>_opt.wav`` copy on success. If the source file is
        missing or any step fails, the original *audio_path* is returned
        unchanged (best effort).
    """
    try:
        if not os.path.exists(audio_path):
            print(f"⚠️ Audio file not found: {audio_path}")
            return audio_path

        audio, sr = librosa.load(audio_path, sr=22050)

        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"🔄 Audio trimmed to {max_duration}s")

        # Derive the output name from the stem so that *any* extension
        # (.flac, .ogg, .WAV, none, ...) yields a distinct "_opt.wav" file.
        # Chained str.replace on ".wav"/".mp3" left other extensions untouched
        # and therefore overwrote the caller's original file.
        stem, _ = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"
        sf.write(optimized_path, audio, sr)

        print(f"✅ Audio optimized: {optimized_path}")
        return optimized_path
    except Exception as e:
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path


def safe_file_path(file_input, input_name="audio"):
    """Extract an existing file path from the various shapes an upload may take.

    Args:
        file_input: ``None``, a path string, an object with a ``name``
            attribute, or a dict-like object with a ``name`` or ``path`` key.
        input_name: Label used in log messages.

    Returns:
        The path if one could be extracted and exists on disk, else ``None``.
    """
    try:
        if file_input is None:
            return None

        # Already a string path.
        if isinstance(file_input, str):
            if os.path.exists(file_input):
                return file_input
            print(f"⚠️ File path doesn't exist: {file_input}")
            return None

        # File-like object exposing its path as ``name``.
        if hasattr(file_input, "name"):
            file_path = file_input.name
            if file_path and os.path.exists(file_path):
                return file_path

        # Dict-like object.
        if hasattr(file_input, "get"):
            file_path = file_input.get("name") or file_input.get("path")
            if file_path and os.path.exists(file_path):
                return file_path

        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
        return None
    except Exception as e:
        print(f"❌ Error processing {input_name}: {e}")
        return None


def _remove_quietly(path):
    """Delete *path* if present; log (never raise) on other filesystem errors."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"⚠️ Could not remove temporary file {path}: {e}")


def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
    """Re-speak *input_audio* in the voice heard in *reference_audio*.

    Args:
        reference_audio: Upload containing the voice to clone.
        input_audio: Upload whose spoken content is transcribed and re-spoken.
        language: Language code passed to both Whisper and XTTS. ``None`` or
            an empty value falls back to ``"en"``.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the path
        of the generated WAV file on success and ``None`` on any failure;
        ``status_message`` is always a human-readable string. This function
        does not raise: every error is reported through the message.
    """
    scratch_files = []  # intermediate "_opt.wav" copies created by this call
    output_path = None
    succeeded = False

    try:
        # API callers may send null for the dropdown; use the declared default.
        language = language or "en"

        print(f"🎭 Voice cloning request: {language}")
        print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")

        # Extract file paths safely
        reference_path = safe_file_path(reference_audio, "reference")
        input_path = safe_file_path(input_audio, "input")

        if not reference_path:
            return None, "❌ Could not process reference audio file."
        if not input_path:
            return None, "❌ Could not process input audio file."

        print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")

        # Validate files (anything under ~1 KB is treated as not real audio)
        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
            return None, "❌ Reference audio file is invalid."
        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
            return None, "❌ Input audio file is invalid."

        # Load models; Whisper is optional (a fallback sentence is used below).
        if not load_xtts_optimized():
            return None, f"❌ XTTS model failed: {MODEL_STATUS}"
        load_whisper_optimized()

        # Optimize audio files
        print("🔄 Optimizing audio files...")
        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
        input_optimized = optimize_audio_input(input_path, max_duration=25)
        # Only copies we created are scheduled for deletion, never the uploads
        # (optimize_audio_input returns the source path when it fails).
        scratch_files = [
            optimized
            for optimized, source in (
                (ref_optimized, reference_path),
                (input_optimized, input_path),
            )
            if optimized != source
        ]

        # Transcribe input audio
        extracted_text = "This is a voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                print("🎤 Transcribing audio...")
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        language=language if language != "auto" else None,
                    )
                text = result.get("text", "").strip()
                if text and len(text) > 5:
                    extracted_text = text[:400]
                    print(f"✅ Transcribed: '{extracted_text[:50]}...'")
            except Exception as e:
                # Best effort: keep the fallback sentence.
                print(f"⚠️ Transcription warning: {e}")

        # Generate cloned voice
        print("🚀 Generating cloned voice...")
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        try:
            with patch_torch_load(), torch.no_grad():
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=language,
                    file_path=output_path,
                    temperature=0.7,
                    length_penalty=1.0,
                    repetition_penalty=5.0,
                )
        except Exception as tts_error:
            print(f"❌ TTS generation error: {tts_error}")
            return None, f"❌ Voice generation failed: {str(tts_error)}"

        # Validate and return output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            file_size_kb = os.path.getsize(output_path) / 1024
            preview = extracted_text[:100] + ("..." if len(extracted_text) > 100 else "")
            success_message = "\n".join(
                [
                    "✅ VOICE CLONING SUCCESS! 🎉",
                    "",
                    f'📝 Text: "{preview}"',
                    f"🎭 Device: {DEVICE} | Model: {MODEL_STATUS}",
                    f"📊 Output: {file_size_kb:.1f} KB | Language: {language.upper()}",
                    "🔧 Optimizations Applied Successfully",
                ]
            )
            print("✅ Voice cloning completed successfully!")
            succeeded = True
            # Return file path directly for Gradio compatibility
            return output_path, success_message

        return None, "❌ Voice cloning failed - output file is empty."

    except Exception as e:
        error_msg = f"❌ Voice cloning error: {str(e)}"
        print(error_msg)
        print("Full traceback:", traceback.format_exc())
        return None, error_msg

    finally:
        # Release memory on every exit path, not only after a success.
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()

        # Intermediate copies are no longer needed once synthesis has run.
        for scratch_path in scratch_files:
            _remove_quietly(scratch_path)
        # The output file is handed to Gradio only on success; otherwise it
        # would be orphaned in the temp directory.
        if not succeeded:
            _remove_quietly(output_path)


# gr.Interface (not Blocks) is used for API compatibility
interface = gr.Interface(
    fn=voice_to_voice_clone_final,
    inputs=[
        gr.Audio(
            label="🎤 Reference Audio (Voice to Clone)",
            type="filepath",  # the handler expects a path on disk
        ),
        gr.Audio(
            label="🎵 Input Audio (Content to Transform)",
            type="filepath",  # the handler expects a path on disk
        ),
        gr.Dropdown(
            choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
            value="en",
            label="🌍 Language",
        ),
    ],
    outputs=[
        gr.Audio(
            label="🎉 Cloned Voice Result",
            type="filepath",  # the handler returns a path on disk
        ),
        gr.Textbox(
            label="📋 Processing Status",
            lines=8,
        ),
    ],
    title="🎭 AI Voice Cloning Studio - FINAL",
    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    api_name="voice_to_voice_clone",  # API endpoint name
)

if __name__ == "__main__":
    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")

    # One request at a time, with a short queue, for stability.
    interface.queue(
        max_size=2,
        api_open=True,
        default_concurrency_limit=1,
    ).launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False,  # Disable debug for production
    )