Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import warnings | |
| from contextlib import contextmanager | |
| warnings.filterwarnings("ignore") | |
| os.environ["COQUI_TOS_AGREED"] = "1" | |
| print("π Starting Voice Cloning Studio...") | |
@contextmanager
def patch_torch_load():
    """Temporarily force ``torch.load(..., weights_only=False)``.

    Newer PyTorch releases default ``torch.load`` to ``weights_only=True``,
    which refuses to unpickle the custom config classes inside XTTS-v2
    checkpoints. While this context manager is active, every ``torch.load``
    call has ``weights_only`` overridden to ``False``; the original function
    is always restored on exit, even if the body raises.

    BUG FIX: this function yields and is used as ``with patch_torch_load():``
    (see the XTTS loading and synthesis paths), but the ``@contextmanager``
    decorator — imported at the top of the file — was never applied, so every
    ``with`` statement would raise ``AttributeError: __enter__``.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Unconditionally override, regardless of what the caller passed.
        kwargs['weights_only'] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Restore the pristine torch.load even on error.
        torch.load = original_load
# Prefer GPU when available; XTTS inference on CPU works but is slow.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Lazily-populated model singletons (set by load_xtts_manual / load_whisper).
TTS_MODEL = None
WHISPER_MODEL = None
# Human-readable load state, surfaced in UI/API status messages.
MODEL_STATUS = "Not Loaded"
def load_xtts_manual():
    """Idempotently load XTTS-v2 into the TTS_MODEL module global.

    Returns True when the model is (already) available, False on failure.
    MODEL_STATUS is updated to describe the outcome either way.
    """
    global TTS_MODEL, MODEL_STATUS

    # Singleton guard: a previous successful load means nothing to do.
    if TTS_MODEL is not None:
        return True

    try:
        # torch.load must accept pickled config classes during checkpoint load.
        with patch_torch_load():
            from TTS.api import TTS
            print("π¦ Loading XTTS...")
            use_gpu = DEVICE == "cuda"
            TTS_MODEL = TTS(
                model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                progress_bar=True,
                gpu=use_gpu,
            )
        MODEL_STATUS = "XTTS-v2 Ready"
        print("β XTTS loaded!")
        return True
    except Exception as err:
        print(f"β XTTS loading failed: {err}")
        MODEL_STATUS = f"Manual Failed: {str(err)}"
        return False
def load_whisper():
    """Best-effort, one-time load of Whisper ("base") into WHISPER_MODEL.

    Returns True when the model is (already) loaded, False when loading fails.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            import whisper
            WHISPER_MODEL = whisper.load_model("base")
            print("β Whisper loaded!")
        except Exception as err:
            # Transcription is optional downstream, so failure is non-fatal.
            print(f"β Whisper failed: {err}")
            return False
    return True
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """
    Main voice cloning function - this will be called by both UI and API

    Pipeline: transcribe *input_audio* with Whisper (best-effort), then
    synthesize that text in the voice of *reference_audio* using XTTS-v2.

    Args:
        reference_audio: filepath to the voice sample to clone (gr.Audio
            with type="filepath"; None when nothing was uploaded).
        input_audio: filepath to the audio whose spoken content is reused.
        language: XTTS language code (default "en").

    Returns:
        (output_wav_path, status_message) on success,
        (None, error_message) on any failure — errors are reported as the
        second return value rather than raised, so the UI/API stays up.
    """
    try:
        print(f"π Voice cloning request: {language}")
        print(f"π Reference: {reference_audio}")
        print(f"π Input: {input_audio}")
        # Both file paths are required; gr.Audio passes None when empty.
        if not reference_audio or not input_audio:
            return None, "β Please upload both reference and input audio files!"
        # Load XTTS model (lazy singleton; heavy download on first call).
        if not load_xtts_manual():
            return None, f"β XTTS loading failed!\nStatus: {MODEL_STATUS}"
        # Load Whisper for transcription (best-effort; failure is tolerated).
        load_whisper()
        # Extract text from input audio, falling back to a fixed phrase when
        # Whisper is unavailable or the transcript is too short to be useful.
        extracted_text = "Voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                result = WHISPER_MODEL.transcribe(input_audio)
                text = result.get("text", "").strip()
                if text and len(text) > 3:
                    extracted_text = text
                print(f"β Extracted: '{extracted_text[:100]}...'")
            except Exception as e:
                # Non-fatal: keep the fallback text and continue.
                print(f"β οΈ Whisper error: {e}")
        # Generate cloned voice into a temp .wav. delete=False is deliberate:
        # Gradio must still be able to read/serve the file after we return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        print(f"π Generating voice clone...")
        # XTTS checkpoint loading needs torch.load(weights_only=False).
        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path
            )
        # Verify output exists and is non-empty before handing it to the UI.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            success_message = f"""β VOICE-TO-VOICE CLONING SUCCESS!
π Content: '{extracted_text[:150]}...'
π Device: {DEVICE}
π§ Status: {MODEL_STATUS}
π Output size: {os.path.getsize(output_path)} bytes
"""
            print("β Voice cloning completed successfully!")
            return output_path, success_message
        else:
            return None, "β Generated audio file is empty!"
    except Exception as e:
        # Top-level guard: surface the error as a status string so the
        # request fails gracefully instead of crashing the server.
        error_msg = f"β Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
        print(error_msg)
        return None, error_msg
# FIXED: Use gr.Interface instead of gr.Blocks for proper API exposure
# Declarative UI/API definition: three inputs (reference voice, content
# audio, language code) mapped straight onto voice_to_voice_clone's
# positional parameters; outputs mirror its (audio_path, status) return.
interface = gr.Interface(
    fn=voice_to_voice_clone,
    inputs=[
        gr.Audio(
            label="π€ Reference Audio (Voice to Clone)",
            type="filepath",  # hand the function a path, not raw samples
            sources=["upload"]
        ),
        gr.Audio(
            label="π΅ Input Audio (Content to Transform)",
            type="filepath",
            sources=["upload"]
        ),
        gr.Dropdown(
            # Language codes accepted by XTTS-v2 for synthesis.
            choices=[
                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
                "cs", "ar", "zh", "ja", "ko", "hi", "uk", "vi", "ro", "el",
                "he", "fi", "hu", "sv", "ca", "id", "ms", "bg", "sk", "da",
                "no", "lt", "hr", "sr", "sl", "et", "lv", "fil", "bn", "ta",
                "te", "ur", "fa", "th"
            ],
            value="en",
            label="π Language"
        )
    ],
    outputs=[
        gr.Audio(label="π Cloned Voice Result"),
        gr.Textbox(label="π Status", lines=8)
    ],
    title="π REAL Voice Cloning Studio",
    description="Transform any voice into any other voice using XTTS-v2 and Whisper AI models. Upload reference audio and input audio to get started.",
    theme=gr.themes.Soft(),
    allow_flagging="never",  # NOTE(review): deprecated in Gradio 4+ in favor of flagging_mode — confirm installed version
    api_name="voice_to_voice_clone"  # CRITICAL: This creates the API endpoint
)
if __name__ == "__main__":
    print("π Launching Voice Cloning Studio...")
    # 0.0.0.0:7860 is the standard bind for Hugging Face Spaces containers.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,  # Shows API documentation
        debug=True
    )