Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import warnings | |
| from contextlib import contextmanager | |
| warnings.filterwarnings("ignore") | |
| os.environ["COQUI_TOS_AGREED"] = "1" | |
| print("π Starting Voice Cloning Studio...") | |
@contextmanager
def patch_torch_load():
    """Temporarily force ``torch.load`` to run with ``weights_only=False``.

    While the ``with`` body runs, ``torch.load`` is replaced by a wrapper that
    overwrites the ``weights_only`` keyword with ``False`` on every call and
    otherwise delegates to the original function. The original ``torch.load``
    is restored on exit, including when the body raises.

    The ``@contextmanager`` decorator is required: callers use this function
    as ``with patch_torch_load():``, which a bare generator does not support.

    SECURITY: ``weights_only=False`` lets ``torch.load`` unpickle arbitrary
    objects, so only checkpoints from a trusted source should be loaded while
    this patch is active.

    NOTE: the patch rebinds a module attribute, so it is visible to the whole
    process for as long as the ``with`` block is open.
    """
    original_load = torch.load

    def patched_load(f, *args, **kwargs):
        # Deliberately overrides any caller-supplied value, not just the default.
        kwargs["weights_only"] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Always undo the monkeypatch, even if the body raised.
        torch.load = original_load
# Lazily populated model handles; both stay None until the first request
# triggers the corresponding loader.
TTS_MODEL = None
WHISPER_MODEL = None

# Human-readable loader status, echoed back to the user in status messages.
MODEL_STATUS = "Not Loaded"

# Compute device: CUDA when PyTorch reports it as available, otherwise CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
def load_xtts_manual():
    """Ensure the XTTS-v2 model is loaded into the module-level ``TTS_MODEL``.

    Returns:
        True if a model was already loaded or was loaded successfully by this
        call; False if loading raised. On a load attempt ``MODEL_STATUS`` is
        set to a ready message or to the failure reason.
    """
    global TTS_MODEL, MODEL_STATUS

    if TTS_MODEL is None:
        try:
            # The import and construction both run under the torch.load patch.
            with patch_torch_load():
                from TTS.api import TTS

                print("π¦ Loading XTTS...")
                use_gpu = DEVICE == "cuda"
                TTS_MODEL = TTS(
                    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                    progress_bar=True,
                    gpu=use_gpu,
                )
                MODEL_STATUS = "XTTS-v2 Ready"
                print("β XTTS loaded!")
        except Exception as err:
            # Best-effort boundary: report the failure instead of crashing the UI.
            print(f"β XTTS loading failed: {err}")
            MODEL_STATUS = f"Manual Failed: {err!s}"
            return False

    return True
def load_whisper():
    """Ensure the Whisper "base" model is loaded into ``WHISPER_MODEL``.

    Returns:
        True if a model was already loaded or was loaded by this call;
        False if importing or loading Whisper raised.
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            # Imported lazily so the app can start without Whisper installed.
            import whisper

            WHISPER_MODEL = whisper.load_model("base")
            print("β Whisper loaded!")
        except Exception as err:
            # Transcription is optional; callers fall back when this fails.
            print(f"β Whisper failed: {err}")
            return False

    return True
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """Re-speak the content of ``input_audio`` in the voice of ``reference_audio``.

    The input audio is transcribed with Whisper when that model is available;
    if it is not, transcription fails, or the transcript is 3 characters or
    shorter, a fixed demonstration sentence is used instead. The text is then
    synthesized with the XTTS model using ``reference_audio`` as the speaker
    sample, and written to a temporary ``.wav`` file.

    Args:
        reference_audio: File path of the voice sample to clone.
        input_audio: File path of the audio whose spoken content is reused.
        language: Language code passed to the synthesizer (default ``"en"``).

    Returns:
        A ``(path, message)`` tuple. ``path`` is the generated ``.wav`` file on
        success and ``None`` on any failure; ``message`` is a status string
        for display. Exceptions are reported in ``message``, never raised.
    """
    output_path = None
    succeeded = False
    try:
        if not reference_audio or not input_audio:
            return None, "β Please upload both reference and input audio files!"

        if not load_xtts_manual():
            return None, f"β XTTS loading failed!\nStatus: {MODEL_STATUS}"

        # Whisper is optional: a failed load just leaves WHISPER_MODEL unset.
        load_whisper()

        extracted_text = "Voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                result = WHISPER_MODEL.transcribe(input_audio)
                text = result.get("text", "").strip()
                if len(text) > 3:
                    extracted_text = text
                    print(f"β Extracted: '{extracted_text[:100]}...'")
            except Exception as e:
                # Best effort: keep the fallback sentence if transcription fails.
                print(f"β οΈ Whisper error: {e}")

        # Reserve a path only; delete=False keeps the file after the handle
        # closes so the synthesizer can write to it and the caller can read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path,
            )

        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            succeeded = True
            return output_path, f"""β VOICE-TO-VOICE CLONING SUCCESS!
π Content: '{extracted_text[:150]}...'
π Device: {DEVICE}
π§ Status: {MODEL_STATUS}
"""
        return None, "β Generated audio file is empty!"
    except Exception as e:
        return None, f"β Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
    finally:
        # The temp file is only useful when handed back to the caller; on any
        # failure path remove it so failed requests do not accumulate files.
        if output_path is not None and not succeeded:
            try:
                os.remove(output_path)
            except OSError:
                # Already gone or not removable; nothing more to do here.
                pass
# Gradio Interface
#
# Layout: a header, then one row with two columns -- inputs and the trigger
# button on the left, generated audio and status text on the right.
# NOTE(review): the nesting below is inferred from Gradio conventions because
# the original indentation is not recoverable -- confirm against the live app.
with gr.Blocks(title="Voice Cloning Studio") as demo:
    gr.HTML("""
<div style="text-align: center; padding: 25px;">
<h1>π REAL Voice Cloning Studio</h1>
<p>Status: Models load on first use</p>
</div>
""")

    with gr.Row():
        # Left column: the two audio inputs, language choice and the button.
        with gr.Column():
            reference_audio = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="π€ Reference Audio (Voice to Clone)",
            )
            input_audio = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="π΅ Input Audio (Content to Transform)",
            )
            # Each choice is a (display name, value) pair; the value is what
            # the click handler receives as its language argument.
            language = gr.Dropdown(
                label="Language",
                value="en",
                choices=[
                    ("English", "en"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                ],
            )
            clone_btn = gr.Button("Clone Voice", variant="primary", size="lg")

        # Right column: synthesized audio and the status report.
        with gr.Column():
            output_audio = gr.Audio(label="Cloned Voice Result")
            status_output = gr.Textbox(
                interactive=False,
                lines=12,
                label="Status",
            )

    # The handler returns (audio path, status message), matching `outputs`.
    clone_btn.click(
        fn=voice_to_voice_clone,
        inputs=[reference_audio, input_audio, language],
        outputs=[output_audio, status_output],
        show_progress=True,
    )

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()