# app.py
"""Gradio front-end for Coqui XTTS-v2 voice cloning.

Type text, pick a language, and optionally upload a reference WAV — or
drop reusable reference WAVs into the `speakers/` folder.
"""
import os
import tempfile

import gradio as gr
import torch
from TTS.api import TTS

# ---------------------------------------------------------------------------
# torch.load compatibility patch
# ---------------------------------------------------------------------------
# Newer PyTorch defaults to weights_only=True, which breaks older Coqui
# checkpoints that pickle full Python objects.  Default it to False, but —
# unlike a blind overwrite — honour an explicit weights_only passed by the
# caller via setdefault.
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    """torch.load wrapper that defaults ``weights_only`` to False."""
    kwargs.setdefault("weights_only", False)
    return _original_torch_load(*args, **kwargs)


torch.load = _patched_torch_load

# Accept the Coqui model license non-interactively.
os.environ["COQUI_TOS_AGREED"] = "1"

# Folder holding reusable reference-speaker WAV files.
SPEAKER_DIR = "speakers"
os.makedirs(SPEAKER_DIR, exist_ok=True)

# Prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
print("Loading model:", MODEL)
tts = TTS(MODEL).to(device)

# Languages supported by XTTS-v2.
LANGS = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
    "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi",
]


def _default_speaker_wav():
    """Return a deterministic speaker WAV path from SPEAKER_DIR, or None.

    Sorted so the same file is picked on every run (os.listdir order is
    filesystem-arbitrary).
    """
    wavs = sorted(
        f for f in os.listdir(SPEAKER_DIR) if f.lower().endswith(".wav")
    )
    return os.path.join(SPEAKER_DIR, wavs[0]) if wavs else None


def generate_audio(text, language, speaker_file):
    """Synthesize ``text`` in ``language``, cloning the reference voice.

    Args:
        text: Text to speak; ignored when shorter than 2 stripped chars.
        language: Language code, one of LANGS.
        speaker_file: Optional filepath of an uploaded reference WAV.

    Returns:
        Path to the generated WAV, or None when text or speaker is missing.
    """
    if not text or len(text.strip()) < 2:
        return None

    # Uploaded reference wins; otherwise fall back to the speakers folder.
    speaker_path = speaker_file or _default_speaker_wav()
    if not speaker_path:
        return None

    # mkstemp instead of the deprecated, race-prone tempfile.mktemp; close
    # the fd immediately since tts_to_file reopens the path itself.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_path,
        language=language,
        file_path=out_path,
    )
    return out_path


demo = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(lines=3, label="Text"),
        gr.Dropdown(LANGS, value="en", label="Language"),
        gr.Audio(label="Upload speaker reference (optional)", type="filepath"),
    ],
    outputs=gr.Audio(type="filepath", label="Generated Speech"),
    title="XTTS-v2 Voice Cloning",
    description=f"Drop WAV files into `{SPEAKER_DIR}` folder for reusable speaker voices.",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)