# Hugging Face Space: xtts-space / app.py
# Author: immad84 — "update app.py"
# Commit: a279c11 (verified)
# app.py
import os
import tempfile
import torch
import gradio as gr
from TTS.api import TTS
# Patch torch.load for compatibility with older Coqui checkpoints
old_torch_load = torch.load
def patched_torch_load(*args, **kwargs):
kwargs["weights_only"] = False
return old_torch_load(*args, **kwargs)
torch.load = patched_torch_load
# Accept Coqui TOS
os.environ["COQUI_TOS_AGREED"] = "1"
# Ensure speakers folder exists
SPEAKER_DIR = "speakers"
os.makedirs(SPEAKER_DIR, exist_ok=True)
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model
MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
print("Loading model:", MODEL)
tts = TTS(MODEL).to(device)
# Supported languages
LANGS = [
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
"cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"
]
def generate_audio(text, language, speaker_file):
if not text or len(text.strip()) < 2:
return None
out_path = tempfile.mktemp(suffix=".wav")
# Resolve speaker path (either from upload or from speakers folder)
speaker_path = None
if speaker_file:
speaker_path = speaker_file
else:
# Default to first speaker file in folder if exists
files = [f for f in os.listdir(SPEAKER_DIR) if f.lower().endswith(".wav")]
if files:
speaker_path = os.path.join(SPEAKER_DIR, files[0])
if speaker_path:
tts.tts_to_file(
text=text,
speaker_wav=speaker_path,
language=language,
file_path=out_path
)
else:
return None
return out_path
demo = gr.Interface(
fn=generate_audio,
inputs=[
gr.Textbox(lines=3, label="Text"),
gr.Dropdown(LANGS, value="en", label="Language"),
gr.Audio(label="Upload speaker reference (optional)", type="filepath")
],
outputs=gr.Audio(type="filepath", label="Generated Speech"),
title="XTTS-v2 Voice Cloning",
description=f"Drop WAV files into `{SPEAKER_DIR}` folder for reusable speaker voices.",
allow_flagging="never",
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860)