File size: 2,165 Bytes
e6a706e
f8749a0
 
 
 
 
 
a279c11
65b0e67
 
 
 
 
 
7a10ff5
f8749a0
 
a279c11
 
 
 
 
 
 
 
e6a706e
f8749a0
a279c11
e6a706e
7a10ff5
e6a706e
 
 
 
 
a279c11
f8749a0
 
e6a706e
f8749a0
e6a706e
a279c11
 
 
 
e6a706e
a279c11
 
 
 
 
 
7a10ff5
 
a279c11
7a10ff5
 
 
a279c11
 
e6a706e
f8749a0
 
 
 
 
 
e6a706e
a279c11
f8749a0
a279c11
 
 
f8749a0
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# app.py
import os
import tempfile
import torch
import gradio as gr
from TTS.api import TTS

# Patch torch.load for compatibility with older Coqui checkpoints.
# The original function is saved first so that the wrapper defined below can
# still delegate to it once torch.load itself has been rebound.
old_torch_load = torch.load
def patched_torch_load(*args, **kwargs):
    """Delegate to the saved ``torch.load`` with ``weights_only`` forced off.

    Positional and keyword arguments are forwarded unchanged, except that
    ``weights_only`` is always sent as ``False`` -- any value supplied by the
    caller is overridden.

    NOTE(review): with ``weights_only=False`` the checkpoint is presumably
    unpickled without restrictions, so only trusted files should be loaded --
    confirm against the torch.load documentation.
    """
    forced_kwargs = {**kwargs, "weights_only": False}
    return old_torch_load(*args, **forced_kwargs)
# Install the wrapper in place of torch.load. This rebinds the attribute on the
# torch module itself, so every later torch.load call in this process goes
# through patched_torch_load -- including the model load further down.
torch.load = patched_torch_load

# Accept Coqui TOS
# NOTE(review): presumably read by the TTS library when it fetches the model,
# which is why it is set before TTS(...) is constructed -- confirm in TTS docs.
os.environ["COQUI_TOS_AGREED"] = "1"

# Ensure speakers folder exists
# The path is relative, so the folder is created in the current working directory.
SPEAKER_DIR = "speakers"
os.makedirs(SPEAKER_DIR, exist_ok=True)

# Get device: CUDA when torch reports it as available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model
# Loaded once at import time and moved to the selected device; generate_audio
# reuses this single instance for every request.
MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
print("Loading model:", MODEL)
tts = TTS(MODEL).to(device)

# Supported languages
# Codes offered in the UI language dropdown and passed through unchanged as the
# `language` argument of tts.tts_to_file.
LANGS = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
    "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"
]

def _default_speaker_path():
    """Return the first ``.wav`` file in ``SPEAKER_DIR`` by name, or ``None``."""
    try:
        # os.listdir order is arbitrary; sorting makes the default voice stable.
        names = sorted(os.listdir(SPEAKER_DIR))
    except FileNotFoundError:
        # The folder is created at startup but may have been removed since.
        return None
    for name in names:
        candidate = os.path.join(SPEAKER_DIR, name)
        # Skip directories that merely happen to be named "*.wav".
        if name.lower().endswith(".wav") and os.path.isfile(candidate):
            return candidate
    return None


def generate_audio(text, language, speaker_file):
    """Synthesize ``text`` in the reference speaker's voice and return a WAV path.

    Args:
        text: Text to speak. ``None``, empty, or fewer than two
            non-whitespace-trimmed characters yields no audio.
        language: Language code forwarded as-is to ``tts.tts_to_file``.
        speaker_file: Path to an uploaded reference recording, or a falsy value
            to fall back to the first ``.wav`` file (sorted by name) found in
            ``SPEAKER_DIR``.

    Returns:
        Path of the generated ``.wav`` file, or ``None`` when the text is too
        short or no reference recording is available.

    Raises:
        Whatever ``tts.tts_to_file`` raises; the reserved output file is
        removed before the exception propagates.
    """
    if not text or len(text.strip()) < 2:
        return None

    # An explicit upload wins; otherwise use the default from the speakers folder.
    speaker_path = speaker_file or _default_speaker_path()
    if not speaker_path:
        return None

    # mkstemp creates the file atomically (mktemp is deprecated and racy). It
    # is reserved only now, so the early returns above never leave a stray
    # file behind.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_path,
            language=language,
            file_path=out_path,
        )
    except Exception:
        # Do not leak the reserved file when synthesis fails.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise

    return out_path

# Components are created in the order the UI lists them: the three inputs
# first, then the single audio output.
_text_box = gr.Textbox(lines=3, label="Text")
_language_picker = gr.Dropdown(LANGS, value="en", label="Language")
_speaker_upload = gr.Audio(label="Upload speaker reference (optional)", type="filepath")
_speech_player = gr.Audio(type="filepath", label="Generated Speech")

# Wire generate_audio(text, language, speaker_file) to the components above;
# the input order matches the function's parameter order.
demo = gr.Interface(
    fn=generate_audio,
    inputs=[_text_box, _language_picker, _speaker_upload],
    outputs=_speech_player,
    title="XTTS-v2 Voice Cloning",
    description=f"Drop WAV files into `{SPEAKER_DIR}` folder for reusable speaker voices.",
    allow_flagging="never",
)

if __name__ == "__main__":
    # Serve on every network interface, port 7860, when run as a script.
    demo.launch(server_port=7860, server_name="0.0.0.0")