File size: 1,558 Bytes
78e0477
b4a0f45
9bfb45c
 
 
efdfe4c
b4a0f45
78e0477
b4a0f45
9bfb45c
 
 
 
 
b63c5c0
9bfb45c
 
 
 
b63c5c0
78e0477
efdfe4c
8e1bb8b
b63c5c0
 
 
9bfb45c
78e0477
086899b
 
8e1bb8b
 
 
b63c5c0
8e1bb8b
b63c5c0
8e1bb8b
b63c5c0
efdfe4c
b63c5c0
8e1bb8b
086899b
5942862
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
import gradio as gr
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download
import spaces

# Set before any model code runs. NOTE(review): presumably pre-accepts Coqui's
# terms-of-service prompt so nothing blocks on interactive input — confirm.
os.environ["COQUI_TOS_AGREED"] = "1"

print("Downloading XTTS-v2 model...")
# Despite its name, ``repo_id`` is the LOCAL directory the snapshot is stored
# in; the Hub repository that gets downloaded is "coqui/XTTS-v2".
repo_id = "XTTS-v2"
# Files the loading code below cannot work without. Testing for these (rather
# than only for the directory) lets an interrupted download be repaired on the
# next start instead of crashing while loading.
# NOTE(review): "model.pth" is assumed to be the checkpoint file name that
# load_checkpoint() looks for inside checkpoint_dir — confirm against the
# installed TTS version.
_required_files = ("config.json", "model.pth")
if not all(os.path.isfile(os.path.join(repo_id, name)) for name in _required_files):
    snapshot_download(repo_id="coqui/XTTS-v2", local_dir=repo_id, allow_patterns=["*.json", "*.pth", "*.wav"])

print("Loading model on CPU...")
config = XttsConfig()
config.load_json(os.path.join(repo_id, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=repo_id, eval=True)
print("Model loaded successfully on CPU!")

@spaces.GPU
def clone_voice(text, language, reference_audio):
    """Synthesize ``text`` in the voice heard in ``reference_audio``.

    Args:
        text: Text to speak, as typed into the UI textbox.
        language: Language code selected in the dropdown (e.g. ``"fr"``).
        reference_audio: File path of the reference recording (the audio
            input is configured with ``type="filepath"``), or a falsy value
            when nothing was provided.

    Returns:
        A ``(24000, waveform)`` tuple for the audio output component, or
        ``None`` when the text is empty or whitespace-only, or when no
        reference audio was supplied.
    """
    # Whitespace-only text is truthy but contains nothing to speak; treat it
    # exactly like an empty textbox instead of handing it to the synthesizer.
    if not text or not text.strip() or not reference_audio:
        return None
    outputs = model.synthesize(
        text,
        config,
        speaker_wav=reference_audio,
        language=language,
        enable_text_splitting=True,
    )
    # NOTE(review): 24000 is passed as the sample rate of the returned
    # waveform; assumed to match the model's output rate — confirm.
    return (24000, outputs["wav"])

# Language codes offered in the "Language" dropdown.
LANGS = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu",
]

# Widgets are built up front and listed below in the positional order of
# clone_voice's parameters: text, language, reference_audio.
_text_box = gr.Textbox(label="Text", placeholder="Enter text to synthesize...")
_language_picker = gr.Dropdown(choices=LANGS, value="fr", label="Language")
_reference_clip = gr.Audio(label="Reference Audio (3-15 seconds)", type="filepath")
_speech_player = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=clone_voice,
    inputs=[_text_box, _language_picker, _reference_clip],
    outputs=_speech_player,
    title="XTTS v2 Voice Cloning",
    description="Clone any voice from a 3-15 second audio sample. Supports 16 languages.",
)

# Serve on all network interfaces at port 7860, with SSR mode switched off.
_launch_options = {
    "server_name": "0.0.0.0",
    "server_port": 7860,
    "ssr_mode": False,
}
demo.launch(**_launch_options)