File size: 1,558 Bytes
78e0477 b4a0f45 9bfb45c efdfe4c b4a0f45 78e0477 b4a0f45 9bfb45c b63c5c0 9bfb45c b63c5c0 78e0477 efdfe4c 8e1bb8b b63c5c0 9bfb45c 78e0477 086899b 8e1bb8b b63c5c0 8e1bb8b b63c5c0 8e1bb8b b63c5c0 efdfe4c b63c5c0 8e1bb8b 086899b 5942862 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import os
import gradio as gr
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download
import spaces
# Set before any model download/load happens; presumably read by the TTS
# library to skip its interactive terms-of-service prompt -- confirm.
os.environ["COQUI_TOS_AGREED"] = "1"

print("Downloading XTTS-v2 model...")
# NOTE: despite its name, `repo_id` is the *local* directory the snapshot is
# stored in; the Hub repository itself is "coqui/XTTS-v2" (see below).
repo_id = "XTTS-v2"
config_path = os.path.join(repo_id, "config.json")

# Gate the download on config.json rather than on the bare directory: a
# directory that exists but lacks its config (e.g. an interrupted download)
# would otherwise skip the download and then fail in load_json() below.
if not os.path.exists(config_path):
    snapshot_download(
        repo_id="coqui/XTTS-v2",
        local_dir=repo_id,
        # Only fetch config/vocab JSON, checkpoints and sample audio.
        allow_patterns=["*.json", "*.pth", "*.wav"],
    )

print("Loading model on CPU...")
# `config` and `model` are module-level on purpose: clone_voice() reads both.
config = XttsConfig()
config.load_json(config_path)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=repo_id, eval=True)
print("Model loaded successfully on CPU!")
@spaces.GPU
def clone_voice(text, language, reference_audio):
    """Synthesize ``text`` using the voice found in ``reference_audio``.

    Args:
        text: Text to speak. Empty or whitespace-only text produces no audio.
        language: Language code, forwarded to the model as ``language``.
        reference_audio: Speaker sample, forwarded to the model as
            ``speaker_wav``. The UI supplies it as a file path; a falsy
            value means no sample was provided.

    Returns:
        ``None`` when there is nothing to synthesize, otherwise a
        ``(24000, wav)`` tuple where ``wav`` is the ``"wav"`` entry of the
        model's synthesis result.
    """
    # Guard clause: whitespace-only text is as unusable as empty text, so
    # reject it here instead of handing it to the model.
    if not text or not text.strip() or not reference_audio:
        return None

    outputs = model.synthesize(
        text,
        config,
        speaker_wav=reference_audio,
        language=language,
        enable_text_splitting=True,
    )
    # 24000 is the sample rate reported alongside the waveform; presumably
    # XTTS-v2's output rate -- confirm against the loaded config.
    return (24000, outputs["wav"])
# Language codes offered in the dropdown; the selected value is passed
# unchanged to clone_voice() as `language`.
LANGS = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu",
]

# Single-function UI: (text, language, reference audio) -> generated speech.
demo = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Textbox(label="Text", placeholder="Enter text to synthesize..."),
        gr.Dropdown(choices=LANGS, value="fr", label="Language"),
        # type="filepath" hands clone_voice() a path rather than raw samples.
        gr.Audio(label="Reference Audio (3-15 seconds)", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="XTTS v2 Voice Cloning",
    # Derive the count from LANGS so the blurb cannot drift from the list.
    description=(
        "Clone any voice from a 3-15 second audio sample. "
        f"Supports {len(LANGS)} languages."
    ),
)

# Listen on all interfaces at port 7860 with server-side rendering disabled.
demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)