File size: 3,765 Bytes
5fd8115
6882130
5fd8115
f1134ba
8750e5c
f1134ba
8538350
 
8750e5c
6882130
5fd8115
f1134ba
5fd8115
f1134ba
5fd8115
 
 
01ae4f9
40ff39d
8538350
 
8750e5c
40ff39d
5fd8115
 
8538350
5fd8115
40ff39d
5fd8115
f1134ba
40ff39d
f9affcb
40ff39d
 
5fd8115
8538350
5fd8115
f1134ba
4e5e923
 
 
 
 
 
 
 
40ff39d
b9aa16e
 
 
 
5fd8115
40ff39d
8538350
 
b9aa16e
8538350
 
 
b9aa16e
40ff39d
 
4e5e923
 
 
 
40ff39d
f1134ba
8538350
4e5e923
f1134ba
4e5e923
f1134ba
 
 
 
4e5e923
 
f1134ba
 
 
4e5e923
f1134ba
 
 
 
 
 
 
 
 
 
 
 
 
8538350
 
 
f1134ba
 
 
8538350
 
f1134ba
 
8538350
4e5e923
b166fd7
f1134ba
8538350
f1134ba
8538350
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import threading

import gradio as gr
import requests
import torch
import torchaudio
import uvicorn
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
from huggingface_hub import snapshot_download, hf_hub_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# ===== MODEL SETUP =====
# Download (if needed) and load the viXTTS checkpoint for CPU-only inference.
checkpoint_dir = "model/"
repo_id = "capleaf/viXTTS"
os.makedirs(checkpoint_dir, exist_ok=True)
api_app = FastAPI()

# Files the XTTS checkpoint needs locally; fetch the snapshot only if any is missing.
required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
if not all(f in os.listdir(checkpoint_dir) for f in required_files):
    snapshot_download(repo_id=repo_id, local_dir=checkpoint_dir)
    # speakers_xtts.pth is pulled separately from the base XTTS-v2 repo
    # (presumably absent from the viXTTS snapshot — TODO confirm).
    hf_hub_download("coqui/XTTS-v2", "speakers_xtts.pth", local_dir=checkpoint_dir)

config = XttsConfig()
config.load_json(os.path.join(checkpoint_dir, "config.json"))
MODEL = Xtts.init_from_config(config)
MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=False)

# CPU-only: move the model off GPU, force the GPT sub-module to float32,
# and cap intra-op threads so a shared-CPU host is not oversubscribed.
MODEL.cpu()
MODEL.gpt.float()
torch.set_num_threads(4)
torch.backends.mkldnn.enabled = True

# Languages offered in the UI dropdown and accepted by the API.
LANGS = ["vi", "en", "zh-cn", "ja", "ko"]

# ===== TTS FUNCTION =====
DEFAULT_REF = "model/samples/nu-luu-loat.wav"  # default reference voice sample

def tts_fn(text, language):
    """Synthesize speech for *text* in *language* with the default reference voice.

    Args:
        text: Input text to synthesize.
        language: Language code (one of LANGS, e.g. "vi").

    Returns:
        Path to the generated WAV file ("output.wav", overwritten per call).
    """
    # NOTE: the module-level `import os` is already in scope; the original
    # re-imported os locally, which was redundant.
    ref_audio = DEFAULT_REF
    print(">>> Server-side ref_audio path:", os.path.abspath(ref_audio))
    print(">>> Exists:", os.path.exists(ref_audio))

    # Extract speaker-conditioning latents from the reference clip.
    gpt_latent, spk_embed = MODEL.get_conditioning_latents(
        audio_path=ref_audio,
        gpt_cond_len=18,
        gpt_cond_chunk_len=4,
        max_ref_length=50,
    )
    out = MODEL.inference(
        text=text,
        language=language,
        gpt_cond_latent=gpt_latent,
        speaker_embedding=spk_embed,
        temperature=0.65,
        repetition_penalty=2.5,
        enable_text_splitting=True,
    )
    # Model returns a 1-D waveform; torchaudio.save expects (channels, samples).
    wav = torch.tensor(out["wav"]).unsqueeze(0)
    out_path = "output.wav"
    torchaudio.save(out_path, wav, 24000)  # XTTS outputs 24 kHz audio
    print(">>> Generated wav path:", os.path.abspath(out_path))
    return out_path

# ===== FASTAPI SERVER =====
@api_app.post("/api/speak")
def speak_api(text: str, language: str = "vi"):
    """Synthesize *text* and stream the resulting WAV back to the caller.

    Query params:
        text: Text to synthesize (required).
        language: Language code, defaults to "vi".

    Returns:
        200 with audio/wav on success; 500 with a JSON {"error": ...} body
        on failure.
    """
    try:
        path = tts_fn(text, language)
        return FileResponse(path, media_type="audio/wav")
    except Exception as e:
        # BUG FIX: the original returned a plain dict, which FastAPI serves
        # as HTTP 200 — clients checking the status code then saved the JSON
        # error body as a .wav file. Signal failure with a real 500.
        return JSONResponse(status_code=500, content={"error": str(e)})

# ===== GRADIO CLIENT =====
def gradio_client(text, language):
    """Call the local /api/speak endpoint and save the audio to voice.wav.

    Args:
        text: Text to synthesize.
        language: Language code passed through to the API.

    Returns:
        Tuple of (audio_path_or_None, status_message) for the Gradio outputs.
    """
    try:
        r = requests.post(
            "http://127.0.0.1:8000/api/speak",
            params={"text": text, "language": language},
            # BUG FIX: no timeout meant the UI could hang forever if the
            # API stalled. CPU TTS is slow, so allow a generous 5 minutes.
            timeout=300,
        )
        if r.status_code == 200:
            with open("voice.wav", "wb") as f:
                f.write(r.content)
            return "voice.wav", "✅ Hoàn tất!"
        else:
            return None, f"❌ Lỗi API: {r.status_code}"
    except Exception as e:
        return None, f"❌ Lỗi: {str(e)}"

# ===== GRADIO UI =====
# Two-column layout: inputs (text, language, button) on the left,
# outputs (audio player, status message) on the right.
with gr.Blocks(title="ViXTTS - Gradio + API") as demo:
    gr.Markdown("## 🎙️ Vietnamese TTS - CPU (Spaces HuggingFace)")

    with gr.Row():
        with gr.Column(scale=1):
            text_in = gr.Textbox(label="Văn bản", value="Xin chào!", lines=4)
            lang_dd = gr.Dropdown(label="Ngôn ngữ", choices=LANGS, value="vi")
            btn = gr.Button("🎧 Tạo giọng")

        with gr.Column(scale=1):
            audio_out = gr.Audio(label="Kết quả", autoplay=True)
            info_out = gr.Textbox(label="Trạng thái", interactive=False)

    # The button goes through the local HTTP API (gradio_client) rather than
    # calling tts_fn directly, so UI and API exercise the same code path.
    btn.click(gradio_client, inputs=[text_in, lang_dd], outputs=[audio_out, info_out])

# ===== RUN API + GRADIO IN PARALLEL =====
if __name__ == "__main__":
    # Serve the FastAPI app on port 8000 in a daemon thread so the Gradio
    # launch below (blocking, port 7860) can run in the main thread.
    threading.Thread(target=lambda: uvicorn.run(api_app, host="0.0.0.0", port=8000), daemon=True).start()
    demo.launch(server_name="0.0.0.0", server_port=7860)