Spaces:

nxhong
/

vixtts-api

Sleeping

App Files Files Community

nxhong committed on Oct 29, 2025

Commit

40ff39d

verified ·

1 Parent(s): 38d1a97

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -157

app.py CHANGED Viewed

@@ -1,181 +1,75 @@
 import os
-import time
-import threading
 import torch
 import torchaudio
 import gradio as gr
-import spaces
-from fastapi import FastAPI
-from fastapi.responses import FileResponse
-import uvicorn
 from huggingface_hub import snapshot_download, hf_hub_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
-from vinorm import TTSnorm
-# ========== SETUP MODEL ==========
-print("🔽 Đang tải mô hình capleaf/viXTTS...")  # startup log (Vietnamese: "Loading model capleaf/viXTTS..."); printed even when nothing is downloaded
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
-use_deepspeed = False  # forwarded to MODEL.load_checkpoint below
 os.makedirs(checkpoint_dir, exist_ok=True)
-required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]  # files that must already exist to skip the download
-files_in_dir = os.listdir(checkpoint_dir)
-if not all(file in files_in_dir for file in required_files):  # download when at least one required file is missing
     snapshot_download(repo_id=repo_id, local_dir=checkpoint_dir)
-    hf_hub_download(  # speakers_xtts.pth is fetched from coqui/XTTS-v2, a different repo than repo_id
-        repo_id="coqui/XTTS-v2",
-        filename="speakers_xtts.pth",
-        local_dir=checkpoint_dir,
-    )
-xtts_config = os.path.join(checkpoint_dir, "config.json")
 config = XttsConfig()
-config.load_json(xtts_config)
 MODEL = Xtts.init_from_config(config)
-MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)
-if torch.cuda.is_available():  # move the model to the GPU only when CUDA is present
-    MODEL.cuda()
-supported_languages = config.languages  # same object as config.languages (no copy is made)
-if "vi" not in supported_languages:
-    supported_languages.append("vi")  # because of the aliasing above, this also changes config.languages
-# ========== UTILITIES ==========
-def normalize_vietnamese_text(text):  # runs TTSnorm on text, then applies the literal replacements chained below, in order
-    text = (
-        TTSnorm(text, unknown=False, lower=False, rule=True)
-        .replace("..", ".")  # collapse a doubled period
-        .replace("!.", "!")  # drop a period that directly follows "!"
-        .replace("?.", "?")  # drop a period that directly follows "?"
-        .replace(" .", ".")  # remove the space before a period
-        .replace(" ,", ",")  # remove the space before a comma
-        .replace('"', "")  # strip double quotes
-        .replace("'", "")  # strip single quotes
-        .replace("AI", "Ây Ai")  # NOTE(review): plain substring replace, so "AI" inside a longer uppercase word is rewritten too; presumably a phonetic spelling for the TTS — confirm
-        .replace("A.I", "Ây Ai")
-    )
-    return text
-def calculate_keep_len(text, lang):  # returns the slice end index that predict applies to the generated waveform, or -1
-    if lang in ["ja", "zh-cn"]:
-        return -1  # no word-count-based length for these two languages
-    word_count = len(text.split())  # number of whitespace-separated tokens
-    num_punct = text.count(".") + text.count("!") + text.count("?") + text.count(",")
-    if word_count < 5:
-        return 15000 * word_count + 2000 * num_punct  # fewer than 5 words: 15000 per word plus 2000 per punctuation mark
-    elif word_count < 10:
-        return 13000 * word_count + 2000 * num_punct  # 5 to 9 words: 13000 per word plus 2000 per punctuation mark
-    return -1  # NOTE(review): predict slices with [:keep_len], so -1 drops the last element instead of keeping everything
 # ========== TTS FUNCTION ==========
-@spaces.GPU
-def predict(text, language, ref_audio, normalize_text=True):  # returns (path of the generated WAV or None, info/error message)
-    if not text or len(text.strip()) == 0:  # reject missing, empty, or whitespace-only text
-        return None, "⚠️ Vui lòng nhập nội dung văn bản."
-    if language not in supported_languages:
-        return None, f"❌ Ngôn ngữ '{language}' không được hỗ trợ."
-    try:
-        print(f"🎧 Đang sinh giọng nói [{language}] cho văn bản: {text[:50]}...")  # logs only the first 50 characters of the text
-        (gpt_cond_latent, speaker_embedding) = MODEL.get_conditioning_latents(
-            audio_path=ref_audio,
-            gpt_cond_len=30,
-            gpt_cond_chunk_len=4,
-            max_ref_length=60,
-        )
-        if normalize_text and language == "vi":  # text normalization is applied to Vietnamese input only
-            text = normalize_vietnamese_text(text)
-        t0 = time.time()
-        out = MODEL.inference(
-            text,
-            language,
-            gpt_cond_latent,
-            speaker_embedding,
-            repetition_penalty=5.0,
-            temperature=0.75,
-            enable_text_splitting=True,
-        )
-        inference_time = time.time() - t0
-        rtf = (time.time() - t0) / out["wav"].shape[-1] * 24000  # elapsed time divided by (sample count / 24000); NOTE(review): re-reads the clock instead of reusing inference_time
-        keep_len = calculate_keep_len(text, language)
-        out["wav"] = out["wav"][:keep_len]  # NOTE(review): when keep_len is -1 this drops the last element rather than keeping the full waveform
-        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)  # NOTE(review): fixed relative path, so overlapping requests would write the same file
-        info = f"⏱️ Thời gian sinh âm: {round(inference_time, 2)}s\n⚙️ RTF: {rtf:.2f}"
-        return "output.wav", info
-    except Exception as e:  # any failure is logged and turned into a (None, message) result instead of being raised
-        print("❌ Error:", str(e))
-        return None, f"Lỗi khi sinh giọng nói: {str(e)}"
-# ========== FASTAPI ==========
-api_app = FastAPI()
-@api_app.post("/api/speak")
-def speak_api(text: str = "Xin chào!", language: str = "vi"):  # POST /api/speak: synthesizes text with a fixed reference voice and returns the WAV file
-    ref_audio = "model/samples/nu-luu-loat.wav"  # the reference voice is hard-coded for API callers
-    audio_path, _ = predict(text, language, ref_audio, True)  # NOTE(review): predict returns None as the path on failure, and the message is discarded here
-    return FileResponse(audio_path, media_type="audio/wav")  # NOTE(review): audio_path is not checked for None before being handed to FileResponse
-# ========== GRADIO UI ==========
-with gr.Blocks(title="🇻🇳 Vietnamese TTS - capleaf/viXTTS") as demo:
-    gr.Markdown("## 🎙️ Text to Speech (ViXTTS)")
-    gr.Markdown("Nhập văn bản, chọn ngôn ngữ và giọng mẫu để tạo giọng nói.")
-    with gr.Row():
-        with gr.Column(scale=1):
-            input_text = gr.Textbox(
-                label="Văn bản cần đọc",
-                value="Xin chào! Tôi là mô hình tạo giọng nói tiếng Việt.",
-                lines=4,
-            )
-            lang_dd = gr.Dropdown(
-                label="Ngôn ngữ",
-                choices=["vi", "en", "zh-cn", "ja", "ko"],  # NOTE(review): hard-coded list; predict validates against supported_languages instead
-                value="vi",
-            )
-            ref_audio = gr.Audio(
-                label="Giọng mẫu (reference)",
-                type="filepath",
-                value="model/samples/nu-luu-loat.wav",
-            )
-            norm_cb = gr.Checkbox(label="Chuẩn hóa văn bản", value=True)
-            # ✅ This is the Predict button
-            tts_button = gr.Button("🎙️ Tạo giọng nói", variant="primary")
-        with gr.Column(scale=1):
-            output_audio = gr.Audio(label="Kết quả âm thanh", autoplay=True)
-            output_info = gr.Textbox(label="Thông tin chi tiết", interactive=False)
-    tts_button.click(
-        predict,
-        inputs=[input_text, lang_dd, ref_audio, norm_cb],  # same order as predict's parameters: text, language, ref_audio, normalize_text
-        outputs=[output_audio, output_info],  # predict's two return values, in order
     )
-# ========== RUN FASTAPI + GRADIO IN PARALLEL ==========
-if __name__ == "__main__":
-    def run_api():
-        uvicorn.run(api_app, host="0.0.0.0", port=8000)
-    threading.Thread(target=run_api, daemon=True).start()  # FastAPI is served on port 8000 from a daemon thread
-    demo.queue()
-    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)  # Gradio is launched on port 7860 from the main thread

 import os
 import torch
 import torchaudio
 import gradio as gr
 from huggingface_hub import snapshot_download, hf_hub_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
+# ========== LOAD MODEL ==========
 checkpoint_dir = "model/"
 repo_id = "capleaf/viXTTS"
 os.makedirs(checkpoint_dir, exist_ok=True)
+required = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]  # files that must already exist to skip the download
+if not all(x in os.listdir(checkpoint_dir) for x in required):  # NOTE(review): os.listdir is re-evaluated for each entry of required
     snapshot_download(repo_id=repo_id, local_dir=checkpoint_dir)
+    hf_hub_download("coqui/XTTS-v2", "speakers_xtts.pth", local_dir=checkpoint_dir)  # speaker file is fetched from coqui/XTTS-v2, not from repo_id
 config = XttsConfig()
+config.load_json(f"{checkpoint_dir}/config.json")  # checkpoint_dir already ends with "/", so the resulting path is "model//config.json"
 MODEL = Xtts.init_from_config(config)
+MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=False)
+# Force CPU + optimize for CPU inference
+MODEL.cpu()
+MODEL.model_gpt.float()  # NOTE(review): confirm the installed Xtts class exposes a model_gpt attribute — TODO verify against the TTS version in use
+MODEL.vocoder.float()  # NOTE(review): likewise confirm that a vocoder attribute exists on this Xtts version
+torch.set_num_threads(4)  # thread count is fixed at 4 regardless of the host's core count
+torch.backends.mkldnn.enabled = True
 # ========== TTS FUNCTION ==========
+def predict(text, ref_audio):  # returns (path of the generated WAV or None, status message)
+    if not text:  # rejects only falsy text (None or ""); whitespace-only text still reaches the model
+        return None, "⚠️ Nhập nội dung đi."
+    # derive the conditioning latents and speaker embedding from the reference audio
+    gpt_latent, spk_embed = MODEL.get_conditioning_latents(
+        audio_path=ref_audio,
+        gpt_cond_len=18,   # ↓ reduced to 18 → ~30% faster (the commit author's estimate)
+        gpt_cond_chunk_len=4,
+        max_ref_length=50,
     )
+    out = MODEL.inference(
+        text,
+        "vi",  # the language is hard-coded to Vietnamese
+        gpt_latent,
+        spk_embed,
+        enable_text_splitting=False,   # ✅ runs faster
+        temperature=0.7,
+        repetition_penalty=3.0,
+    )
+    wav = torch.tensor(out["wav"]).unsqueeze(0)  # add a leading dimension before saving
+    torchaudio.save("output.wav", wav, 24000)  # NOTE(review): fixed relative path, so overlapping requests would write the same file
+    return "output.wav", "✅ Xong rồi!"
+# ========== GRADIO UI (also serves as the API) ==========
+with gr.Blocks() as demo:
+    gr.Markdown("### 🇻🇳 ViXTTS - CPU Optimized (HuggingFace)")
+    text_in = gr.Textbox(label="Văn bản", value="Xin chào, đây là giọng nói tiếng Việt.")
+    ref_in = gr.Audio(label="Giọng mẫu", type="filepath", value="model/samples/nu-luu-loat.wav")  # default reference voice is a sample file under model/
+    speak_btn = gr.Button("🎙️ Tạo giọng")
+    audio_out = gr.Audio(label="Kết quả", autoplay=True)
+    info_out  = gr.Textbox(label="Trạng thái", interactive=False)
+    speak_btn.click(predict, inputs=[text_in, ref_in], outputs=[audio_out, info_out])  # predict's two return values fill audio_out and info_out, in that order
+demo.launch()  # runs at module level; there is no __main__ guard