kt007

Running

App Files Community

ktvoice committed on Dec 28, 2025

Commit

002e5d4

verified ·

1 Parent(s): e58b011

Upload 2 files

Browse files

Files changed (2) hide show

app.py +45 -38
tts_engine.py +31 -42

app.py CHANGED Viewed

@@ -2,12 +2,11 @@ import spaces, os, gradio as gr, soundfile as sf, tempfile, torch, librosa, time
 os.environ['SPACES_ZERO_GPU'] = '1'
 from tts_engine import VoiceEngine
-# --- 1. SETUP MODEL (GIỮ LOGIC TỰ ĐỘNG CỦA TÁC GIẢ) ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     tts = VoiceEngine(backbone_repo="ktvoice/Backbone", backbone_device=device, codec_repo="ktvoice/Codec", codec_device=device)
-except Exception as e:
-    print(f"⚠️ Lỗi: {e}")
     tts = None
 VOICE_SAMPLES = {
@@ -23,62 +22,70 @@ VOICE_SAMPLES = {
     "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
 }
-def load_ref(choice):
-    if choice in VOICE_SAMPLES:
-        with open(VOICE_SAMPLES[choice]["text"], "r", encoding="utf-8") as f:
-            return VOICE_SAMPLES[choice]["audio"], f.read()
-    return None, ""
 @spaces.GPU(duration=120)
-def process_tts(text, voice, c_audio, c_text, mode, pause, speed):
-    if not tts: return None, "❌ Lỗi khởi tạo mô hình!"
     try:
-        ref_path, ref_txt = (c_audio, c_text) if mode == "custom" else (VOICE_SAMPLES[voice]["audio"], open(VOICE_SAMPLES[voice]["text"], "r", encoding="utf-8").read())
         start = time.time()
         codes = tts.encode_reference(ref_path)
         wav = tts.infer(text[:400], codes, ref_txt)
         if speed != 1.0: wav = librosa.effects.time_stretch(wav, rate=float(speed))
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
             sf.write(tmp.name, wav, 24000)
-            return tmp.name, f"✅ Hoàn tất ({time.time()-start:.2f}s)"
     except Exception as e: return None, f"❌ Lỗi: {str(e)}"
-# --- UI ---
-theme = gr.themes.Default(primary_hue="indigo", neutral_hue="slate", font=[gr.themes.GoogleFont('Inter'), 'sans-serif']).set(
-    body_background_fill="#020617", block_background_fill="#0f172a",
-    input_background_fill="#1e293b", button_primary_background_fill="linear-gradient(135deg, #6366f1 0%, #a855f7 100%)",
 )
-css = ".main-wrap { max-width: 1280px !important; margin: auto !important; padding: 20px !important; } .st-card { border-radius: 12px !important; border: 1px solid rgba(255,255,255,0.1) !important; padding: 25px !important; background: #0f172a !important; } * { font-family: 'Inter', sans-serif !important; } label span { font-weight: 700 !important; color: #818cf8 !important; text-transform: uppercase; font-size: 0.75rem !important; } .footer { text-align: center; margin-top: 50px; color: #475569; font-size: 0.8rem; }"
-with gr.Blocks(title="AI Studio") as demo:
-    with gr.Column(elem_classes="main-wrap"):
-        with gr.Row():
             with gr.Column(scale=1):
-                with gr.Group(elem_classes="st-card"):
-                    txt = gr.Textbox(label="VĂN BẢN ĐẦU VÀO", lines=20, placeholder="Nhập nội dung...")
-                    gr.HTML("<div style='text-align: right; color: #6366f1; font-weight: 700;'>0 / 250</div>")
             with gr.Column(scale=1):
-                with gr.Tabs() as ts:
                     with gr.TabItem("👤 Giọng Nghệ Sĩ", id="preset"):
                         v_sel = gr.Dropdown(choices=list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Chọn nghệ sĩ")
-                        with gr.Accordion("Nghe thử", open=False): rp, rt = gr.Audio(interactive=False), gr.Markdown()
-                    with gr.TabItem("🎙️ Tự Nhân Bản", id="custom"):
                         ca = gr.Audio(label="Audio gốc", type="filepath")
-                        ct = gr.Textbox(label="Nội dung audio mẫu", lines=5)
                 with gr.Row():
                     pl = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Ngắt nghỉ")
                     sv = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ")
-                md = gr.State("preset")
-                btn = gr.Button("TẠO GIỌNG NÓI NGAY", variant="primary", size="lg")
-                with gr.Group(elem_classes="st-card"):
-                    ao, st = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True), gr.Markdown("<p style='text-align: center; color: #6366f1;'>Sẵn sàng</p>")
-        gr.HTML("<div class='footer'>AI VOICE ENGINE • PROFESSIONAL STUDIO 2025</div>")
-    v_sel.change(load_ref, v_sel, [rp, rt])
-    ts.children[0].select(lambda: "preset", None, md)
-    ts.children[1].select(lambda: "custom", None, md)
-    btn.click(process_tts, [txt, v_sel, ca, ct, md, pl, sv], [ao, st])
 if __name__ == "__main__":
-    demo.queue().launch(theme=theme, css=css, server_name="0.0.0.0", server_port=7860)

 os.environ['SPACES_ZERO_GPU'] = '1'
 from tts_engine import VoiceEngine
+# --- SETUP ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     tts = VoiceEngine(backbone_repo="ktvoice/Backbone", backbone_device=device, codec_repo="ktvoice/Codec", codec_device=device)
+except Exception:
     tts = None
 VOICE_SAMPLES = {
     "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
 }
 @spaces.GPU(duration=120)
+def run_tts(text, voice, c_audio, c_text, mode, pause, speed):
+    if not tts: return None, "❌ Lỗi hệ thống"
     try:
         start = time.time()
+        ref_path, ref_txt = (c_audio, c_text) if mode == "custom" else (VOICE_SAMPLES[voice]["audio"], open(VOICE_SAMPLES[voice]["text"], "r", encoding="utf-8").read())
         codes = tts.encode_reference(ref_path)
         wav = tts.infer(text[:400], codes, ref_txt)
         if speed != 1.0: wav = librosa.effects.time_stretch(wav, rate=float(speed))
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
             sf.write(tmp.name, wav, 24000)
+            return tmp.name, f"✅ Thành công ({time.time()-start:.2f}s)"
     except Exception as e: return None, f"❌ Lỗi: {str(e)}"
+# --- CẤU HÌNH GIAO DIỆN (TONE XANH ĐẬM STUDIO) ---
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    neutral_hue="slate",
+    font=[gr.themes.GoogleFont('Roboto'), 'sans-serif']
+).set(
+    body_background_fill="#0f172a",
+    block_background_fill="#1e293b",
+    input_background_fill="#334155",
+    button_primary_background_fill="linear-gradient(135deg, #2563eb 0%, #0891b2 100%)", # Chuyển sang xanh biển/cyan
+    button_primary_text_color="white",
+    block_label_text_size="0.85rem",
+    block_title_text_weight="600",
 )
+css = """
+.main-container { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
+.card-box { border-radius: 12px !important; border: 1px solid rgba(255,255,255,0.05) !important; padding: 20px; background: #1e293b !important; }
+textarea, input, span, label { line-height: 1.6 !important; } /* Sửa lỗi font đè chữ */
+.footer-text { text-align: center; margin-top: 40px; color: #64748b; font-size: 0.8rem; letter-spacing: 1px; }
+"""
+with gr.Blocks(theme=theme, css=css, title="AI Studio") as demo:
+    with gr.Column(elem_classes="main-container"):
+        with gr.Row(equal_height=True):
             with gr.Column(scale=1):
+                with gr.Group(elem_classes="card-box"):
+                    t_in = gr.Textbox(label="VĂN BẢN ĐẦU VÀO", lines=20, placeholder="Nhập văn bản cần chuyển giọng...")
+                    t_cnt = gr.HTML("<div style='text-align: right; color: #38bdf8; font-weight: 700; padding: 5px;'>0 / 250</div>")
             with gr.Column(scale=1):
+                with gr.Tabs() as tabs:
                     with gr.TabItem("👤 Giọng Nghệ Sĩ", id="preset"):
                         v_sel = gr.Dropdown(choices=list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Chọn nghệ sĩ")
+                    with gr.TabItem("🎙️ Nhân Bản Giọng", id="custom"):
                         ca = gr.Audio(label="Audio gốc", type="filepath")
+                        ct = gr.Textbox(label="Lời thoại audio gốc", lines=4)
                 with gr.Row():
                     pl = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Ngắt nghỉ")
                     sv = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ")
+                m_state = gr.State("preset")
+                btn = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")
+                with gr.Group(elem_classes="card-box"):
+                    ao = gr.Audio(label="KẾT QUẢ ÂM THANH", interactive=False, autoplay=True)
+                    st = gr.Markdown("<p style='text-align: center; color: #38bdf8;'>Hệ thống sẵn sàng</p>")
+        gr.HTML("<div class='footer-text'>POWERED BY KTVOICE STUDIO • 2025</div>")
+    t_in.change(lambda t: f"<div style='text-align: right; color: {'#38bdf8' if len(t)<=250 else '#f43f5e'}; font-weight: 700; padding: 5px;'>{len(t)} / 250</div>", t_in, t_cnt)
+    tabs.children[0].select(lambda: "preset", None, m_state)
+    tabs.children[1].select(lambda: "custom", None, m_state)
+    btn.click(run_tts, [t_in, v_sel, ca, ct, m_state, pl, sv], [ao, st])
 if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860)

tts_engine.py CHANGED Viewed

@@ -4,37 +4,37 @@ import time
 import torch
 import librosa
 import numpy as np
 from pathlib import Path
 from typing import Generator
 from huggingface_hub import snapshot_download
-# --- BẢN VÁ (PATCH) ĐỂ CHẠY ĐƯỢC VỚI REPO CÁ NHÂN KTVOICE ---
 import neucodec.model
-import json
-# Lưu lại hàm gốc của thư viện neucodec
-_orig_from_pretrained = neucodec.model.NeuCodec._from_pretrained
-@classmethod
-def _patched_from_pretrained(cls, model_id, *args, **kwargs):
-    """
-    Bản vá này giúp vượt qua lệnh assert model_id in [...] của thư viện neucodec.
-    Nó cho phép nạp mô hình từ bất kỳ repo nào (như ktvoice/Codec).
-    """
-    # Nếu model_id là một đường dẫn local hoặc repo cá nhân,
-    # chúng ta "đánh lừa" thư viện bằng cách dùng tên repo chính thức để qua cửa assert.
-    valid_ids = ["neuphonic/neucodec", "neuphonic/distill-neucodec"]
-    check_id = model_id
-    if model_id not in valid_ids:
-        check_id = "neuphonic/neucodec"
-    # Thực hiện nạp mô hình (Lệnh assert sẽ kiểm tra check_id thay vì model_id của bạn)
-    return _orig_from_pretrained(check_id, *args, **kwargs)
-# Áp dụng bản vá vào cả hai lớp của thư viện neucodec
-neucodec.model.NeuCodec._from_pretrained = _patched_from_pretrained
-neucodec.model.DistillNeuCodec._from_pretrained = _patched_from_pretrained
-# -----------------------------------------------------------
 from neucodec import NeuCodec, DistillNeuCodec
 from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -61,11 +61,8 @@ class VoiceEngine:
     def __init__(self, backbone_repo="ktvoice/Backbone", backbone_device="cpu", codec_repo="ktvoice/Codec", codec_device="cpu"):
         self.sample_rate = 24_000
         self.max_context = 2048
-        self.hop_length = 480
         self._is_quantized_model = False
-        self._is_onnx_codec = False
         self.tokenizer = None
         self._load_backbone(backbone_repo, backbone_device)
         self._load_codec(codec_repo, codec_device)
@@ -80,19 +77,16 @@ class VoiceEngine:
             self.backbone = AutoModelForCausalLM.from_pretrained(repo).to(torch.device(device))
     def _load_codec(self, repo, device):
-        print(f"Loading codec from: {repo} on {device} ...")
-        # Tải mô hình về thư mục tạm
         local_dir = snapshot_download(repo_id=repo)
-        # GIẢI THÍCH: Tại sao cần tạo config.json giả?
-        # Thư viện neucodec mặc định tìm config.json khi repo name không phải là 'neuphonic/neucodec'.
-        # Chúng ta tạo một file config.json tối giản trong thư mục snapshot để đánh lừa nó.
-        config_path = os.path.join(local_dir, "config.json")
-        if not os.path.exists(config_path):
-            with open(config_path, "w") as f:
-                json.dump({"model_type": "neucodec"}, f)
-        # Nạp mô hình từ đường dẫn cục bộ
         if "distill" in repo.lower():
             self.codec = DistillNeuCodec.from_pretrained(local_dir)
         else:
@@ -106,13 +100,8 @@ class VoiceEngine:
             return self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
     def infer(self, text, ref_codes, ref_text):
-        if self._is_quantized_model:
-            # Placeholder cho logic GGUF nếu bạn cần
-            return np.zeros(48000)
         prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
         prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(self.backbone.device)
         with torch.no_grad():
             out = self.backbone.generate(prompt_tensor, max_length=self.max_context, do_sample=True, temperature=1)

 import torch
 import librosa
 import numpy as np
+import json
 from pathlib import Path
 from typing import Generator
 from huggingface_hub import snapshot_download
+# --- BẢN VÁ CAO CẤP (MONKEY PATCH) ĐỂ CHẠY REPO KTVOICE ---
 import neucodec.model
+def _apply_robust_patch(target_cls):
+    """Vá lỗi AssertionError và TypeError cho thư viện neucodec"""
+    orig_func = target_cls._from_pretrained
+    @classmethod
+    def _patched_func(cls, *args, **kwargs):
+        # Đảm bảo model_id luôn hợp lệ để thoả mãn lệnh assert của thư viện
+        official_id = "neuphonic/distill-neucodec" if "distill" in str(cls).lower() else "neuphonic/neucodec"
+        # Sửa lỗi: HubMixin truyền model_id ở vị trí đầu tiên
+        if args:
+            kwargs["model_id"] = official_id
+            return orig_func(*args[1:], **kwargs)
+        else:
+            kwargs["model_id"] = official_id
+            return orig_func(**kwargs)
+    target_cls._from_pretrained = _patched_func
+# Áp dụng cho cả 2 lớp của neucodec
+_apply_robust_patch(neucodec.model.NeuCodec)
+_apply_robust_patch(neucodec.model.DistillNeuCodec)
+# -------------------------------------------------------
 from neucodec import NeuCodec, DistillNeuCodec
 from transformers import AutoTokenizer, AutoModelForCausalLM
     def __init__(self, backbone_repo="ktvoice/Backbone", backbone_device="cpu", codec_repo="ktvoice/Codec", codec_device="cpu"):
         self.sample_rate = 24_000
         self.max_context = 2048
         self._is_quantized_model = False
         self.tokenizer = None
         self._load_backbone(backbone_repo, backbone_device)
         self._load_codec(codec_repo, codec_device)
             self.backbone = AutoModelForCausalLM.from_pretrained(repo).to(torch.device(device))
     def _load_codec(self, repo, device):
+        print(f"Loading codec from your repo: {repo} ...")
+        # Tải trọng số từ repo ktvoice của bạn
         local_dir = snapshot_download(repo_id=repo)
+        # Tạo file cấu hình tạm thời để tránh lỗi "config.json not found"
+        # File này chỉ dùng để kích hoạt trình nạp của Hugging Face
+        tmp_config = os.path.join(local_dir, "config.json")
+        if not os.path.exists(tmp_config):
+            with open(tmp_config, "w") as f: json.dump({"model_type": "neucodec"}, f)
         if "distill" in repo.lower():
             self.codec = DistillNeuCodec.from_pretrained(local_dir)
         else:
             return self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
     def infer(self, text, ref_codes, ref_text):
         prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
         prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(self.backbone.device)
         with torch.no_grad():
             out = self.backbone.generate(prompt_tensor, max_length=self.max_context, do_sample=True, temperature=1)