kt005

Running

App Files Files Community

ktvoice committed 24 days ago

Commit

e16bdd9

verified ·

1 Parent(s): cd98ed3

Upload 3 files

Browse files

Files changed (3) hide show

app.py +41 -35
packages.txt +0 -1
tts_engine.py +33 -195

app.py CHANGED Viewed

@@ -7,16 +7,17 @@ import soundfile as sf
 import tempfile
 import torch
 import librosa
-from tts_engine import VoiceEngine
 import time
-# --- 1. SETUP MODEL (Sử dụng repo cá nhân của bạn) ---
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# THAY THẾ 'YOUR_USERNAME' bằng tên Hugging Face của bạn
 MY_BACKBONE_REPO = "ktvoice/Backbone"
 MY_CODEC_REPO = "ktvoice/Codec"
 try:
     tts = VoiceEngine(
         backbone_repo=MY_BACKBONE_REPO,
@@ -34,7 +35,7 @@ except Exception as e:
             return np.random.uniform(-0.1, 0.1, 24000*2)
     tts = MockTTS()
-# --- 2. DATA (Giữ nguyên danh sách giọng mẫu cục bộ) ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
@@ -48,7 +49,6 @@ VOICE_SAMPLES = {
     "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
 }
-# --- 3. HELPER FUNCTIONS ---
 def load_reference_info(voice_choice):
     if voice_choice in VOICE_SAMPLES:
         audio_path = VOICE_SAMPLES[voice_choice]["audio"]
@@ -65,7 +65,7 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
         if not text or text.strip() == "":
             return None, "⚠️ Vui lòng nhập nội dung!"
-        # Tiền xử lý văn bản để tăng độ nghỉ
         processed_text = text
         if pause_level == "Trung bình":
             processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
@@ -75,7 +75,7 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
         if len(processed_text) > 400:
              processed_text = processed_text[:400]
-        # Lấy Reference Data
         if mode_tab == "custom_mode":
             if custom_audio is None or not custom_text:
                 return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
@@ -86,12 +86,12 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
             with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
                 ref_text_raw = f.read()
-        # Inference
         start_time = time.time()
         ref_codes = tts.encode_reference(ref_audio_path)
         wav = tts.infer(processed_text, ref_codes, ref_text_raw)
-        # Điều chỉnh tốc độ (Time-stretching)
         if speed_value != 1.0:
             wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
@@ -101,11 +101,11 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
             sf.write(tmp_file.name, wav, 24000)
             output_path = tmp_file.name
-        return output_path, f"⚡ Thành công: {process_time:.2f}s | Tốc độ: {speed_value}x"
     except Exception as e:
         return None, f"❌ Lỗi: {str(e)}"
-# --- 4. THEME & CSS (Deep Night Pro) ---
 theme = gr.themes.Default(
     primary_hue="indigo",
     secondary_hue="blue",
@@ -121,11 +121,11 @@ theme = gr.themes.Default(
 )
 css = """
-.main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
 .st-card {
     border-radius: 16px !important;
     border: 1px solid rgba(255,255,255,0.1) !important;
-    box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;
     padding: 15px;
 }
 .result-card {
@@ -134,60 +134,66 @@ css = """
     margin-top: 15px;
 }
 audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
-.footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }
 """
-# --- 5. UI CONSTRUCTION ---
 with gr.Blocks(title="AI Voice Studio") as demo:
     with gr.Column(elem_classes="main-wrap"):
         with gr.Row(equal_height=True):
-            # TRÁI: Nhập văn bản
             with gr.Column(scale=1):
                 with gr.Group(elem_classes="st-card"):
                     text_input = gr.Textbox(
-                        label="VĂN BẢN CẦN CHUYỂN ĐỔI",
-                        placeholder="Chào mừng bạn. Hãy nhập nội dung vào đây...",
-                        lines=20,
                         show_label=True,
                     )
-                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")
-            # PHẢI: Cấu hình
             with gr.Column(scale=1):
                 with gr.Tabs() as tabs:
-                    with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode"):
                         voice_select = gr.Dropdown(
                             choices=list(VOICE_SAMPLES.keys()),
                             value="Tuyên (nam miền Bắc)",
-                            label="Chọn giọng đọc",
                         )
                         with gr.Accordion("Nghe thử giọng mẫu", open=False):
                             ref_audio_preview = gr.Audio(interactive=False, show_label=False)
                             ref_text_preview = gr.Markdown("...")
-                    with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode"):
-                        custom_audio = gr.Audio(label="Audio gốc", type="filepath")
-                        custom_text = gr.Textbox(label="NỘI DUNG AUDIO MẪU", lines=4)
-                # Cấu hình âm thanh chuyên nghiệp
                 with gr.Row():
                     pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
                     speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
                 current_mode = gr.State(value="preset_mode")
                 gr.Markdown("<br>")
-                btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")
                 with gr.Group(elem_classes="st-card result-card"):
-                    audio_output = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True)
-                    status_output = gr.Markdown("<p style='text-align: center; color: #818cf8;'>✨ Sẵn sàng</p>")
-        gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")
     # LOGIC
-    text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
-    voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
     tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
     tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
     btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])

 import tempfile
 import torch
 import librosa
 import time
+# IMPORT TỪ FILE ENGINE CỦA BẠN
+from tts_engine import VoiceEngine
+# CẤU HÌNH REPO CÁ NHÂN CỦA KTVOICE
 MY_BACKBONE_REPO = "ktvoice/Backbone"
 MY_CODEC_REPO = "ktvoice/Codec"
+device = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     tts = VoiceEngine(
         backbone_repo=MY_BACKBONE_REPO,
             return np.random.uniform(-0.1, 0.1, 24000*2)
     tts = MockTTS()
+# --- DATA GIỌNG MẪU ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
     "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
 }
 def load_reference_info(voice_choice):
     if voice_choice in VOICE_SAMPLES:
         audio_path = VOICE_SAMPLES[voice_choice]["audio"]
         if not text or text.strip() == "":
             return None, "⚠️ Vui lòng nhập nội dung!"
+        # Xử lý độ ngắt nghỉ (Pause level)
         processed_text = text
         if pause_level == "Trung bình":
             processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
         if len(processed_text) > 400:
              processed_text = processed_text[:400]
+        # Lấy dữ liệu Reference
         if mode_tab == "custom_mode":
             if custom_audio is None or not custom_text:
                 return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
             with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
                 ref_text_raw = f.read()
+        # Thực hiện Inference
         start_time = time.time()
         ref_codes = tts.encode_reference(ref_audio_path)
         wav = tts.infer(processed_text, ref_codes, ref_text_raw)
+        # Điều chỉnh Tốc độ
         if speed_value != 1.0:
             wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
             sf.write(tmp_file.name, wav, 24000)
             output_path = tmp_file.name
+        return output_path, f"⚡ Hoàn tất: {process_time:.2f}s | Tốc độ: {speed_value}x"
     except Exception as e:
         return None, f"❌ Lỗi: {str(e)}"
+# --- UI SETUP (Premium Dark Mode) ---
 theme = gr.themes.Default(
     primary_hue="indigo",
     secondary_hue="blue",
 )
 css = """
+.main-wrap { max-width: 1240px !important; margin: auto !important; padding: 30px 20px !important; }
 .st-card {
     border-radius: 16px !important;
     border: 1px solid rgba(255,255,255,0.1) !important;
+    box-shadow: 0 4px 25px rgba(0,0,0,0.6) !important;
     padding: 15px;
 }
 .result-card {
     margin-top: 15px;
 }
 audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
+.footer { text-align: center; margin-top: 50px; color: #475569; font-size: 0.85rem; letter-spacing: 1px; }
 """
 with gr.Blocks(title="AI Voice Studio") as demo:
     with gr.Column(elem_classes="main-wrap"):
+        # Đã xóa phần Header "VieNeu Studio" theo yêu cầu
         with gr.Row(equal_height=True):
+            # CỘT TRÁI: NHẬP VĂN BẢN
             with gr.Column(scale=1):
                 with gr.Group(elem_classes="st-card"):
                     text_input = gr.Textbox(
+                        label="VĂN BẢN ĐẦU VÀO",
+                        placeholder="Nhập nội dung cần chuyển đổi giọng nói...",
+                        lines=24, # Tăng số dòng để cân bằng với cột phải
                         show_label=True,
                     )
+                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-weight: bold; padding: 5px;'>0 / 250</div>")
+            # CỘT PHẢI: CẤU HÌNH
             with gr.Column(scale=1):
                 with gr.Tabs() as tabs:
+                    with gr.TabItem("👤 Giọng Mẫu", id="preset_mode"):
                         voice_select = gr.Dropdown(
                             choices=list(VOICE_SAMPLES.keys()),
                             value="Tuyên (nam miền Bắc)",
+                            label="Lựa chọn nghệ sĩ",
                         )
                         with gr.Accordion("Nghe thử giọng mẫu", open=False):
                             ref_audio_preview = gr.Audio(interactive=False, show_label=False)
                             ref_text_preview = gr.Markdown("...")
+                    with gr.TabItem("🎙️ Tự Clone", id="custom_mode"):
+                        gr.Markdown("<p style='color: #94a3b8; font-size: 0.85rem; margin-bottom: 5px;'>Tải lên audio nguồn để hệ thống mô phỏng giọng nói.</p>")
+                        custom_audio = gr.Audio(label="Audio mẫu (.wav/mp3)", type="filepath")
+                        # Ô nội dung mẫu được làm rộng hơn (lines=6)
+                        custom_text = gr.Textbox(
+                            label="NỘI DUNG AUDIO MẪU",
+                            placeholder="Nhập chính xác lời thoại của audio mẫu để AI học nhịp điệu...",
+                            lines=6,
+                            show_label=True
+                        )
                 with gr.Row():
                     pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
                     speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
                 current_mode = gr.State(value="preset_mode")
                 gr.Markdown("<br>")
+                btn_generate = gr.Button("TỔNG HỢP GIỌNG NÓI", variant="primary", size="lg")
                 with gr.Group(elem_classes="st-card result-card"):
+                    audio_output = gr.Audio(label="KẾT QUẢ ÂM THANH", interactive=False, autoplay=True)
+                    status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Hệ thống sẵn sàng thực hiện</p>")
+        gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL STUDIO EDITION 2025</div>")
     # LOGIC
+    text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
+    voice_select.change(lambda v: load_reference_info(v), voice_select, [ref_audio_preview, ref_text_preview])
     tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
     tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
     btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])

packages.txt CHANGED Viewed

@@ -1,3 +1,2 @@
 espeak-ng
-libespeak-ng1
 ffmpeg


1	espeak-ng

2	ffmpeg

tts_engine.py CHANGED Viewed

@@ -36,9 +36,9 @@ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
 class VoiceEngine:
     def __init__(
         self,
-        backbone_repo="pnnbao-ump/VieNeu-TTS",
         backbone_device="cpu",
-        codec_repo="neuphonic/neucodec",
         codec_device="cpu",
     ):
@@ -52,14 +52,13 @@ class VoiceEngine:
         self.streaming_lookback = 50
         self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
-        # ggml & onnx flags
         self._is_quantized_model = False
         self._is_onnx_codec = False
-        # HF tokenizer
         self.tokenizer = None
-        # Load models
         self._load_backbone(backbone_repo, backbone_device)
         self._load_codec(codec_repo, codec_device)
@@ -70,11 +69,7 @@ class VoiceEngine:
             try:
                 from llama_cpp import Llama
             except ImportError as e:
-                raise ImportError(
-                    "Failed to import `llama_cpp`. "
-                    "Please install it with:\n"
-                    "    pip install llama-cpp-python"
-                ) from e
             self.backbone = Llama.from_pretrained(
                 repo_id=backbone_repo,
                 filename="*.gguf",
@@ -85,7 +80,6 @@ class VoiceEngine:
                 flash_attn=True if backbone_device == "gpu" else False,
             )
             self._is_quantized_model = True
         else:
             self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
             self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
@@ -94,98 +88,55 @@ class VoiceEngine:
     def _load_codec(self, codec_repo, codec_device):
         print(f"Loading codec from: {codec_repo} on {codec_device} ...")
-        match codec_repo:
-            case "neuphonic/neucodec":
-                self.codec = NeuCodec.from_pretrained(codec_repo)
-                self.codec.eval().to(codec_device)
-            case "neuphonic/distill-neucodec":
-                self.codec = DistillNeuCodec.from_pretrained(codec_repo)
-                self.codec.eval().to(codec_device)
-            case "neuphonic/neucodec-onnx-decoder":
-                if codec_device != "cpu":
-                    raise ValueError("Onnx decoder only currently runs on CPU.")
-                try:
-                    from neucodec import NeuCodecOnnxDecoder
-                except ImportError as e:
-                    raise ImportError(
-                        "Failed to import the onnx decoder."
-                        " Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
-                    ) from e
-                self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
-                self._is_onnx_codec = True
-            case _:
-                raise ValueError(f"Unsupported codec repository: {codec_repo}")
     def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
-        """
-        Perform inference to generate speech from text using the TTS model and reference audio.
-        Args:
-            text (str): Input text to be converted to speech.
-            ref_codes (np.ndarray | torch.tensor): Encoded reference.
-            ref_text (str): Reference text for reference audio. Defaults to None.
-        Returns:
-            np.ndarray: Generated speech waveform.
-        """
-        # Generate tokens
         if self._is_quantized_model:
             output_str = self._infer_ggml(ref_codes, ref_text, text)
         else:
             prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
             output_str = self._infer_torch(prompt_ids)
-        # Decode
         wav = self._decode(output_str)
         return wav
-    def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
-        """
-        Perform streaming inference to generate speech from text using the TTS model and reference audio.
-        Args:
-            text (str): Input text to be converted to speech.
-            ref_codes (np.ndarray | torch.tensor): Encoded reference.
-            ref_text (str): Reference text for reference audio. Defaults to None.
-        Yields:
-            np.ndarray: Generated speech waveform.
-        """
-        if self._is_quantized_model:
-            return self._infer_stream_ggml(ref_codes, ref_text, text)
-        else:
-            raise NotImplementedError("Streaming is not implemented for the torch backend!")
     def encode_reference(self, ref_audio_path: str | Path):
         wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
-        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)  # [1, 1, T]
         with torch.no_grad():
             ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
         return ref_codes
     def _decode(self, codes: str):
-        """Decode speech tokens to audio waveform."""
-        # Extract speech token IDs using regex
         speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
         if len(speech_ids) == 0:
-            raise ValueError(
-                "No valid speech tokens found in the output. "
-                "The model may not have generated proper speech tokens."
-            )
-        # Onnx decode
         if self._is_onnx_codec:
-            codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
-            recon = self.codec.decode_code(codes)
-        # Torch decode
         else:
             with torch.no_grad():
-                codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
-                    self.codec.device
-                )
-                recon = self.codec.decode_code(codes).cpu().numpy()
         return recon[0, 0, :]
@@ -199,17 +150,11 @@ class VoiceEngine:
         text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
         input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
-        chat = """user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"""
         ids = self.tokenizer.encode(chat)
         text_replace_idx = ids.index(text_replace)
-        ids = (
-            ids[:text_replace_idx]
-            + [text_prompt_start]
-            + input_ids
-            + [text_prompt_end]
-            + ids[text_replace_idx + 1 :]  # noqa
-        )
         speech_replace_idx = ids.index(speech_replace)
         codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
@@ -236,111 +181,4 @@ class VoiceEngine:
         output_str = self.tokenizer.decode(
             output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
         )
-        return output_str
-    def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
-        ref_text = phonemize_with_dict(ref_text)
-        input_text = phonemize_with_dict(input_text)
-        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
-        prompt = (
-            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
-            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
-        )
-        output = self.backbone(
-            prompt,
-            max_tokens=self.max_context,
-            temperature=1.0,
-            top_k=50,
-            stop=["<|SPEECH_GENERATION_END|>"],
-        )
-        output_str = output["choices"][0]["text"]
-        return output_str
-    def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
-        ref_text = phonemize_with_dict(ref_text)
-        input_text = phonemize_with_dict(input_text)
-        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
-        prompt = (
-            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
-            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
-        )
-        audio_cache: list[np.ndarray] = []
-        token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
-        n_decoded_samples: int = 0
-        n_decoded_tokens: int = len(ref_codes)
-        for item in self.backbone(
-            prompt,
-            max_tokens=self.max_context,
-            temperature=0.2,
-            top_k=50,
-            stop=["<|SPEECH_GENERATION_END|>"],
-            stream=True
-        ):
-            output_str = item["choices"][0]["text"]
-            token_cache.append(output_str)
-            if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:
-                # decode chunk
-                tokens_start = max(
-                    n_decoded_tokens
-                    - self.streaming_lookback
-                    - self.streaming_overlap_frames,
-                    0
-                )
-                tokens_end = (
-                    n_decoded_tokens
-                    + self.streaming_frames_per_chunk
-                    + self.streaming_lookforward
-                    + self.streaming_overlap_frames
-                )
-                sample_start = (
-                    n_decoded_tokens - tokens_start
-                ) * self.hop_length
-                sample_end = (
-                    sample_start
-                    + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
-                )
-                curr_codes = token_cache[tokens_start:tokens_end]
-                recon = self._decode("".join(curr_codes))
-                recon = recon[sample_start:sample_end]
-                audio_cache.append(recon)
-                # postprocess
-                processed_recon = _linear_overlap_add(
-                    audio_cache, stride=self.streaming_stride_samples
-                )
-                new_samples_end = len(audio_cache) * self.streaming_stride_samples
-                processed_recon = processed_recon[
-                    n_decoded_samples:new_samples_end
-                ]
-                n_decoded_samples = new_samples_end
-                n_decoded_tokens += self.streaming_frames_per_chunk
-                yield processed_recon
-        # final decoding handled separately as non-constant chunk size
-        remaining_tokens = len(token_cache) - n_decoded_tokens
-        if len(token_cache) > n_decoded_tokens:
-            tokens_start = max(
-                len(token_cache)
-                - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
-                0
-            )
-            sample_start = (
-                len(token_cache)
-                - tokens_start
-                - remaining_tokens
-                - self.streaming_overlap_frames
-            ) * self.hop_length
-            curr_codes = token_cache[tokens_start:]
-            recon = self._decode("".join(curr_codes))
-            recon = recon[sample_start:]
-            audio_cache.append(recon)
-            processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
-            processed_recon = processed_recon[n_decoded_samples:]
-            yield processed_recon

 class VoiceEngine:
     def __init__(
         self,
+        backbone_repo="ktvoice/Backbone", # Thiết lập mặc định về repo của bạn
         backbone_device="cpu",
+        codec_repo="ktvoice/Codec",       # Thiết lập mặc định về repo của bạn
         codec_device="cpu",
     ):
         self.streaming_lookback = 50
         self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
+        # Flags
         self._is_quantized_model = False
         self._is_onnx_codec = False
         self.tokenizer = None
+        # Khởi tạo mô hình
         self._load_backbone(backbone_repo, backbone_device)
         self._load_codec(codec_repo, codec_device)
             try:
                 from llama_cpp import Llama
             except ImportError as e:
+                raise ImportError("Vui lòng cài đặt llama-cpp-python để dùng model GGUF.") from e
             self.backbone = Llama.from_pretrained(
                 repo_id=backbone_repo,
                 filename="*.gguf",
                 flash_attn=True if backbone_device == "gpu" else False,
             )
             self._is_quantized_model = True
         else:
             self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
             self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
     def _load_codec(self, codec_repo, codec_device):
         print(f"Loading codec from: {codec_repo} on {codec_device} ...")
+        # Cập nhật logic load codec linh hoạt hơn để chấp nhận repo ktvoice/Codec
+        codec_repo_lower = codec_repo.lower()
+        if "distill" in codec_repo_lower:
+            self.codec = DistillNeuCodec.from_pretrained(codec_repo)
+        elif "onnx" in codec_repo_lower:
+            try:
+                from neucodec import NeuCodecOnnxDecoder
+            except ImportError as e:
+                raise ImportError("Vui lòng cài đặt onnxruntime và neucodec >= 0.0.4.") from e
+            self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
+            self._is_onnx_codec = True
+        else:
+            # Mặc định load NeuCodec (phù hợp với repository ktvoice/Codec của bạn)
+            self.codec = NeuCodec.from_pretrained(codec_repo)
+        if not self._is_onnx_codec:
+            self.codec.eval().to(codec_device)
     def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
         """Generate a speech waveform for ``text`` conditioned on a reference voice.

         Args:
             text: Input text to be converted to speech.
             ref_codes: Encoded reference audio (the output of ``encode_reference``).
             ref_text: Transcript of the reference audio.

         Returns:
             The generated waveform, as produced by ``_decode``.
         """
         # Step 1: produce the speech-token string with whichever backbone is loaded.
         if self._is_quantized_model:
             # GGUF / llama.cpp backbone: the prompt is built and completed as plain text.
             output_str = self._infer_ggml(ref_codes, ref_text, text)
         else:
             # Transformers backbone: build token ids via the chat template, then generate.
             prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
             output_str = self._infer_torch(prompt_ids)
         # Step 2: turn the "<|speech_N|>" tokens back into audio samples.
         wav = self._decode(output_str)
         return wav

     def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
         """Run one non-streaming completion on the quantized (GGUF) backbone.

         ``infer`` dispatches here whenever ``_is_quantized_model`` is set, so this
         helper must exist for GGUF backbones to work at all.

         Args:
             ref_codes: Speech code ids of the reference audio.
             ref_text: Transcript of the reference audio.
             input_text: Text to synthesize.

         Returns:
             The raw completion text, expected to contain ``<|speech_N|>`` tokens.
         """
         # NOTE(review): relies on a module-level `phonemize_with_dict` and on
         # `self.max_context` being set in `__init__` — confirm both are still present.
         ref_text = phonemize_with_dict(ref_text)
         input_text = phonemize_with_dict(input_text)

         # The reference codes are appended after the generation-start marker so the
         # model continues the speech-token sequence from the reference audio.
         codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
         prompt = (
             f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
             f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
         )
         output = self.backbone(
             prompt,
             max_tokens=self.max_context,
             temperature=1.0,
             top_k=50,
             stop=["<|SPEECH_GENERATION_END|>"],
         )
         return output["choices"][0]["text"]
     def encode_reference(self, ref_audio_path: str | Path):
         wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
+        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
         with torch.no_grad():
             ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
         return ref_codes
     def _decode(self, codes: str):
         speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
         if len(speech_ids) == 0:
+            raise ValueError("Hệ thống không tạo được token speech hợp lệ.")
         if self._is_onnx_codec:
+            codes_np = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
+            recon = self.codec.decode_code(codes_np)
         else:
             with torch.no_grad():
+                codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
+                recon = self.codec.decode_code(codes_tensor).cpu().numpy()
         return recon[0, 0, :]
         text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
         input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
+        chat = "user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"
         ids = self.tokenizer.encode(chat)
         text_replace_idx = ids.index(text_replace)
+        ids = ids[:text_replace_idx] + [text_prompt_start] + input_ids + [text_prompt_end] + ids[text_replace_idx + 1 :]
         speech_replace_idx = ids.index(speech_replace)
         codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
         output_str = self.tokenizer.decode(
             output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
         )
+        return output_str