ktvoice committed on
Commit e16bdd9 · verified · 1 Parent(s): cd98ed3

Upload 3 files

Files changed (3)
  1. app.py +41 -35
  2. packages.txt +0 -1
  3. tts_engine.py +33 -195
app.py CHANGED
@@ -7,16 +7,17 @@ import soundfile as sf
7
  import tempfile
8
  import torch
9
  import librosa
10
- from tts_engine import VoiceEngine
11
  import time
12
 
13
- # --- 1. SETUP MODEL (using your personal repo) ---
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
 
16
- # REPLACE 'YOUR_USERNAME' with your Hugging Face username
17
  MY_BACKBONE_REPO = "ktvoice/Backbone"
18
  MY_CODEC_REPO = "ktvoice/Codec"
19
 
 
 
20
  try:
21
  tts = VoiceEngine(
22
  backbone_repo=MY_BACKBONE_REPO,
@@ -34,7 +35,7 @@ except Exception as e:
34
  return np.random.uniform(-0.1, 0.1, 24000*2)
35
  tts = MockTTS()
36
 
37
- # --- 2. DATA (keep the local sample-voice list) ---
38
  VOICE_SAMPLES = {
39
  "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
40
  "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
@@ -48,7 +49,6 @@ VOICE_SAMPLES = {
48
  "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
49
  }
50
 
51
- # --- 3. HELPER FUNCTIONS ---
52
  def load_reference_info(voice_choice):
53
  if voice_choice in VOICE_SAMPLES:
54
  audio_path = VOICE_SAMPLES[voice_choice]["audio"]
@@ -65,7 +65,7 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
65
  if not text or text.strip() == "":
66
  return None, "⚠️ Vui lòng nhập nội dung!"
67
 
68
- # Preprocess the text to lengthen pauses
69
  processed_text = text
70
  if pause_level == "Trung bình":
71
  processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
@@ -75,7 +75,7 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
75
  if len(processed_text) > 400:
76
  processed_text = processed_text[:400]
77
 
78
- # Get the reference data
79
  if mode_tab == "custom_mode":
80
  if custom_audio is None or not custom_text:
81
  return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
@@ -86,12 +86,12 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
86
  with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
87
  ref_text_raw = f.read()
88
 
89
- # Inference
90
  start_time = time.time()
91
  ref_codes = tts.encode_reference(ref_audio_path)
92
  wav = tts.infer(processed_text, ref_codes, ref_text_raw)
93
 
94
- # Adjust playback speed (time-stretching)
95
  if speed_value != 1.0:
96
  wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
97
 
@@ -101,11 +101,11 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
101
  sf.write(tmp_file.name, wav, 24000)
102
  output_path = tmp_file.name
103
 
104
- return output_path, f"⚡ Thành công: {process_time:.2f}s | Tốc độ: {speed_value}x"
105
  except Exception as e:
106
  return None, f"❌ Lỗi: {str(e)}"
107
 
108
- # --- 4. THEME & CSS (Deep Night Pro) ---
109
  theme = gr.themes.Default(
110
  primary_hue="indigo",
111
  secondary_hue="blue",
@@ -121,11 +121,11 @@ theme = gr.themes.Default(
121
  )
122
 
123
  css = """
124
- .main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }
125
  .st-card {
126
  border-radius: 16px !important;
127
  border: 1px solid rgba(255,255,255,0.1) !important;
128
- box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;
129
  padding: 15px;
130
  }
131
  .result-card {
@@ -134,60 +134,66 @@ css = """
134
  margin-top: 15px;
135
  }
136
  audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
137
- .footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }
138
  """
139
 
140
- # --- 5. UI CONSTRUCTION ---
141
  with gr.Blocks(title="AI Voice Studio") as demo:
142
-
143
  with gr.Column(elem_classes="main-wrap"):
 
 
144
  with gr.Row(equal_height=True):
145
- # LEFT: text input
146
  with gr.Column(scale=1):
147
  with gr.Group(elem_classes="st-card"):
148
  text_input = gr.Textbox(
149
- label="VĂN BẢN CẦN CHUYỂN ĐỔI",
150
- placeholder="Chào mừng bạn. Hãy nhập nội dung vào đây...",
151
- lines=20,
152
  show_label=True,
153
  )
154
- char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")
155
 
156
- # RIGHT: configuration
157
  with gr.Column(scale=1):
158
  with gr.Tabs() as tabs:
159
- with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode"):
160
  voice_select = gr.Dropdown(
161
  choices=list(VOICE_SAMPLES.keys()),
162
  value="Tuyên (nam miền Bắc)",
163
- label="Chọn giọng đọc",
164
  )
165
  with gr.Accordion("Nghe thử giọng mẫu", open=False):
166
  ref_audio_preview = gr.Audio(interactive=False, show_label=False)
167
  ref_text_preview = gr.Markdown("...")
168
 
169
- with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode"):
170
- custom_audio = gr.Audio(label="Audio gốc", type="filepath")
171
- custom_text = gr.Textbox(label="NỘI DUNG AUDIO MẪU", lines=4)
 
 
 
 
 
 
 
172
 
173
- # Professional audio settings
174
  with gr.Row():
175
  pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
176
  speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
177
 
178
  current_mode = gr.State(value="preset_mode")
179
  gr.Markdown("<br>")
180
- btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")
181
 
182
  with gr.Group(elem_classes="st-card result-card"):
183
- audio_output = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True)
184
- status_output = gr.Markdown("<p style='text-align: center; color: #818cf8;'>✨ Sẵn sàng</p>")
185
 
186
- gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")
187
 
188
  # LOGIC
189
- text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
190
- voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
191
  tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
192
  tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
193
  btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
 
7
  import tempfile
8
  import torch
9
  import librosa
 
10
  import time
11
 
12
+ # IMPORT FROM YOUR ENGINE FILE
13
+ from tts_engine import VoiceEngine
14
 
15
+ # CONFIGURE KTVOICE'S PERSONAL REPOS
16
  MY_BACKBONE_REPO = "ktvoice/Backbone"
17
  MY_CODEC_REPO = "ktvoice/Codec"
18
 
19
+ device = "cuda" if torch.cuda.is_available() else "cpu"
20
+
21
  try:
22
  tts = VoiceEngine(
23
  backbone_repo=MY_BACKBONE_REPO,
 
35
  return np.random.uniform(-0.1, 0.1, 24000*2)
36
  tts = MockTTS()
37
 
38
+ # --- SAMPLE VOICE DATA ---
39
  VOICE_SAMPLES = {
40
  "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
41
  "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
 
49
  "Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
50
  }
51
 
 
52
  def load_reference_info(voice_choice):
53
  if voice_choice in VOICE_SAMPLES:
54
  audio_path = VOICE_SAMPLES[voice_choice]["audio"]
 
65
  if not text or text.strip() == "":
66
  return None, "⚠️ Vui lòng nhập nội dung!"
67
 
68
+ # Handle the pause level
69
  processed_text = text
70
  if pause_level == "Trung bình":
71
  processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
 
75
  if len(processed_text) > 400:
76
  processed_text = processed_text[:400]
77
 
78
+ # Get the reference data
79
  if mode_tab == "custom_mode":
80
  if custom_audio is None or not custom_text:
81
  return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
 
86
  with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
87
  ref_text_raw = f.read()
88
 
89
+ # Run inference
90
  start_time = time.time()
91
  ref_codes = tts.encode_reference(ref_audio_path)
92
  wav = tts.infer(processed_text, ref_codes, ref_text_raw)
93
 
94
+ # Adjust playback speed
95
  if speed_value != 1.0:
96
  wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
97
 
 
101
  sf.write(tmp_file.name, wav, 24000)
102
  output_path = tmp_file.name
103
 
104
+ return output_path, f"⚡ Hoàn tất: {process_time:.2f}s | Tốc độ: {speed_value}x"
105
  except Exception as e:
106
  return None, f"❌ Lỗi: {str(e)}"
107
 
108
+ # --- UI SETUP (Premium Dark Mode) ---
109
  theme = gr.themes.Default(
110
  primary_hue="indigo",
111
  secondary_hue="blue",
 
121
  )
122
 
123
  css = """
124
+ .main-wrap { max-width: 1240px !important; margin: auto !important; padding: 30px 20px !important; }
125
  .st-card {
126
  border-radius: 16px !important;
127
  border: 1px solid rgba(255,255,255,0.1) !important;
128
+ box-shadow: 0 4px 25px rgba(0,0,0,0.6) !important;
129
  padding: 15px;
130
  }
131
  .result-card {
 
134
  margin-top: 15px;
135
  }
136
  audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
137
+ .footer { text-align: center; margin-top: 50px; color: #475569; font-size: 0.85rem; letter-spacing: 1px; }
138
  """
139
 
 
140
  with gr.Blocks(title="AI Voice Studio") as demo:
 
141
  with gr.Column(elem_classes="main-wrap"):
142
+ # The "VieNeu Studio" header was removed as requested
143
+
144
  with gr.Row(equal_height=True):
145
+ # LEFT COLUMN: TEXT INPUT
146
  with gr.Column(scale=1):
147
  with gr.Group(elem_classes="st-card"):
148
  text_input = gr.Textbox(
149
+ label="VĂN BẢN ĐẦU VÀO",
150
+ placeholder="Nhập nội dung cần chuyển đổi giọng nói...",
151
+ lines=24, # more lines to balance against the right-hand column
152
  show_label=True,
153
  )
154
+ char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-weight: bold; padding: 5px;'>0 / 250</div>")
155
 
156
+ # RIGHT COLUMN: CONFIGURATION
157
  with gr.Column(scale=1):
158
  with gr.Tabs() as tabs:
159
+ with gr.TabItem("👤 Giọng Mẫu", id="preset_mode"):
160
  voice_select = gr.Dropdown(
161
  choices=list(VOICE_SAMPLES.keys()),
162
  value="Tuyên (nam miền Bắc)",
163
+ label="Lựa chọn nghệ sĩ",
164
  )
165
  with gr.Accordion("Nghe thử giọng mẫu", open=False):
166
  ref_audio_preview = gr.Audio(interactive=False, show_label=False)
167
  ref_text_preview = gr.Markdown("...")
168
 
169
+ with gr.TabItem("🎙️ Tự Clone", id="custom_mode"):
170
+ gr.Markdown("<p style='color: #94a3b8; font-size: 0.85rem; margin-bottom: 5px;'>Tải lên audio nguồn để hệ thống mô phỏng giọng nói.</p>")
171
+ custom_audio = gr.Audio(label="Audio mẫu (.wav/mp3)", type="filepath")
172
+ # The sample-transcript box is made taller (lines=6)
173
+ custom_text = gr.Textbox(
174
+ label="NỘI DUNG AUDIO MẪU",
175
+ placeholder="Nhập chính xác lời thoại của audio mẫu để AI học nhịp điệu...",
176
+ lines=6,
177
+ show_label=True
178
+ )
179
 
 
180
  with gr.Row():
181
  pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
182
  speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
183
 
184
  current_mode = gr.State(value="preset_mode")
185
  gr.Markdown("<br>")
186
+ btn_generate = gr.Button("TỔNG HỢP GIỌNG NÓI", variant="primary", size="lg")
187
 
188
  with gr.Group(elem_classes="st-card result-card"):
189
+ audio_output = gr.Audio(label="KẾT QUẢ ÂM THANH", interactive=False, autoplay=True)
190
+ status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Hệ thống sẵn sàng thực hiện</p>")
191
 
192
+ gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL STUDIO EDITION 2025</div>")
193
 
194
  # LOGIC
195
+ text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
196
+ voice_select.change(lambda v: load_reference_info(v), voice_select, [ref_audio_preview, ref_text_preview])
197
  tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
198
  tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
199
  btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
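For orientation, here is a small standalone sketch (not part of the commit) of the text preprocessing and audio post-processing that synthesize_speech performs in the hunks above: punctuation doubling for the "Trung bình" pause level, the 400-character cap, optional time-stretching with librosa, and writing a 24 kHz WAV to a temporary file. The function names add_pauses and postprocess are illustrative, only the pause branch visible in the diff is shown, and wav is assumed to be the mono float waveform returned by tts.infer().

import tempfile
import numpy as np
import librosa
import soundfile as sf

def add_pauses(text: str, pause_level: str) -> str:
    # The app duplicates punctuation so the model produces longer pauses
    if pause_level == "Trung bình":
        text = text.replace(",", ", , ").replace(".", ". . ")
    return text[:400]  # input is capped at 400 characters after preprocessing

def postprocess(wav: np.ndarray, speed_value: float = 1.0) -> str:
    # Time-stretch only when a non-default speed is requested
    if speed_value != 1.0:
        wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
    # Write a temporary 24 kHz WAV; the Gradio Audio output consumes a file path
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        sf.write(tmp_file.name, wav, 24000)
        return tmp_file.name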
packages.txt CHANGED
@@ -1,3 +1,2 @@
1
  espeak-ng
2
- libespeak-ng1
3
  ffmpeg
 
1
  espeak-ng
 
2
  ffmpeg
tts_engine.py CHANGED
@@ -36,9 +36,9 @@ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
36
  class VoiceEngine:
37
  def __init__(
38
  self,
39
- backbone_repo="pnnbao-ump/VieNeu-TTS",
40
  backbone_device="cpu",
41
- codec_repo="neuphonic/neucodec",
42
  codec_device="cpu",
43
  ):
44
 
@@ -52,14 +52,13 @@ class VoiceEngine:
52
  self.streaming_lookback = 50
53
  self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
54
 
55
- # ggml & onnx flags
56
  self._is_quantized_model = False
57
  self._is_onnx_codec = False
58
 
59
- # HF tokenizer
60
  self.tokenizer = None
61
 
62
- # Load models
63
  self._load_backbone(backbone_repo, backbone_device)
64
  self._load_codec(codec_repo, codec_device)
65
 
@@ -70,11 +69,7 @@ class VoiceEngine:
70
  try:
71
  from llama_cpp import Llama
72
  except ImportError as e:
73
- raise ImportError(
74
- "Failed to import `llama_cpp`. "
75
- "Please install it with:\n"
76
- " pip install llama-cpp-python"
77
- ) from e
78
  self.backbone = Llama.from_pretrained(
79
  repo_id=backbone_repo,
80
  filename="*.gguf",
@@ -85,7 +80,6 @@ class VoiceEngine:
85
  flash_attn=True if backbone_device == "gpu" else False,
86
  )
87
  self._is_quantized_model = True
88
-
89
  else:
90
  self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
91
  self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
@@ -94,98 +88,55 @@ class VoiceEngine:
94
 
95
  def _load_codec(self, codec_repo, codec_device):
96
  print(f"Loading codec from: {codec_repo} on {codec_device} ...")
97
- match codec_repo:
98
- case "neuphonic/neucodec":
99
- self.codec = NeuCodec.from_pretrained(codec_repo)
100
- self.codec.eval().to(codec_device)
101
- case "neuphonic/distill-neucodec":
102
- self.codec = DistillNeuCodec.from_pretrained(codec_repo)
103
- self.codec.eval().to(codec_device)
104
- case "neuphonic/neucodec-onnx-decoder":
105
- if codec_device != "cpu":
106
- raise ValueError("Onnx decoder only currently runs on CPU.")
107
- try:
108
- from neucodec import NeuCodecOnnxDecoder
109
- except ImportError as e:
110
- raise ImportError(
111
- "Failed to import the onnx decoder."
112
- " Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
113
- ) from e
114
- self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
115
- self._is_onnx_codec = True
116
- case _:
117
- raise ValueError(f"Unsupported codec repository: {codec_repo}")
118
 
119
  def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
120
- """
121
- Perform inference to generate speech from text using the TTS model and reference audio.
122
-
123
- Args:
124
- text (str): Input text to be converted to speech.
125
- ref_codes (np.ndarray | torch.tensor): Encoded reference.
126
- ref_text (str): Reference text for reference audio. Defaults to None.
127
- Returns:
128
- np.ndarray: Generated speech waveform.
129
- """
130
-
131
- # Generate tokens
132
  if self._is_quantized_model:
133
  output_str = self._infer_ggml(ref_codes, ref_text, text)
134
  else:
135
  prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
136
  output_str = self._infer_torch(prompt_ids)
137
 
138
- # Decode
139
  wav = self._decode(output_str)
140
-
141
  return wav
142
 
143
- def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
144
- """
145
- Perform streaming inference to generate speech from text using the TTS model and reference audio.
146
-
147
- Args:
148
- text (str): Input text to be converted to speech.
149
- ref_codes (np.ndarray | torch.tensor): Encoded reference.
150
- ref_text (str): Reference text for reference audio. Defaults to None.
151
- Yields:
152
- np.ndarray: Generated speech waveform.
153
- """
154
-
155
- if self._is_quantized_model:
156
- return self._infer_stream_ggml(ref_codes, ref_text, text)
157
- else:
158
- raise NotImplementedError("Streaming is not implemented for the torch backend!")
159
-
160
  def encode_reference(self, ref_audio_path: str | Path):
161
  wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
162
- wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0) # [1, 1, T]
163
  with torch.no_grad():
164
  ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
165
  return ref_codes
166
 
167
  def _decode(self, codes: str):
168
- """Decode speech tokens to audio waveform."""
169
- # Extract speech token IDs using regex
170
  speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
171
-
172
  if len(speech_ids) == 0:
173
- raise ValueError(
174
- "No valid speech tokens found in the output. "
175
- "The model may not have generated proper speech tokens."
176
- )
177
 
178
- # Onnx decode
179
  if self._is_onnx_codec:
180
- codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
181
- recon = self.codec.decode_code(codes)
182
- # Torch decode
183
  else:
184
  with torch.no_grad():
185
- codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
186
- self.codec.device
187
- )
188
- recon = self.codec.decode_code(codes).cpu().numpy()
189
 
190
  return recon[0, 0, :]
191
 
@@ -199,17 +150,11 @@ class VoiceEngine:
199
  text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
200
 
201
  input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
202
- chat = """user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"""
203
  ids = self.tokenizer.encode(chat)
204
 
205
  text_replace_idx = ids.index(text_replace)
206
- ids = (
207
- ids[:text_replace_idx]
208
- + [text_prompt_start]
209
- + input_ids
210
- + [text_prompt_end]
211
- + ids[text_replace_idx + 1 :] # noqa
212
- )
213
 
214
  speech_replace_idx = ids.index(speech_replace)
215
  codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
@@ -236,111 +181,4 @@ class VoiceEngine:
236
  output_str = self.tokenizer.decode(
237
  output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
238
  )
239
- return output_str
240
-
241
- def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
242
- ref_text = phonemize_with_dict(ref_text)
243
- input_text = phonemize_with_dict(input_text)
244
-
245
- codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
246
- prompt = (
247
- f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
248
- f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
249
- )
250
- output = self.backbone(
251
- prompt,
252
- max_tokens=self.max_context,
253
- temperature=1.0,
254
- top_k=50,
255
- stop=["<|SPEECH_GENERATION_END|>"],
256
- )
257
- output_str = output["choices"][0]["text"]
258
- return output_str
259
-
260
- def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
261
- ref_text = phonemize_with_dict(ref_text)
262
- input_text = phonemize_with_dict(input_text)
263
-
264
- codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
265
- prompt = (
266
- f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
267
- f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
268
- )
269
-
270
- audio_cache: list[np.ndarray] = []
271
- token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
272
- n_decoded_samples: int = 0
273
- n_decoded_tokens: int = len(ref_codes)
274
-
275
- for item in self.backbone(
276
- prompt,
277
- max_tokens=self.max_context,
278
- temperature=0.2,
279
- top_k=50,
280
- stop=["<|SPEECH_GENERATION_END|>"],
281
- stream=True
282
- ):
283
- output_str = item["choices"][0]["text"]
284
- token_cache.append(output_str)
285
-
286
- if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:
287
-
288
- # decode chunk
289
- tokens_start = max(
290
- n_decoded_tokens
291
- - self.streaming_lookback
292
- - self.streaming_overlap_frames,
293
- 0
294
- )
295
- tokens_end = (
296
- n_decoded_tokens
297
- + self.streaming_frames_per_chunk
298
- + self.streaming_lookforward
299
- + self.streaming_overlap_frames
300
- )
301
- sample_start = (
302
- n_decoded_tokens - tokens_start
303
- ) * self.hop_length
304
- sample_end = (
305
- sample_start
306
- + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
307
- )
308
- curr_codes = token_cache[tokens_start:tokens_end]
309
- recon = self._decode("".join(curr_codes))
310
- recon = recon[sample_start:sample_end]
311
- audio_cache.append(recon)
312
-
313
- # postprocess
314
- processed_recon = _linear_overlap_add(
315
- audio_cache, stride=self.streaming_stride_samples
316
- )
317
- new_samples_end = len(audio_cache) * self.streaming_stride_samples
318
- processed_recon = processed_recon[
319
- n_decoded_samples:new_samples_end
320
- ]
321
- n_decoded_samples = new_samples_end
322
- n_decoded_tokens += self.streaming_frames_per_chunk
323
- yield processed_recon
324
-
325
- # final decoding handled separately as non-constant chunk size
326
- remaining_tokens = len(token_cache) - n_decoded_tokens
327
- if len(token_cache) > n_decoded_tokens:
328
- tokens_start = max(
329
- len(token_cache)
330
- - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
331
- 0
332
- )
333
- sample_start = (
334
- len(token_cache)
335
- - tokens_start
336
- - remaining_tokens
337
- - self.streaming_overlap_frames
338
- ) * self.hop_length
339
- curr_codes = token_cache[tokens_start:]
340
- recon = self._decode("".join(curr_codes))
341
- recon = recon[sample_start:]
342
- audio_cache.append(recon)
343
-
344
- processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
345
- processed_recon = processed_recon[n_decoded_samples:]
346
- yield processed_recon
 
36
  class VoiceEngine:
37
  def __init__(
38
  self,
39
+ backbone_repo="ktvoice/Backbone", # Thiết lập mặc định về repo của bạn
40
  backbone_device="cpu",
41
+ codec_repo="ktvoice/Codec", # Thiết lập mặc định về repo của bạn
42
  codec_device="cpu",
43
  ):
44
 
 
52
  self.streaming_lookback = 50
53
  self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
54
 
55
+ # Flags
56
  self._is_quantized_model = False
57
  self._is_onnx_codec = False
58
 
 
59
  self.tokenizer = None
60
 
61
+ # Load the models
62
  self._load_backbone(backbone_repo, backbone_device)
63
  self._load_codec(codec_repo, codec_device)
64
 
 
69
  try:
70
  from llama_cpp import Llama
71
  except ImportError as e:
72
+ raise ImportError("Vui lòng cài đặt llama-cpp-python để dùng model GGUF.") from e
 
 
 
 
73
  self.backbone = Llama.from_pretrained(
74
  repo_id=backbone_repo,
75
  filename="*.gguf",
 
80
  flash_attn=True if backbone_device == "gpu" else False,
81
  )
82
  self._is_quantized_model = True
 
83
  else:
84
  self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
85
  self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
 
88
 
89
  def _load_codec(self, codec_repo, codec_device):
90
  print(f"Loading codec from: {codec_repo} on {codec_device} ...")
91
+
92
+ # More flexible codec-loading logic so the ktvoice/Codec repo is accepted
93
+ codec_repo_lower = codec_repo.lower()
94
+
95
+ if "distill" in codec_repo_lower:
96
+ self.codec = DistillNeuCodec.from_pretrained(codec_repo)
97
+ elif "onnx" in codec_repo_lower:
98
+ try:
99
+ from neucodec import NeuCodecOnnxDecoder
100
+ except ImportError as e:
101
+ raise ImportError("Vui lòng cài đặt onnxruntime và neucodec >= 0.0.4.") from e
102
+ self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
103
+ self._is_onnx_codec = True
104
+ else:
105
+ # Default to NeuCodec (matches your ktvoice/Codec repository)
106
+ self.codec = NeuCodec.from_pretrained(codec_repo)
107
+
108
+ if not self._is_onnx_codec:
109
+ self.codec.eval().to(codec_device)
 
 
110
 
111
  def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
112
  if self._is_quantized_model:
113
  output_str = self._infer_ggml(ref_codes, ref_text, text)
114
  else:
115
  prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
116
  output_str = self._infer_torch(prompt_ids)
117
 
 
118
  wav = self._decode(output_str)
 
119
  return wav
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def encode_reference(self, ref_audio_path: str | Path):
122
  wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
123
+ wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
124
  with torch.no_grad():
125
  ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
126
  return ref_codes
127
 
128
  def _decode(self, codes: str):
 
 
129
  speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
 
130
  if len(speech_ids) == 0:
131
+ raise ValueError("Hệ thống không tạo được token speech hợp lệ.")
 
 
 
132
 
 
133
  if self._is_onnx_codec:
134
+ codes_np = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
135
+ recon = self.codec.decode_code(codes_np)
 
136
  else:
137
  with torch.no_grad():
138
+ codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
139
+ recon = self.codec.decode_code(codes_tensor).cpu().numpy()
 
 
140
 
141
  return recon[0, 0, :]
142
 
 
150
  text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
151
 
152
  input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
153
+ chat = "user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"
154
  ids = self.tokenizer.encode(chat)
155
 
156
  text_replace_idx = ids.index(text_replace)
157
+ ids = ids[:text_replace_idx] + [text_prompt_start] + input_ids + [text_prompt_end] + ids[text_replace_idx + 1 :]
 
 
 
 
 
 
158
 
159
  speech_replace_idx = ids.index(speech_replace)
160
  codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
 
181
  output_str = self.tokenizer.decode(
182
  output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
183
  )
184
+ return output_str
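To make the refactor concrete, below is a minimal, hypothetical usage sketch of the slimmed-down engine with its new ktvoice defaults (not part of the commit). It assumes ktvoice/Backbone and ktvoice/Codec load through the non-GGUF and non-ONNX paths and reuses one of the bundled sample voices; the paths and the test sentence are placeholders.

import soundfile as sf
from tts_engine import VoiceEngine

# The constructor now defaults to the ktvoice repos, so these arguments are optional
engine = VoiceEngine(
    backbone_repo="ktvoice/Backbone",
    codec_repo="ktvoice/Codec",
    backbone_device="cpu",
    codec_device="cpu",
)

# Encode a bundled reference voice (loaded internally at 16 kHz mono)
ref_codes = engine.encode_reference("./sample/thientam.mp3")
with open("./sample/thientam.txt", encoding="utf-8") as f:
    ref_text = f.read()

# Generate speech and save it; the codec decodes 24 kHz audio
wav = engine.infer("Xin chào, đây là bản thử nghiệm.", ref_codes, ref_text)
sf.write("output.wav", wav, 24000)

Because the codec dispatch now keys on substrings of the repo name ("distill", "onnx"), any other repository, including ktvoice/Codec, falls through to the plain NeuCodec loader.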