Upload 3 files
Browse files
- app.py +41 -35
- packages.txt +0 -1
- tts_engine.py +33 -195
app.py
CHANGED
|
@@ -7,16 +7,17 @@ import soundfile as sf
|
|
| 7 |
import tempfile
|
| 8 |
import torch
|
| 9 |
import librosa
|
| 10 |
-
from tts_engine import VoiceEngine
|
| 11 |
import time
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
|
| 16 |
-
#
|
| 17 |
MY_BACKBONE_REPO = "ktvoice/Backbone"
|
| 18 |
MY_CODEC_REPO = "ktvoice/Codec"
|
| 19 |
|
|
|
|
|
|
|
| 20 |
try:
|
| 21 |
tts = VoiceEngine(
|
| 22 |
backbone_repo=MY_BACKBONE_REPO,
|
|
@@ -34,7 +35,7 @@ except Exception as e:
|
|
| 34 |
return np.random.uniform(-0.1, 0.1, 24000*2)
|
| 35 |
tts = MockTTS()
|
| 36 |
|
| 37 |
-
# ---
|
| 38 |
VOICE_SAMPLES = {
|
| 39 |
"Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
|
| 40 |
"Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
|
|
@@ -48,7 +49,6 @@ VOICE_SAMPLES = {
|
|
| 48 |
"Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
|
| 49 |
}
|
| 50 |
|
| 51 |
-
# --- 3. HELPER FUNCTIONS ---
|
| 52 |
def load_reference_info(voice_choice):
|
| 53 |
if voice_choice in VOICE_SAMPLES:
|
| 54 |
audio_path = VOICE_SAMPLES[voice_choice]["audio"]
|
|
@@ -65,7 +65,7 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
|
|
| 65 |
if not text or text.strip() == "":
|
| 66 |
return None, "⚠️ Vui lòng nhập nội dung!"
|
| 67 |
|
| 68 |
-
#
|
| 69 |
processed_text = text
|
| 70 |
if pause_level == "Trung bình":
|
| 71 |
processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
|
|
@@ -75,7 +75,7 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
|
|
| 75 |
if len(processed_text) > 400:
|
| 76 |
processed_text = processed_text[:400]
|
| 77 |
|
| 78 |
-
# Lấy Reference
|
| 79 |
if mode_tab == "custom_mode":
|
| 80 |
if custom_audio is None or not custom_text:
|
| 81 |
return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
|
|
@@ -86,12 +86,12 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
|
|
| 86 |
with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
|
| 87 |
ref_text_raw = f.read()
|
| 88 |
|
| 89 |
-
# Inference
|
| 90 |
start_time = time.time()
|
| 91 |
ref_codes = tts.encode_reference(ref_audio_path)
|
| 92 |
wav = tts.infer(processed_text, ref_codes, ref_text_raw)
|
| 93 |
|
| 94 |
-
# Điều chỉnh
|
| 95 |
if speed_value != 1.0:
|
| 96 |
wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
|
| 97 |
|
|
@@ -101,11 +101,11 @@ def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, p
|
|
| 101 |
sf.write(tmp_file.name, wav, 24000)
|
| 102 |
output_path = tmp_file.name
|
| 103 |
|
| 104 |
-
return output_path, f"⚡
|
| 105 |
except Exception as e:
|
| 106 |
return None, f"❌ Lỗi: {str(e)}"
|
| 107 |
|
| 108 |
-
# ---
|
| 109 |
theme = gr.themes.Default(
|
| 110 |
primary_hue="indigo",
|
| 111 |
secondary_hue="blue",
|
|
@@ -121,11 +121,11 @@ theme = gr.themes.Default(
|
|
| 121 |
)
|
| 122 |
|
| 123 |
css = """
|
| 124 |
-
.main-wrap { max-width:
|
| 125 |
.st-card {
|
| 126 |
border-radius: 16px !important;
|
| 127 |
border: 1px solid rgba(255,255,255,0.1) !important;
|
| 128 |
-
box-shadow: 0 4px
|
| 129 |
padding: 15px;
|
| 130 |
}
|
| 131 |
.result-card {
|
|
@@ -134,60 +134,66 @@ css = """
|
|
| 134 |
margin-top: 15px;
|
| 135 |
}
|
| 136 |
audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
|
| 137 |
-
.footer { text-align: center; margin-top:
|
| 138 |
"""
|
| 139 |
|
| 140 |
-
# --- 5. UI CONSTRUCTION ---
|
| 141 |
with gr.Blocks(title="AI Voice Studio") as demo:
|
| 142 |
-
|
| 143 |
with gr.Column(elem_classes="main-wrap"):
|
|
|
|
|
|
|
| 144 |
with gr.Row(equal_height=True):
|
| 145 |
-
# TRÁI:
|
| 146 |
with gr.Column(scale=1):
|
| 147 |
with gr.Group(elem_classes="st-card"):
|
| 148 |
text_input = gr.Textbox(
|
| 149 |
-
label="VĂN BẢN
|
| 150 |
-
placeholder="
|
| 151 |
-
lines=
|
| 152 |
show_label=True,
|
| 153 |
)
|
| 154 |
-
char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-
|
| 155 |
|
| 156 |
-
# PHẢI:
|
| 157 |
with gr.Column(scale=1):
|
| 158 |
with gr.Tabs() as tabs:
|
| 159 |
-
with gr.TabItem("👤
|
| 160 |
voice_select = gr.Dropdown(
|
| 161 |
choices=list(VOICE_SAMPLES.keys()),
|
| 162 |
value="Tuyên (nam miền Bắc)",
|
| 163 |
-
label="
|
| 164 |
)
|
| 165 |
with gr.Accordion("Nghe thử giọng mẫu", open=False):
|
| 166 |
ref_audio_preview = gr.Audio(interactive=False, show_label=False)
|
| 167 |
ref_text_preview = gr.Markdown("...")
|
| 168 |
|
| 169 |
-
with gr.TabItem("🎙️
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
-
# Cấu hình âm thanh chuyên nghiệp
|
| 174 |
with gr.Row():
|
| 175 |
pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
|
| 176 |
speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
|
| 177 |
|
| 178 |
current_mode = gr.State(value="preset_mode")
|
| 179 |
gr.Markdown("<br>")
|
| 180 |
-
btn_generate = gr.Button("
|
| 181 |
|
| 182 |
with gr.Group(elem_classes="st-card result-card"):
|
| 183 |
-
audio_output = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True)
|
| 184 |
-
status_output = gr.Markdown("<p style='text-align: center; color: #818cf8;'>✨
|
| 185 |
|
| 186 |
-
gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL
|
| 187 |
|
| 188 |
# LOGIC
|
| 189 |
-
text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-
|
| 190 |
-
voice_select.change(
|
| 191 |
tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
|
| 192 |
tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
|
| 193 |
btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
|
|
|
|
| 7 |
import tempfile
|
| 8 |
import torch
|
| 9 |
import librosa
|
|
|
|
| 10 |
import time
|
| 11 |
|
| 12 |
+
# IMPORT TỪ FILE ENGINE CỦA BẠN
|
| 13 |
+
from tts_engine import VoiceEngine
|
| 14 |
|
| 15 |
+
# CẤU HÌNH REPO CÁ NHÂN CỦA KTVOICE
|
| 16 |
MY_BACKBONE_REPO = "ktvoice/Backbone"
|
| 17 |
MY_CODEC_REPO = "ktvoice/Codec"
|
| 18 |
|
| 19 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 20 |
+
|
| 21 |
try:
|
| 22 |
tts = VoiceEngine(
|
| 23 |
backbone_repo=MY_BACKBONE_REPO,
|
|
|
|
| 35 |
return np.random.uniform(-0.1, 0.1, 24000*2)
|
| 36 |
tts = MockTTS()
|
| 37 |
|
| 38 |
+
# --- DATA GIỌNG MẪU ---
|
| 39 |
VOICE_SAMPLES = {
|
| 40 |
"Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
|
| 41 |
"Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
|
|
|
|
| 49 |
"Dung (nữ miền Nam)": {"audio": "./sample/Dung (nữ miền Nam).wav", "text": "./sample/Dung (nữ miền Nam).txt"}
|
| 50 |
}
|
| 51 |
|
|
|
|
| 52 |
def load_reference_info(voice_choice):
|
| 53 |
if voice_choice in VOICE_SAMPLES:
|
| 54 |
audio_path = VOICE_SAMPLES[voice_choice]["audio"]
|
|
|
|
| 65 |
if not text or text.strip() == "":
|
| 66 |
return None, "⚠️ Vui lòng nhập nội dung!"
|
| 67 |
|
| 68 |
+
# Xử lý độ ngắt nghỉ (Pause level)
|
| 69 |
processed_text = text
|
| 70 |
if pause_level == "Trung bình":
|
| 71 |
processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
|
|
|
|
| 75 |
if len(processed_text) > 400:
|
| 76 |
processed_text = processed_text[:400]
|
| 77 |
|
| 78 |
+
# Lấy dữ liệu Reference
|
| 79 |
if mode_tab == "custom_mode":
|
| 80 |
if custom_audio is None or not custom_text:
|
| 81 |
return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
|
|
|
|
| 86 |
with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
|
| 87 |
ref_text_raw = f.read()
|
| 88 |
|
| 89 |
+
# Thực hiện Inference
|
| 90 |
start_time = time.time()
|
| 91 |
ref_codes = tts.encode_reference(ref_audio_path)
|
| 92 |
wav = tts.infer(processed_text, ref_codes, ref_text_raw)
|
| 93 |
|
| 94 |
+
# Điều chỉnh Tốc độ
|
| 95 |
if speed_value != 1.0:
|
| 96 |
wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
|
| 97 |
|
|
|
|
| 101 |
sf.write(tmp_file.name, wav, 24000)
|
| 102 |
output_path = tmp_file.name
|
| 103 |
|
| 104 |
+
return output_path, f"⚡ Hoàn tất: {process_time:.2f}s | Tốc độ: {speed_value}x"
|
| 105 |
except Exception as e:
|
| 106 |
return None, f"❌ Lỗi: {str(e)}"
|
| 107 |
|
| 108 |
+
# --- UI SETUP (Premium Dark Mode) ---
|
| 109 |
theme = gr.themes.Default(
|
| 110 |
primary_hue="indigo",
|
| 111 |
secondary_hue="blue",
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
css = """
|
| 124 |
+
.main-wrap { max-width: 1240px !important; margin: auto !important; padding: 30px 20px !important; }
|
| 125 |
.st-card {
|
| 126 |
border-radius: 16px !important;
|
| 127 |
border: 1px solid rgba(255,255,255,0.1) !important;
|
| 128 |
+
box-shadow: 0 4px 25px rgba(0,0,0,0.6) !important;
|
| 129 |
padding: 15px;
|
| 130 |
}
|
| 131 |
.result-card {
|
|
|
|
| 134 |
margin-top: 15px;
|
| 135 |
}
|
| 136 |
audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
|
| 137 |
+
.footer { text-align: center; margin-top: 50px; color: #475569; font-size: 0.85rem; letter-spacing: 1px; }
|
| 138 |
"""
|
| 139 |
|
|
|
|
| 140 |
with gr.Blocks(title="AI Voice Studio") as demo:
|
|
|
|
| 141 |
with gr.Column(elem_classes="main-wrap"):
|
| 142 |
+
# Đã xóa phần Header "VieNeu Studio" theo yêu cầu
|
| 143 |
+
|
| 144 |
with gr.Row(equal_height=True):
|
| 145 |
+
# CỘT TRÁI: NHẬP VĂN BẢN
|
| 146 |
with gr.Column(scale=1):
|
| 147 |
with gr.Group(elem_classes="st-card"):
|
| 148 |
text_input = gr.Textbox(
|
| 149 |
+
label="VĂN BẢN ĐẦU VÀO",
|
| 150 |
+
placeholder="Nhập nội dung cần chuyển đổi giọng nói...",
|
| 151 |
+
lines=24, # Tăng số dòng để cân bằng với cột phải
|
| 152 |
show_label=True,
|
| 153 |
)
|
| 154 |
+
char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-weight: bold; padding: 5px;'>0 / 250</div>")
|
| 155 |
|
| 156 |
+
# CỘT PHẢI: CẤU HÌNH
|
| 157 |
with gr.Column(scale=1):
|
| 158 |
with gr.Tabs() as tabs:
|
| 159 |
+
with gr.TabItem("👤 Giọng Mẫu", id="preset_mode"):
|
| 160 |
voice_select = gr.Dropdown(
|
| 161 |
choices=list(VOICE_SAMPLES.keys()),
|
| 162 |
value="Tuyên (nam miền Bắc)",
|
| 163 |
+
label="Lựa chọn nghệ sĩ",
|
| 164 |
)
|
| 165 |
with gr.Accordion("Nghe thử giọng mẫu", open=False):
|
| 166 |
ref_audio_preview = gr.Audio(interactive=False, show_label=False)
|
| 167 |
ref_text_preview = gr.Markdown("...")
|
| 168 |
|
| 169 |
+
with gr.TabItem("🎙️ Tự Clone", id="custom_mode"):
|
| 170 |
+
gr.Markdown("<p style='color: #94a3b8; font-size: 0.85rem; margin-bottom: 5px;'>Tải lên audio nguồn để hệ thống mô phỏng giọng nói.</p>")
|
| 171 |
+
custom_audio = gr.Audio(label="Audio mẫu (.wav/mp3)", type="filepath")
|
| 172 |
+
# Ô nội dung mẫu được làm rộng hơn (lines=6)
|
| 173 |
+
custom_text = gr.Textbox(
|
| 174 |
+
label="NỘI DUNG AUDIO MẪU",
|
| 175 |
+
placeholder="Nhập chính xác lời thoại của audio mẫu để AI học nhịp điệu...",
|
| 176 |
+
lines=6,
|
| 177 |
+
show_label=True
|
| 178 |
+
)
|
| 179 |
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
|
| 182 |
speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
|
| 183 |
|
| 184 |
current_mode = gr.State(value="preset_mode")
|
| 185 |
gr.Markdown("<br>")
|
| 186 |
+
btn_generate = gr.Button("TỔNG HỢP GIỌNG NÓI", variant="primary", size="lg")
|
| 187 |
|
| 188 |
with gr.Group(elem_classes="st-card result-card"):
|
| 189 |
+
audio_output = gr.Audio(label="KẾT QUẢ ÂM THANH", interactive=False, autoplay=True)
|
| 190 |
+
status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Hệ thống sẵn sàng thực hiện</p>")
|
| 191 |
|
| 192 |
+
gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL STUDIO EDITION 2025</div>")
|
| 193 |
|
| 194 |
# LOGIC
|
| 195 |
+
text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
|
| 196 |
+
voice_select.change(lambda v: load_reference_info(v), voice_select, [ref_audio_preview, ref_text_preview])
|
| 197 |
tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
|
| 198 |
tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
|
| 199 |
btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
|
packages.txt
CHANGED
|
@@ -1,3 +1,2 @@
|
|
| 1 |
espeak-ng
|
| 2 |
-
libespeak-ng1
|
| 3 |
ffmpeg
|
|
|
|
| 1 |
espeak-ng
|
|
|
|
| 2 |
ffmpeg
|
tts_engine.py
CHANGED
|
@@ -36,9 +36,9 @@ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
|
|
| 36 |
class VoiceEngine:
|
| 37 |
def __init__(
|
| 38 |
self,
|
| 39 |
-
backbone_repo="
|
| 40 |
backbone_device="cpu",
|
| 41 |
-
codec_repo="
|
| 42 |
codec_device="cpu",
|
| 43 |
):
|
| 44 |
|
|
@@ -52,14 +52,13 @@ class VoiceEngine:
|
|
| 52 |
self.streaming_lookback = 50
|
| 53 |
self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
|
| 54 |
|
| 55 |
-
#
|
| 56 |
self._is_quantized_model = False
|
| 57 |
self._is_onnx_codec = False
|
| 58 |
|
| 59 |
-
# HF tokenizer
|
| 60 |
self.tokenizer = None
|
| 61 |
|
| 62 |
-
#
|
| 63 |
self._load_backbone(backbone_repo, backbone_device)
|
| 64 |
self._load_codec(codec_repo, codec_device)
|
| 65 |
|
|
@@ -70,11 +69,7 @@ class VoiceEngine:
|
|
| 70 |
try:
|
| 71 |
from llama_cpp import Llama
|
| 72 |
except ImportError as e:
|
| 73 |
-
raise ImportError(
|
| 74 |
-
"Failed to import `llama_cpp`. "
|
| 75 |
-
"Please install it with:\n"
|
| 76 |
-
" pip install llama-cpp-python"
|
| 77 |
-
) from e
|
| 78 |
self.backbone = Llama.from_pretrained(
|
| 79 |
repo_id=backbone_repo,
|
| 80 |
filename="*.gguf",
|
|
@@ -85,7 +80,6 @@ class VoiceEngine:
|
|
| 85 |
flash_attn=True if backbone_device == "gpu" else False,
|
| 86 |
)
|
| 87 |
self._is_quantized_model = True
|
| 88 |
-
|
| 89 |
else:
|
| 90 |
self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
|
| 91 |
self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
|
|
@@ -94,98 +88,55 @@ class VoiceEngine:
|
|
| 94 |
|
| 95 |
def _load_codec(self, codec_repo, codec_device):
|
| 96 |
print(f"Loading codec from: {codec_repo} on {codec_device} ...")
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
case _:
|
| 117 |
-
raise ValueError(f"Unsupported codec repository: {codec_repo}")
|
| 118 |
|
| 119 |
def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
|
| 120 |
-
"""
|
| 121 |
-
Perform inference to generate speech from text using the TTS model and reference audio.
|
| 122 |
-
|
| 123 |
-
Args:
|
| 124 |
-
text (str): Input text to be converted to speech.
|
| 125 |
-
ref_codes (np.ndarray | torch.tensor): Encoded reference.
|
| 126 |
-
ref_text (str): Reference text for reference audio. Defaults to None.
|
| 127 |
-
Returns:
|
| 128 |
-
np.ndarray: Generated speech waveform.
|
| 129 |
-
"""
|
| 130 |
-
|
| 131 |
-
# Generate tokens
|
| 132 |
if self._is_quantized_model:
|
| 133 |
output_str = self._infer_ggml(ref_codes, ref_text, text)
|
| 134 |
else:
|
| 135 |
prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
|
| 136 |
output_str = self._infer_torch(prompt_ids)
|
| 137 |
|
| 138 |
-
# Decode
|
| 139 |
wav = self._decode(output_str)
|
| 140 |
-
|
| 141 |
return wav
|
| 142 |
|
| 143 |
-
def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
|
| 144 |
-
"""
|
| 145 |
-
Perform streaming inference to generate speech from text using the TTS model and reference audio.
|
| 146 |
-
|
| 147 |
-
Args:
|
| 148 |
-
text (str): Input text to be converted to speech.
|
| 149 |
-
ref_codes (np.ndarray | torch.tensor): Encoded reference.
|
| 150 |
-
ref_text (str): Reference text for reference audio. Defaults to None.
|
| 151 |
-
Yields:
|
| 152 |
-
np.ndarray: Generated speech waveform.
|
| 153 |
-
"""
|
| 154 |
-
|
| 155 |
-
if self._is_quantized_model:
|
| 156 |
-
return self._infer_stream_ggml(ref_codes, ref_text, text)
|
| 157 |
-
else:
|
| 158 |
-
raise NotImplementedError("Streaming is not implemented for the torch backend!")
|
| 159 |
-
|
| 160 |
def encode_reference(self, ref_audio_path: str | Path):
|
| 161 |
wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
|
| 162 |
-
wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
|
| 163 |
with torch.no_grad():
|
| 164 |
ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
|
| 165 |
return ref_codes
|
| 166 |
|
| 167 |
def _decode(self, codes: str):
|
| 168 |
-
"""Decode speech tokens to audio waveform."""
|
| 169 |
-
# Extract speech token IDs using regex
|
| 170 |
speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
|
| 171 |
-
|
| 172 |
if len(speech_ids) == 0:
|
| 173 |
-
raise ValueError(
|
| 174 |
-
"No valid speech tokens found in the output. "
|
| 175 |
-
"The model may not have generated proper speech tokens."
|
| 176 |
-
)
|
| 177 |
|
| 178 |
-
# Onnx decode
|
| 179 |
if self._is_onnx_codec:
|
| 180 |
-
|
| 181 |
-
recon = self.codec.decode_code(
|
| 182 |
-
# Torch decode
|
| 183 |
else:
|
| 184 |
with torch.no_grad():
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
)
|
| 188 |
-
recon = self.codec.decode_code(codes).cpu().numpy()
|
| 189 |
|
| 190 |
return recon[0, 0, :]
|
| 191 |
|
|
@@ -199,17 +150,11 @@ class VoiceEngine:
|
|
| 199 |
text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
|
| 200 |
|
| 201 |
input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
|
| 202 |
-
chat = "
|
| 203 |
ids = self.tokenizer.encode(chat)
|
| 204 |
|
| 205 |
text_replace_idx = ids.index(text_replace)
|
| 206 |
-
ids =
|
| 207 |
-
ids[:text_replace_idx]
|
| 208 |
-
+ [text_prompt_start]
|
| 209 |
-
+ input_ids
|
| 210 |
-
+ [text_prompt_end]
|
| 211 |
-
+ ids[text_replace_idx + 1 :] # noqa
|
| 212 |
-
)
|
| 213 |
|
| 214 |
speech_replace_idx = ids.index(speech_replace)
|
| 215 |
codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
|
|
@@ -236,111 +181,4 @@ class VoiceEngine:
|
|
| 236 |
output_str = self.tokenizer.decode(
|
| 237 |
output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
|
| 238 |
)
|
| 239 |
-
return output_str
|
| 240 |
-
|
| 241 |
-
def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
|
| 242 |
-
ref_text = phonemize_with_dict(ref_text)
|
| 243 |
-
input_text = phonemize_with_dict(input_text)
|
| 244 |
-
|
| 245 |
-
codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
|
| 246 |
-
prompt = (
|
| 247 |
-
f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
|
| 248 |
-
f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
|
| 249 |
-
)
|
| 250 |
-
output = self.backbone(
|
| 251 |
-
prompt,
|
| 252 |
-
max_tokens=self.max_context,
|
| 253 |
-
temperature=1.0,
|
| 254 |
-
top_k=50,
|
| 255 |
-
stop=["<|SPEECH_GENERATION_END|>"],
|
| 256 |
-
)
|
| 257 |
-
output_str = output["choices"][0]["text"]
|
| 258 |
-
return output_str
|
| 259 |
-
|
| 260 |
-
def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
|
| 261 |
-
ref_text = phonemize_with_dict(ref_text)
|
| 262 |
-
input_text = phonemize_with_dict(input_text)
|
| 263 |
-
|
| 264 |
-
codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
|
| 265 |
-
prompt = (
|
| 266 |
-
f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
|
| 267 |
-
f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
|
| 268 |
-
)
|
| 269 |
-
|
| 270 |
-
audio_cache: list[np.ndarray] = []
|
| 271 |
-
token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
|
| 272 |
-
n_decoded_samples: int = 0
|
| 273 |
-
n_decoded_tokens: int = len(ref_codes)
|
| 274 |
-
|
| 275 |
-
for item in self.backbone(
|
| 276 |
-
prompt,
|
| 277 |
-
max_tokens=self.max_context,
|
| 278 |
-
temperature=0.2,
|
| 279 |
-
top_k=50,
|
| 280 |
-
stop=["<|SPEECH_GENERATION_END|>"],
|
| 281 |
-
stream=True
|
| 282 |
-
):
|
| 283 |
-
output_str = item["choices"][0]["text"]
|
| 284 |
-
token_cache.append(output_str)
|
| 285 |
-
|
| 286 |
-
if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:
|
| 287 |
-
|
| 288 |
-
# decode chunk
|
| 289 |
-
tokens_start = max(
|
| 290 |
-
n_decoded_tokens
|
| 291 |
-
- self.streaming_lookback
|
| 292 |
-
- self.streaming_overlap_frames,
|
| 293 |
-
0
|
| 294 |
-
)
|
| 295 |
-
tokens_end = (
|
| 296 |
-
n_decoded_tokens
|
| 297 |
-
+ self.streaming_frames_per_chunk
|
| 298 |
-
+ self.streaming_lookforward
|
| 299 |
-
+ self.streaming_overlap_frames
|
| 300 |
-
)
|
| 301 |
-
sample_start = (
|
| 302 |
-
n_decoded_tokens - tokens_start
|
| 303 |
-
) * self.hop_length
|
| 304 |
-
sample_end = (
|
| 305 |
-
sample_start
|
| 306 |
-
+ (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
|
| 307 |
-
)
|
| 308 |
-
curr_codes = token_cache[tokens_start:tokens_end]
|
| 309 |
-
recon = self._decode("".join(curr_codes))
|
| 310 |
-
recon = recon[sample_start:sample_end]
|
| 311 |
-
audio_cache.append(recon)
|
| 312 |
-
|
| 313 |
-
# postprocess
|
| 314 |
-
processed_recon = _linear_overlap_add(
|
| 315 |
-
audio_cache, stride=self.streaming_stride_samples
|
| 316 |
-
)
|
| 317 |
-
new_samples_end = len(audio_cache) * self.streaming_stride_samples
|
| 318 |
-
processed_recon = processed_recon[
|
| 319 |
-
n_decoded_samples:new_samples_end
|
| 320 |
-
]
|
| 321 |
-
n_decoded_samples = new_samples_end
|
| 322 |
-
n_decoded_tokens += self.streaming_frames_per_chunk
|
| 323 |
-
yield processed_recon
|
| 324 |
-
|
| 325 |
-
# final decoding handled separately as non-constant chunk size
|
| 326 |
-
remaining_tokens = len(token_cache) - n_decoded_tokens
|
| 327 |
-
if len(token_cache) > n_decoded_tokens:
|
| 328 |
-
tokens_start = max(
|
| 329 |
-
len(token_cache)
|
| 330 |
-
- (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
|
| 331 |
-
0
|
| 332 |
-
)
|
| 333 |
-
sample_start = (
|
| 334 |
-
len(token_cache)
|
| 335 |
-
- tokens_start
|
| 336 |
-
- remaining_tokens
|
| 337 |
-
- self.streaming_overlap_frames
|
| 338 |
-
) * self.hop_length
|
| 339 |
-
curr_codes = token_cache[tokens_start:]
|
| 340 |
-
recon = self._decode("".join(curr_codes))
|
| 341 |
-
recon = recon[sample_start:]
|
| 342 |
-
audio_cache.append(recon)
|
| 343 |
-
|
| 344 |
-
processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
|
| 345 |
-
processed_recon = processed_recon[n_decoded_samples:]
|
| 346 |
-
yield processed_recon
|
|
|
|
| 36 |
class VoiceEngine:
|
| 37 |
def __init__(
|
| 38 |
self,
|
| 39 |
+
backbone_repo="ktvoice/Backbone", # Thiết lập mặc định về repo của bạn
|
| 40 |
backbone_device="cpu",
|
| 41 |
+
codec_repo="ktvoice/Codec", # Thiết lập mặc định về repo của bạn
|
| 42 |
codec_device="cpu",
|
| 43 |
):
|
| 44 |
|
|
|
|
| 52 |
self.streaming_lookback = 50
|
| 53 |
self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
|
| 54 |
|
| 55 |
+
# Flags
|
| 56 |
self._is_quantized_model = False
|
| 57 |
self._is_onnx_codec = False
|
| 58 |
|
|
|
|
| 59 |
self.tokenizer = None
|
| 60 |
|
| 61 |
+
# Khởi tạo mô hình
|
| 62 |
self._load_backbone(backbone_repo, backbone_device)
|
| 63 |
self._load_codec(codec_repo, codec_device)
|
| 64 |
|
|
|
|
| 69 |
try:
|
| 70 |
from llama_cpp import Llama
|
| 71 |
except ImportError as e:
|
| 72 |
+
raise ImportError("Vui lòng cài đặt llama-cpp-python để dùng model GGUF.") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
self.backbone = Llama.from_pretrained(
|
| 74 |
repo_id=backbone_repo,
|
| 75 |
filename="*.gguf",
|
|
|
|
| 80 |
flash_attn=True if backbone_device == "gpu" else False,
|
| 81 |
)
|
| 82 |
self._is_quantized_model = True
|
|
|
|
| 83 |
else:
|
| 84 |
self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
|
| 85 |
self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
|
|
|
|
| 88 |
|
| 89 |
def _load_codec(self, codec_repo, codec_device):
|
| 90 |
print(f"Loading codec from: {codec_repo} on {codec_device} ...")
|
| 91 |
+
|
| 92 |
+
# Cập nhật logic load codec linh hoạt hơn để chấp nhận repo ktvoice/Codec
|
| 93 |
+
codec_repo_lower = codec_repo.lower()
|
| 94 |
+
|
| 95 |
+
if "distill" in codec_repo_lower:
|
| 96 |
+
self.codec = DistillNeuCodec.from_pretrained(codec_repo)
|
| 97 |
+
elif "onnx" in codec_repo_lower:
|
| 98 |
+
try:
|
| 99 |
+
from neucodec import NeuCodecOnnxDecoder
|
| 100 |
+
except ImportError as e:
|
| 101 |
+
raise ImportError("Vui lòng cài đặt onnxruntime và neucodec >= 0.0.4.") from e
|
| 102 |
+
self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
|
| 103 |
+
self._is_onnx_codec = True
|
| 104 |
+
else:
|
| 105 |
+
# Mặc định load NeuCodec (phù hợp với repository ktvoice/Codec của bạn)
|
| 106 |
+
self.codec = NeuCodec.from_pretrained(codec_repo)
|
| 107 |
+
|
| 108 |
+
if not self._is_onnx_codec:
|
| 109 |
+
self.codec.eval().to(codec_device)
|
|
|
|
|
|
|
| 110 |
|
| 111 |
def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
if self._is_quantized_model:
|
| 113 |
output_str = self._infer_ggml(ref_codes, ref_text, text)
|
| 114 |
else:
|
| 115 |
prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
|
| 116 |
output_str = self._infer_torch(prompt_ids)
|
| 117 |
|
|
|
|
| 118 |
wav = self._decode(output_str)
|
|
|
|
| 119 |
return wav
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
def encode_reference(self, ref_audio_path: str | Path):
|
| 122 |
wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
|
| 123 |
+
wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
|
| 124 |
with torch.no_grad():
|
| 125 |
ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
|
| 126 |
return ref_codes
|
| 127 |
|
| 128 |
def _decode(self, codes: str):
|
|
|
|
|
|
|
| 129 |
speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
|
|
|
|
| 130 |
if len(speech_ids) == 0:
|
| 131 |
+
raise ValueError("Hệ thống không tạo được token speech hợp lệ.")
|
|
|
|
|
|
|
|
|
|
| 132 |
|
|
|
|
| 133 |
if self._is_onnx_codec:
|
| 134 |
+
codes_np = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
|
| 135 |
+
recon = self.codec.decode_code(codes_np)
|
|
|
|
| 136 |
else:
|
| 137 |
with torch.no_grad():
|
| 138 |
+
codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
|
| 139 |
+
recon = self.codec.decode_code(codes_tensor).cpu().numpy()
|
|
|
|
|
|
|
| 140 |
|
| 141 |
return recon[0, 0, :]
|
| 142 |
|
|
|
|
| 150 |
text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")
|
| 151 |
|
| 152 |
input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
|
| 153 |
+
chat = "user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"
|
| 154 |
ids = self.tokenizer.encode(chat)
|
| 155 |
|
| 156 |
text_replace_idx = ids.index(text_replace)
|
| 157 |
+
ids = ids[:text_replace_idx] + [text_prompt_start] + input_ids + [text_prompt_end] + ids[text_replace_idx + 1 :]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
speech_replace_idx = ids.index(speech_replace)
|
| 160 |
codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
|
|
|
|
| 181 |
output_str = self.tokenizer.decode(
|
| 182 |
output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
|
| 183 |
)
|
| 184 |
+
return output_str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|