kt005

Running

App Files Community

ktvoice committed 27 days ago

Commit

6b6469c

verified ·

1 Parent(s): e16bdd9

Upload 6 files

Browse files

Files changed (2) hide show

app.py +51 -115
config.yaml +28 -26

app.py CHANGED Viewed

@@ -12,17 +12,18 @@ import time
 # IMPORT TỪ FILE ENGINE CỦA BẠN
 from tts_engine import VoiceEngine
-# CẤU HÌNH REPO CÁ NHÂN CỦA KTVOICE
-MY_BACKBONE_REPO = "ktvoice/Backbone"
-MY_CODEC_REPO = "ktvoice/Codec"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     tts = VoiceEngine(
-        backbone_repo=MY_BACKBONE_REPO,
         backbone_device=device,
-        codec_repo=MY_CODEC_REPO,
         codec_device=device
     )
 except Exception as e:
@@ -35,7 +36,7 @@ except Exception as e:
             return np.random.uniform(-0.1, 0.1, 24000*2)
     tts = MockTTS()
-# --- DATA GIỌNG MẪU ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
@@ -62,141 +63,76 @@ def load_reference_info(voice_choice):
 @spaces.GPU(duration=120)
 def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
     try:
-        if not text or text.strip() == "":
-            return None, "⚠️ Vui lòng nhập nội dung!"
-        # Xử lý độ ngắt nghỉ (Pause level)
-        processed_text = text
-        if pause_level == "Trung bình":
-            processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
-        elif pause_level == "Dài":
-            processed_text = processed_text.replace(",", ", , , ").replace(".", ". . . . ")
-        if len(processed_text) > 400:
-             processed_text = processed_text[:400]
-        # Lấy dữ liệu Reference
         if mode_tab == "custom_mode":
-            if custom_audio is None or not custom_text:
-                return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
-            ref_audio_path = custom_audio
-            ref_text_raw = custom_text
         else:
-            ref_audio_path = VOICE_SAMPLES[voice_choice]["audio"]
             with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
                 ref_text_raw = f.read()
-        # Thực hiện Inference
         start_time = time.time()
-        ref_codes = tts.encode_reference(ref_audio_path)
-        wav = tts.infer(processed_text, ref_codes, ref_text_raw)
-        # Điều chỉnh Tốc độ
         if speed_value != 1.0:
             wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
-        process_time = time.time() - start_time
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-            sf.write(tmp_file.name, wav, 24000)
-            output_path = tmp_file.name
-        return output_path, f"⚡ Hoàn tất: {process_time:.2f}s | Tốc độ: {speed_value}x"
-    except Exception as e:
-        return None, f"❌ Lỗi: {str(e)}"
-# --- UI SETUP (Premium Dark Mode) ---
-theme = gr.themes.Default(
-    primary_hue="indigo",
-    secondary_hue="blue",
-    neutral_hue="slate",
-    font=[gr.themes.GoogleFont('Inter'), 'sans-serif'],
-).set(
-    body_background_fill="#020617",
-    block_background_fill="#0f172a",
-    block_border_width="1px",
-    input_background_fill="#1e293b",
-    input_border_color="#334155",
-    button_primary_background_fill="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
 )
-css = """
-.main-wrap { max-width: 1240px !important; margin: auto !important; padding: 30px 20px !important; }
-.st-card {
-    border-radius: 16px !important;
-    border: 1px solid rgba(255,255,255,0.1) !important;
-    box-shadow: 0 4px 25px rgba(0,0,0,0.6) !important;
-    padding: 15px;
-}
-.result-card {
-    background: linear-gradient(180deg, rgba(15, 23, 42, 0.8) 0%, rgba(30, 41, 59, 0.8) 100%) !important;
-    border: 1px solid rgba(99, 102, 241, 0.2) !important;
-    margin-top: 15px;
-}
-audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }
-.footer { text-align: center; margin-top: 50px; color: #475569; font-size: 0.85rem; letter-spacing: 1px; }
-"""
 with gr.Blocks(title="AI Voice Studio") as demo:
     with gr.Column(elem_classes="main-wrap"):
-        # Đã xóa phần Header "VieNeu Studio" theo yêu cầu
         with gr.Row(equal_height=True):
-            # CỘT TRÁI: NHẬP VĂN BẢN
             with gr.Column(scale=1):
                 with gr.Group(elem_classes="st-card"):
-                    text_input = gr.Textbox(
-                        label="VĂN BẢN ĐẦU VÀO",
-                        placeholder="Nhập nội dung cần chuyển đổi giọng nói...",
-                        lines=24, # Tăng số dòng để cân bằng với cột phải
-                        show_label=True,
-                    )
-                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-weight: bold; padding: 5px;'>0 / 250</div>")
-            # CỘT PHẢI: CẤU HÌNH
             with gr.Column(scale=1):
                 with gr.Tabs() as tabs:
-                    with gr.TabItem("👤 Giọng Mẫu", id="preset_mode"):
-                        voice_select = gr.Dropdown(
-                            choices=list(VOICE_SAMPLES.keys()),
-                            value="Tuyên (nam miền Bắc)",
-                            label="Lựa chọn nghệ sĩ",
-                        )
-                        with gr.Accordion("Nghe thử giọng mẫu", open=False):
-                            ref_audio_preview = gr.Audio(interactive=False, show_label=False)
-                            ref_text_preview = gr.Markdown("...")
-                    with gr.TabItem("🎙️ Tự Clone", id="custom_mode"):
-                        gr.Markdown("<p style='color: #94a3b8; font-size: 0.85rem; margin-bottom: 5px;'>Tải lên audio nguồn để hệ thống mô phỏng giọng nói.</p>")
-                        custom_audio = gr.Audio(label="Audio mẫu (.wav/mp3)", type="filepath")
-                        # Ô nội dung mẫu được làm rộng hơn (lines=6)
-                        custom_text = gr.Textbox(
-                            label="NỘI DUNG AUDIO MẪU",
-                            placeholder="Nhập chính xác lời thoại của audio mẫu để AI học nhịp điệu...",
-                            lines=6,
-                            show_label=True
-                        )
                 with gr.Row():
-                    pause_level = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ", scale=1)
-                    speed_select = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ đọc", scale=1)
-                current_mode = gr.State(value="preset_mode")
                 gr.Markdown("<br>")
-                btn_generate = gr.Button("TỔNG HỢP GIỌNG NÓI", variant="primary", size="lg")
-                with gr.Group(elem_classes="st-card result-card"):
-                    audio_output = gr.Audio(label="KẾT QUẢ ÂM THANH", interactive=False, autoplay=True)
-                    status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Hệ thống sẵn sàng thực hiện</p>")
-        gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL STUDIO EDITION 2025</div>")
     # LOGIC
-    text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-weight: bold; padding: 5px;'>{len(t)} / 250</div>", text_input, char_count)
-    voice_select.change(lambda v: load_reference_info(v), voice_select, [ref_audio_preview, ref_text_preview])
-    tabs.children[0].select(fn=lambda: "preset_mode", outputs=current_mode)
-    tabs.children[1].select(fn=lambda: "custom_mode", outputs=current_mode)
-    btn_generate.click(fn=synthesize_speech, inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select], outputs=[audio_output, status_output])
 if __name__ == "__main__":
-    demo.queue().launch(theme=theme, css=css, server_name="0.0.0.0", server_port=7860)

 # IMPORT TỪ FILE ENGINE CỦA BẠN
 from tts_engine import VoiceEngine
+# --- 1. SETUP MODEL (SỬ DỤNG CHÍNH XÁC REPO CỦA BẠN) ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Đã chuyển hướng về tài khoản ktvoice của bạn
+MY_BACKBONE = "ktvoice/Backbone"
+MY_CODEC = "ktvoice/Codec"
 try:
     tts = VoiceEngine(
+        backbone_repo=MY_BACKBONE,
         backbone_device=device,
+        codec_repo=MY_CODEC,
         codec_device=device
     )
 except Exception as e:
             return np.random.uniform(-0.1, 0.1, 24000*2)
     tts = MockTTS()
+# --- 2. DATA (Giọng mẫu cục bộ) ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Thiện Tâm": {"audio": "./sample/thientam.mp3", "text": "./sample/thientam.txt"},
 @spaces.GPU(duration=120)
 def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
     try:
+        if not text or text.strip() == "": return None, "⚠️ Vui lòng nhập nội dung!"
+        # Tiền xử lý độ ngắt nghỉ
+        p_text = text
+        if pause_level == "Trung bình": p_text = p_text.replace(",", ", , ").replace(".", ". . ")
+        elif pause_level == "Dài": p_text = p_text.replace(",", ", , , ").replace(".", ". . . . ")
         if mode_tab == "custom_mode":
+            ref_path, ref_text_raw = custom_audio, custom_text
         else:
+            ref_path = VOICE_SAMPLES[voice_choice]["audio"]
             with open(VOICE_SAMPLES[voice_choice]["text"], "r", encoding="utf-8") as f:
                 ref_text_raw = f.read()
         start_time = time.time()
+        ref_codes = tts.encode_reference(ref_path)
+        wav = tts.infer(p_text[:400], ref_codes, ref_text_raw)
+        # Điều chỉnh tốc độ
         if speed_value != 1.0:
             wav = librosa.effects.time_stretch(wav, rate=float(speed_value))
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+            sf.write(tmp.name, wav, 24000)
+            output_path = tmp.name
+        return output_path, f"⚡ Hoàn tất: {time.time() - start_time:.2f}s"
+    except Exception as e: return None, f"❌ Lỗi: {str(e)}"
+# --- UI SETUP ---
+theme = gr.themes.Default(primary_hue="indigo", neutral_hue="slate").set(
+    body_background_fill="#020617", block_background_fill="#0f172a",
+    input_background_fill="#1e293b", button_primary_background_fill="linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
 )
+css = ".main-wrap { max-width: 1240px !important; margin: auto !important; padding: 20px !important; } .st-card { border-radius: 16px !important; border: 1px solid rgba(255,255,255,0.1) !important; padding: 15px; background: #0f172a !important; } audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; }"
 with gr.Blocks(title="AI Voice Studio") as demo:
     with gr.Column(elem_classes="main-wrap"):
         with gr.Row(equal_height=True):
             with gr.Column(scale=1):
                 with gr.Group(elem_classes="st-card"):
+                    text_input = gr.Textbox(label="VĂN BẢN ĐẦU VÀO", lines=24, show_label=True)
+                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-weight: bold;'>0 / 250</div>")
             with gr.Column(scale=1):
                 with gr.Tabs() as tabs:
+                    with gr.TabItem("👤 Giọng Mẫu", id="p_mode"):
+                        voice_select = gr.Dropdown(choices=list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Nghệ sĩ đọc")
+                        with gr.Accordion("Nghe thử", open=False):
+                            ref_p = gr.Audio(interactive=False, show_label=False)
+                            ref_t = gr.Markdown("...")
+                    with gr.TabItem("🎙️ Tự Clone", id="c_mode"):
+                        c_audio = gr.Audio(label="Audio gốc", type="filepath")
+                        c_text = gr.Textbox(label="NỘI DUNG MẪU", lines=6)
                 with gr.Row():
+                    p_lvl = gr.Radio(choices=["Mặc định", "Trung bình", "Dài"], value="Mặc định", label="Độ ngắt nghỉ")
+                    s_val = gr.Dropdown(choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5], value=1.0, label="Tốc độ")
+                c_mode = gr.State(value="p_mode")
                 gr.Markdown("<br>")
+                btn = gr.Button("TỔNG HỢP NGAY", variant="primary", size="lg")
+                with gr.Group(elem_classes="st-card"):
+                    a_out = gr.Audio(label="KẾT QUẢ", interactive=False, autoplay=True)
+        gr.HTML("<div style='text-align: center; margin-top: 40px; color: #475569;'>AI VOICE STUDIO • 2025</div>")
     # LOGIC
+    text_input.change(lambda t: f"<div style='text-align: right; color: {'#6366f1' if len(t)<=250 else '#f43f5e'}; font-weight: bold;'>{len(t)} / 250</div>", text_input, char_count)
+    voice_select.change(load_reference_info, voice_select, [ref_p, ref_t])
+    tabs.children[0].select(fn=lambda: "p_mode", outputs=c_mode)
+    tabs.children[1].select(fn=lambda: "c_mode", outputs=c_mode)
+    btn.click(synthesize_speech, [text_input, voice_select, c_audio, c_text, c_mode, p_lvl, s_val], [a_out])
 if __name__ == "__main__":
+    demo.queue().launch(theme=theme, css=css)

config.yaml CHANGED Viewed

@@ -1,65 +1,67 @@
 text_settings:
-  max_chars_per_chunk: 256
   max_total_chars_streaming: 3000
 backbone_configs:
-  "VieNeu-TTS (GPU)":
-    repo: pnnbao-ump/VieNeu-TTS
     supports_streaming: false
-    description: Chất lượng cao nhất, yêu cầu GPU
-  "VieNeu-TTS-q8-gguf":
-    repo: pnnbao-ump/VieNeu-TTS-q8-gguf
     supports_streaming: true
-    description: Cân bằng giữa chất lượng và tốc độ
-  "VieNeu-TTS-q4-gguf":
-    repo: pnnbao-ump/VieNeu-TTS-q4-gguf
     supports_streaming: true
-    description: Nhẹ nhất, phù hợp CPU
 codec_configs:
-  "NeuCodec (Standard)":
-    repo: neuphonic/neucodec
-    description: Codec chuẩn, tốc độ trung bình
     use_preencoded: false
-  "NeuCodec ONNX (Fast CPU)":
-    repo: neuphonic/neucodec-onnx-decoder
-    description: Tối ưu cho CPU, cần pre-encoded codes
     use_preencoded: true
 voice_samples:
   "Tuyên (nam miền Bắc)":
     audio: ./sample/Tuyên (nam miền Bắc).wav
     text: ./sample/Tuyên (nam miền Bắc).txt
-    codes: ./sample/Tuyên (nam miền Bắc).pt
   "Vĩnh (nam miền Nam)":
     audio: ./sample/Vĩnh (nam miền Nam).wav
     text: ./sample/Vĩnh (nam miền Nam).txt
-    codes: ./sample/Vĩnh (nam miền Nam).pt
   "Bình (nam miền Bắc)":
     audio: ./sample/Bình (nam miền Bắc).wav
     text: ./sample/Bình (nam miền Bắc).txt
-    codes: ./sample/Bình (nam miền Bắc).pt
   "Nguyên (nam miền Nam)":
     audio: ./sample/Nguyên (nam miền Nam).wav
     text: ./sample/Nguyên (nam miền Nam).txt
-    codes: ./sample/Nguyên (nam miền Nam).pt
   "Sơn (nam miền Nam)":
     audio: ./sample/Sơn (nam miền Nam).wav
     text: ./sample/Sơn (nam miền Nam).txt
-    codes: ./sample/Sơn (nam miền Nam).pt
   "Đoan (nữ miền Nam)":
     audio: ./sample/Đoan (nữ miền Nam).wav
     text: ./sample/Đoan (nữ miền Nam).txt
-    codes: ./sample/Đoan (nữ miền Nam).pt
   "Ngọc (nữ miền Bắc)":
     audio: ./sample/Ngọc (nữ miền Bắc).wav
     text: ./sample/Ngọc (nữ miền Bắc).txt
-    codes: ./sample/Ngọc (nữ miền Bắc).pt
   "Ly (nữ miền Bắc)":
     audio: ./sample/Ly (nữ miền Bắc).wav
     text: ./sample/Ly (nữ miền Bắc).txt
-    codes: ./sample/Ly (nữ miền Bắc).pt
   "Dung (nữ miền Nam)":
     audio: ./sample/Dung (nữ miền Nam).wav
-    text: ./sample/Dung (nữ miền Nam).txt
-    codes: ./sample/Dung (nữ miền Nam).pt

+# Cấu hình AI Voice Studio - Professional System Configuration
 text_settings:
+  max_chars_per_chunk: 250
   max_total_chars_streaming: 3000
+# Cấu hình các dòng mô hình trí tuệ nhân tạo (Backbone) - Trỏ về tài khoản ktvoice
 backbone_configs:
+  "AI Engine - Premium (Standard)":
+    repo: ktvoice/Backbone
     supports_streaming: false
+    description: Chất lượng âm thanh cao nhất, sử dụng model gốc từ tài khoản ktvoice.
+  "AI Engine - Balanced (q8)":
+    repo: ktvoice/Backbone-q8-gguf
     supports_streaming: true
+    description: Phiên bản nén q8, cân bằng giữa tốc độ và chất lượng.
+  "AI Engine - Lite (q4 CPU)":
+    repo: ktvoice/Backbone-q4-gguf
     supports_streaming: true
+    description: Tốc độ xử lý cực nhanh trên CPU, tối ưu cho thiết bị cấu hình thấp.
+# Cấu hình các bộ giải mã âm thanh (Codec)
 codec_configs:
+  "Standard High-Fidelity":
+    repo: ktvoice/Codec
+    description: Bộ giải mã tiêu chuẩn cho độ chi tiết âm thanh cao nhất.
     use_preencoded: false
+  "Turbo Decoder (ONNX)":
+    repo: ktvoice/Codec-ONNX
+    description: Bộ giải mã siêu tốc tối ưu cho CPU.
     use_preencoded: true
+# Danh sách đầy đủ 10 nghệ sĩ và giọng đọc mẫu (Khớp hoàn toàn với app.py)
 voice_samples:
   "Tuyên (nam miền Bắc)":
     audio: ./sample/Tuyên (nam miền Bắc).wav
     text: ./sample/Tuyên (nam miền Bắc).txt
+  "Thiện Tâm":
+    audio: ./sample/thientam.mp3
+    text: ./sample/thientam.txt
   "Vĩnh (nam miền Nam)":
     audio: ./sample/Vĩnh (nam miền Nam).wav
     text: ./sample/Vĩnh (nam miền Nam).txt
   "Bình (nam miền Bắc)":
     audio: ./sample/Bình (nam miền Bắc).wav
     text: ./sample/Bình (nam miền Bắc).txt
   "Nguyên (nam miền Nam)":
     audio: ./sample/Nguyên (nam miền Nam).wav
     text: ./sample/Nguyên (nam miền Nam).txt
   "Sơn (nam miền Nam)":
     audio: ./sample/Sơn (nam miền Nam).wav
     text: ./sample/Sơn (nam miền Nam).txt
   "Đoan (nữ miền Nam)":
     audio: ./sample/Đoan (nữ miền Nam).wav
     text: ./sample/Đoan (nữ miền Nam).txt
   "Ngọc (nữ miền Bắc)":
     audio: ./sample/Ngọc (nữ miền Bắc).wav
     text: ./sample/Ngọc (nữ miền Bắc).txt
   "Ly (nữ miền Bắc)":
     audio: ./sample/Ly (nữ miền Bắc).wav
     text: ./sample/Ly (nữ miền Bắc).txt
   "Dung (nữ miền Nam)":
     audio: ./sample/Dung (nữ miền Nam).wav
+    text: ./sample/Dung (nữ miền Nam).txt