TTS-Demo

Sleeping

App Files Files Community

CVNSS committed 16 days ago

Commit

3cd1c26

verified ·

1 Parent(s): b74755e

Update app.py

Browse files

Files changed (1) hide show

app.py +651 -97

app.py CHANGED Viewed

@@ -1,11 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-CVNSS4.0 Vietnamese TTS Studio
 - Architecture: Modular CSS & Component Separation
-- UX: High Contrast Input Fields
-- Core: Optimized Logic Flow
 """
 import os
@@ -16,12 +15,15 @@ import glob
 import re
 import hashlib
 import tempfile
 from pathlib import Path
 import torch
 import numpy as np
 import soundfile as sf
 import gradio as gr
 # Add src to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -32,18 +34,18 @@ try:
     from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
     from src.models.synthesizer import SynthesizerTrn
     from src.text.symbols import symbols
-except ImportError:
-    # Fallback for environment setup if src is missing during init
-    print("⚠️ Core modules not found. Ensure 'src' directory exists.")
     VIPHONEME_AVAILABLE = False
     symbols = []
 # =========================================================
-# 1) SYSTEM CONFIGURATION & CSS (The Expert Layer)
 # =========================================================
-# Expert CSS: Definitive Z-Index Management & Neon Theme
 NEON_CSS = r"""
 :root {
     --bg-dark: #0f172a;
@@ -52,13 +54,20 @@ NEON_CSS = r"""
     --text-primary: #e2e8f0;
     --neon-cyan: #06b6d4;
     --neon-accent: #38bdf8;
     --radius-lg: 16px;
     --radius-sm: 8px;
     /* UX Color Palette for Inputs */
-    --input-bg: #f1f5f9;       /* Light Blue-Grey for readability */
-    --input-text: #0f4c81;     /* Classic Blue (Dark Blue) for high contrast */
     --input-placeholder: #64748b;
 }
 body, .gradio-container, .app {
@@ -81,10 +90,21 @@ body, .gradio-container, .app {
     margin-bottom: 20px;
 }
 /* UX IMPROVEMENT: High Contrast Input Styling */
 .panelNeon textarea, .panelNeon input[type="text"] {
     background: var(--input-bg) !important;
-    color: var(--input-text) !important; /* DARK BLUE TEXT requested */
     border: 2px solid transparent !important;
     border-radius: var(--radius-sm) !important;
     font-weight: 500 !important;
@@ -104,7 +124,7 @@ body, .gradio-container, .app {
     background: #ffffff !important;
     border-color: var(--neon-cyan) !important;
     box-shadow: 0 0 0 4px rgba(6, 182, 212, 0.15) !important;
-    color: #000000 !important; /* Even darker on focus */
 }
 /* Label Styling */
@@ -131,14 +151,29 @@ button.primary, .gr-button-primary {
     font-weight: 700 !important;
     transition: transform 0.1s ease, box-shadow 0.2s ease;
 }
 button.primary:hover, .gr-button-primary:hover {
     box-shadow: 0 10px 15px -3px rgba(6, 182, 212, 0.3) !important;
     transform: translateY(-1px);
 }
 button.primary:active {
     transform: translateY(0px);
 }
 /* Status Panel */
 .statusCard {
     background: rgba(15, 23, 42, 0.6);
@@ -146,6 +181,7 @@ button.primary:active {
     padding: 16px;
     border: 1px solid rgba(255,255,255,0.05);
 }
 .pill {
     display: inline-flex;
     align-items: center;
@@ -159,14 +195,85 @@ button.primary:active {
     margin-right: 6px;
     margin-bottom: 6px;
 }
-.alert { padding: 12px; border-radius: 8px; margin-top: 12px; font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 8px;}
-.alertOk { background: rgba(34, 197, 94, 0.1); color: #4ade80; border: 1px solid rgba(34, 197, 94, 0.2); }
-.alertWarn { background: rgba(234, 179, 8, 0.1); color: #facc15; border: 1px solid rgba(234, 179, 8, 0.2); }
 """
 # =========================================================
 # 2) UTILITIES & HELPERS
 # =========================================================
 def check_viphoneme():
     if not VIPHONEME_AVAILABLE:
         print("⚠️ Viphoneme not available.")
@@ -183,10 +290,8 @@ def md5_key(*parts: str) -> str:
     return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
 def split_sentences_vi(text: str, max_chars: int):
-    # Improved splitting logic
     if not text: return []
     text = re.sub(r'\s+', ' ', text).strip()
-    # Split by delimiters keeping delimiters
     parts = re.split(r'([.?!;:])', text)
     chunks = []
@@ -212,10 +317,168 @@ def split_sentences_vi(text: str, max_chars: int):
     return chunks
 # =========================================================
-# 3) CORE ENGINE WRAPPER
 # =========================================================
 class TTSManager:
-    """Singleton-like manager for TTS operations."""
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🔧 Initializing TTS on {self.device}...")
@@ -228,36 +491,70 @@ class TTSManager:
             raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
         self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
         self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
         self.temp_dir.mkdir(parents=True, exist_ok=True)
     def _get_model_dir(self):
         return download_model()
     def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
         try:
             if not text or not text.strip():
                 return None, "⚠️ Empty input"
             key = md5_key(speaker, f"{speed:.2f}", text[:20], str(len(text)))
             out_path = self.temp_dir / f"{key}.wav"
             if out_path.exists():
                 return str(out_path), "✅ Cached (From history)"
-            audio, sr = self.tts.synthesize(
-                text=text, speaker=speaker, length_scale=speed,
-                noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
-            )
-            sf.write(str(out_path), audio, sr)
-            return str(out_path), "✅ Generated successfully"
         except Exception as e:
-            # Capture full traceback if needed, but return clean msg
             return None, f"❌ Error: {str(e)}"
 # =========================================================
-# 4) MODEL LOGIC (PRESERVED & FIXED)
 # =========================================================
 def find_latest_checkpoint(model_dir, prefix="G"):
     pattern = os.path.join(model_dir, f"{prefix}*.pth")
     checkpoints = glob.glob(pattern)
@@ -300,10 +597,6 @@ class VietnameseTTS:
         self.model.eval()
     def synthesize(self, text, speaker, **kwargs):
-        from src.text import cleaned_text_to_sequence
-        from src.nn import commons
-        # 1. Text Processing
         norm_text = process_vietnamese_text(text)
         phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
         phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
@@ -335,66 +628,178 @@ class VietnameseTTS:
         return audio, self.config["data"]["sampling_rate"]
 # =========================================================
-# 5) UI CONSTRUCTION (REFACTORED & COMPLETED)
 # =========================================================
 def create_ui(manager: TTSManager):
     def ui_header():
         return gr.HTML("""
         <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
             <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem; letter-spacing: -0.02em;">
-                🎛️ CVNSS4.0 Vietnamese TTS Studio
             </h1>
             <div style="color: #94a3b8; font-size: 1rem; margin-top: 5px; font-weight: 400;">
-                Thiết kế bởi Long Ngo, 2026 • Phiên bản 1.0.1 Fixed • Dự án mã nguồn mở
             </div>
         </div>
         """)
-    def ui_status_render(text, speaker, speed, chunks, dur, msg):
         return f"""
         <div class="statusCard">
             <div style="margin-bottom:12px; font-weight:700; color:#38bdf8; font-size: 0.9rem; text-transform: uppercase;">
                 📟 Trạng thái hoạt động
             </div>
             <div style="display:flex; flex-wrap:wrap; gap:8px;">
-                <span class="pill">🎤 {speaker}</span>
                 <span class="pill">⚡ {speed}x</span>
                 <span class="pill">📄 {len(text)} ký tự</span>
-                <span class="pill">⏱️ {dur:.2f}s</span>
             </div>
-            <div class="alert {'alertOk' if '✅' in msg else 'alertWarn'}">
                 {msg}
             </div>
         </div>
         """
-    # Event Handler
-    def run_inference(text, speaker, speed):
-        start_t = time.time()
-        # Default Params for Basic Mode
-        noise_scale = 0.667
-        noise_scale_w = 0.8
-        sdp_ratio = 0.2
-        # Basic chunking check (could use split_sentences_vi here if needed)
-        # For now, just direct synthesis
-        audio_path, msg = manager.synthesize(
-            text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio
-        )
-        duration = time.time() - start_t
-        html_status = ui_status_render(text, speaker, speed, 1, duration, msg)
-        return audio_path, html_status
-    with gr.Blocks(theme=gr.themes.Base(), css=NEON_CSS, title="Neon TTS Expert") as app:
         ui_header()
         with gr.Tabs():
             # --- TAB BASIC ---
             with gr.Tab("⚡ Chế độ Nhanh"):
                 with gr.Row():
-                    # INPUT COLUMN
                     with gr.Column(scale=2):
                         with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-basic"):
                             gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
@@ -402,62 +807,211 @@ def create_ui(manager: TTSManager):
                             txt_basic = gr.Textbox(
                                 label="",
                                 show_label=False,
-                                placeholder="Nhập nội dung tiếng Việt vào... (Ví dụ: Xin chào, đây là phiên bản đã sửa lỗi.)",
                                 lines=6,
                                 elem_id="main-input-basic"
                             )
                             with gr.Row():
-                                # === FIX START HERE ===
                                 spk_basic = gr.Dropdown(
-                                    choices=manager.tts.speakers,
-                                    value=manager.tts.speakers[0] if manager.tts.speakers else None,
-                                    label="Người đọc",
-                                    interactive=True,
-                                    scale=2
                                 )
-                                speed_basic = gr.Slider(
-                                    minimum=0.1, maximum=2.0, value=1.0, step=0.1,
-                                    label="Tốc độ",
-                                    scale=2
-                                )
-                                # === FIX END HERE ===
-                            btn_basic = gr.Button("🔊 Đọc Ngay", variant="primary", elem_classes=["gr-button-primary"])
-                    # OUTPUT COLUMN
                     with gr.Column(scale=1):
-                         with gr.Group(elem_classes=["panelNeon"]):
-                            gr.HTML('<div class="panelTitle">🎧 Kết quả</div>')
-                            out_audio_basic = gr.Audio(label="Audio Output", type="filepath", interactive=False)
-                            out_status_basic = gr.HTML()
-                # Bind Event
                 btn_basic.click(
-                    run_inference,
-                    inputs=[txt_basic, spk_basic, speed_basic],
-                    outputs=[out_audio_basic, out_status_basic]
                 )
     return app
 # =========================================================
-# 6) MAIN EXECUTION
 # =========================================================
-if __name__ == "__main__":
     try:
-        # Initialize Manager
         manager = TTSManager()
-        # Check Viphoneme
-        check_viphoneme()
-        # Build App
         app = create_ui(manager)
-        # Launch
-        print("🚀 Launching CVNSS4.0 TTS Studio...")
-        app.launch(share=False)
     except Exception as e:
-        print(f"🔥 Critical Error: {e}")

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+CVNSS4.0 Vietnamese TTS Studio với Voice Cloning
 - Architecture: Modular CSS & Component Separation
+- UX: High Contrast Input Fields + Voice Cloning Tab
+- Core: Optimized Logic Flow với huấn luyện & inference voice cloning
 """
 import os
 import re
 import hashlib
 import tempfile
+import shutil
 from pathlib import Path
+from typing import List, Tuple, Optional, Dict, Any
 import torch
 import numpy as np
 import soundfile as sf
 import gradio as gr
+from tqdm import tqdm
 # Add src to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
     from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
     from src.models.synthesizer import SynthesizerTrn
     from src.text.symbols import symbols
+    from src.nn import commons
+    from src.text import cleaned_text_to_sequence
+except ImportError as e:
+    print(f"⚠️ Import error: {e}")
     VIPHONEME_AVAILABLE = False
     symbols = []
 # =========================================================
+# 1) SYSTEM CONFIGURATION & CSS (The Expert Layer) - UPDATED
 # =========================================================
+# Expert CSS: Definitive Z-Index Management & Neon Theme với Voice Cloning
 NEON_CSS = r"""
 :root {
     --bg-dark: #0f172a;
     --text-primary: #e2e8f0;
     --neon-cyan: #06b6d4;
     --neon-accent: #38bdf8;
+    --neon-purple: #8b5cf6;
+    --neon-pink: #ec4899;
     --radius-lg: 16px;
     --radius-sm: 8px;
     /* UX Color Palette for Inputs */
+    --input-bg: #f1f5f9;
+    --input-text: #0f4c81;
     --input-placeholder: #64748b;
+    /* Voice Cloning Colors */
+    --clone-success: #10b981;
+    --clone-warning: #f59e0b;
+    --clone-error: #ef4444;
 }
 body, .gradio-container, .app {
     margin-bottom: 20px;
 }
+/* Voice Cloning Special Panel */
+.clonePanel {
+    border: 2px dashed var(--neon-purple);
+    background: rgba(139, 92, 246, 0.05);
+}
+.clonePanel:hover {
+    border-color: var(--neon-pink);
+    background: rgba(139, 92, 246, 0.1);
+}
 /* UX IMPROVEMENT: High Contrast Input Styling */
 .panelNeon textarea, .panelNeon input[type="text"] {
     background: var(--input-bg) !important;
+    color: var(--input-text) !important;
     border: 2px solid transparent !important;
     border-radius: var(--radius-sm) !important;
     font-weight: 500 !important;
     background: #ffffff !important;
     border-color: var(--neon-cyan) !important;
     box-shadow: 0 0 0 4px rgba(6, 182, 212, 0.15) !important;
+    color: #000000 !important;
 }
 /* Label Styling */
     font-weight: 700 !important;
     transition: transform 0.1s ease, box-shadow 0.2s ease;
 }
 button.primary:hover, .gr-button-primary:hover {
     box-shadow: 0 10px 15px -3px rgba(6, 182, 212, 0.3) !important;
     transform: translateY(-1px);
 }
 button.primary:active {
     transform: translateY(0px);
 }
+/* Voice Cloning Special Buttons */
+button.clone-btn {
+    background: linear-gradient(135deg, var(--neon-purple) 0%, var(--neon-pink) 100%) !important;
+    border: none !important;
+    color: white !important;
+    font-weight: 700 !important;
+}
+button.clone-btn:hover {
+    box-shadow: 0 10px 15px -3px rgba(139, 92, 246, 0.3) !important;
+    transform: translateY(-1px);
+}
 /* Status Panel */
 .statusCard {
     background: rgba(15, 23, 42, 0.6);
     padding: 16px;
     border: 1px solid rgba(255,255,255,0.05);
 }
 .pill {
     display: inline-flex;
     align-items: center;
     margin-right: 6px;
     margin-bottom: 6px;
 }
+.clone-pill {
+    background: rgba(139, 92, 246, 0.1);
+    color: var(--neon-purple);
+    border: 1px solid rgba(139, 92, 246, 0.2);
+}
+.alert {
+    padding: 12px;
+    border-radius: 8px;
+    margin-top: 12px;
+    font-size: 0.9rem;
+    font-weight: 500;
+    display: flex;
+    align-items: center;
+    gap: 8px;
+}
+.alertOk {
+    background: rgba(34, 197, 94, 0.1);
+    color: #4ade80;
+    border: 1px solid rgba(34, 197, 94, 0.2);
+}
+.alertWarn {
+    background: rgba(234, 179, 8, 0.1);
+    color: #facc15;
+    border: 1px solid rgba(234, 179, 8, 0.2);
+}
+.alertClone {
+    background: rgba(139, 92, 246, 0.1);
+    color: var(--neon-purple);
+    border: 1px solid rgba(139, 92, 246, 0.2);
+}
+.alertCloneSuccess {
+    background: rgba(16, 185, 129, 0.1);
+    color: var(--clone-success);
+    border: 1px solid rgba(16, 185, 129, 0.2);
+}
+/* Progress Bar Styling */
+.progress-bar {
+    height: 8px;
+    border-radius: 4px;
+    background: rgba(255, 255, 255, 0.1);
+    overflow: hidden;
+    margin: 10px 0;
+}
+.progress-fill {
+    height: 100%;
+    background: linear-gradient(90deg, var(--neon-purple), var(--neon-pink));
+    border-radius: 4px;
+    transition: width 0.3s ease;
+}
+/* File Upload Styling */
+.upload-area {
+    border: 2px dashed var(--neon-purple);
+    border-radius: var(--radius-sm);
+    padding: 30px;
+    text-align: center;
+    background: rgba(139, 92, 246, 0.05);
+    cursor: pointer;
+    transition: all 0.3s ease;
+}
+.upload-area:hover {
+    background: rgba(139, 92, 246, 0.1);
+    border-color: var(--neon-pink);
+}
 """
 # =========================================================
 # 2) UTILITIES & HELPERS
 # =========================================================
 def check_viphoneme():
     if not VIPHONEME_AVAILABLE:
         print("⚠️ Viphoneme not available.")
     return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
 def split_sentences_vi(text: str, max_chars: int):
     if not text: return []
     text = re.sub(r'\s+', ' ', text).strip()
     parts = re.split(r'([.?!;:])', text)
     chunks = []
     return chunks
 # =========================================================
+# 3) VOICE CLONING MODULE
 # =========================================================
class VoiceCloningManager:
    """Manage cloned voices: sample validation, embedding storage and metadata.

    All state is persisted under ``cloned_voices/`` next to this file: one
    sub-directory per speaker plus a ``speakers.json`` index.

    NOTE: embedding extraction and "model creation" are demo placeholders (a
    random unit vector and two JSON files); no real training happens here.
    """

    # Size of the placeholder speaker embedding, also recorded in config.json.
    EMBEDDING_DIM = 256

    def __init__(self, base_model_path: str, config_path: str, device: str = "cpu"):
        """Load the base model config and the persisted speaker index.

        Args:
            base_model_path: Path of the base checkpoint (stored, not loaded here).
            config_path: Path of the JSON config of the base model.
            device: Torch device on which embeddings are created.

        Raises:
            OSError: If ``config_path`` cannot be opened.
            ValueError: If ``config_path`` does not contain valid JSON.
        """
        self.device = device
        self.base_model_path = base_model_path
        self.config_path = config_path
        self.clone_dir = Path(__file__).parent / "cloned_voices"
        self.clone_dir.mkdir(exist_ok=True)
        # Load base model config
        with open(config_path, "r", encoding="utf-8") as f:
            self.config = json.load(f)
        # Speaker management
        self.speaker_file = self.clone_dir / "speakers.json"
        self.speakers = self.load_speakers()

    def _speaker_dir(self, speaker_name: str) -> Path:
        """Return the storage directory for ``speaker_name``.

        The name becomes a single path component, so anything that could
        escape ``clone_dir`` (empty, ``.``/``..``, path separators, NUL) is
        rejected.

        Raises:
            ValueError: If ``speaker_name`` is not a safe directory name.
        """
        if (
            not isinstance(speaker_name, str)
            or not speaker_name
            or speaker_name in {".", ".."}
            or any(ch in speaker_name for ch in ("/", "\\", "\x00"))
        ):
            raise ValueError(f"Invalid speaker name: {speaker_name!r}")
        return self.clone_dir / speaker_name

    def load_speakers(self) -> Dict:
        """Load the speaker index, falling back to an empty index.

        A missing, unreadable or malformed ``speakers.json`` yields the empty
        index instead of aborting start-up; missing keys are filled in.
        """
        default = {"base_speakers": [], "cloned_speakers": []}
        if not self.speaker_file.exists():
            return default
        try:
            with open(self.speaker_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"⚠️ Could not read {self.speaker_file}: {e}")
            return default
        if not isinstance(data, dict):
            return default
        for key, value in default.items():
            data.setdefault(key, value)
        return data

    def save_speakers(self):
        """Write the in-memory speaker index to ``speakers.json``."""
        with open(self.speaker_file, "w", encoding="utf-8") as f:
            json.dump(self.speakers, f, indent=2, ensure_ascii=False)

    def extract_voice_embeddings(self, audio_files: List[str], speaker_name: str) -> Optional[torch.Tensor]:
        """Store the samples and a placeholder embedding for ``speaker_name``.

        Simplified demo version: the embedding is a random unit vector. A real
        implementation needs a speaker encoder (ECAPA-TDNN, WavLM, ...).

        Args:
            audio_files: Paths of the reference recordings; paths that do not
                exist are skipped.
            speaker_name: Directory-safe identifier of the new voice.

        Returns:
            The embedding tensor, or ``None`` if anything failed (including an
            unsafe ``speaker_name``). The error is printed, not raised.
        """
        try:
            speaker_dir = self._speaker_dir(speaker_name)
            # Placeholder: random embedding, normalized to unit length.
            embedding = torch.randn(self.EMBEDDING_DIM, device=self.device)
            embedding = embedding / torch.norm(embedding)
            speaker_dir.mkdir(exist_ok=True)
            # Keep a copy of the samples next to the embedding.
            for i, audio_file in enumerate(audio_files):
                if os.path.exists(audio_file):
                    shutil.copy2(audio_file, speaker_dir / f"sample_{i}.wav")
            torch.save(embedding, speaker_dir / "embedding.pt")
            # Register the speaker in the index once.
            if speaker_name not in self.speakers["cloned_speakers"]:
                self.speakers["cloned_speakers"].append(speaker_name)
                self.save_speakers()
            return embedding
        except Exception as e:
            print(f"❌ Error extracting embeddings: {e}")
            return None

    def create_cloned_voice_model(self, speaker_name: str, base_speaker: str = "vi-male") -> bool:
        """Write ``config.json`` and ``metadata.json`` for a cloned voice.

        Simplified demo version: no fine-tuning or adapter is produced, only
        the descriptive files. The speaker directory must already exist.

        Args:
            speaker_name: Directory-safe identifier of the cloned voice.
            base_speaker: Name of the base speaker recorded in the config.

        Returns:
            ``True`` on success, ``False`` if anything failed (the error is
            printed, not raised).
        """
        try:
            speaker_dir = self._speaker_dir(speaker_name)
            clone_config = {
                "speaker_name": speaker_name,
                "base_speaker": base_speaker,
                "created_at": time.time(),
                "embedding_dim": self.EMBEDDING_DIM,
                "status": "ready"
            }
            # Explicit UTF-8 so non-ASCII speaker names work on any locale.
            with open(speaker_dir / "config.json", "w", encoding="utf-8") as f:
                json.dump(clone_config, f, indent=2)
            sample_count = len(list(speaker_dir.glob("sample_*.wav")))
            metadata = {
                "speaker_name": speaker_name,
                "display_name": speaker_name.replace("_", " ").title(),
                "type": "cloned",
                "quality": "good" if sample_count >= 3 else "fair"
            }
            # ensure_ascii=False emits raw Unicode, so the encoding must be fixed.
            with open(speaker_dir / "metadata.json", "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            return True
        except Exception as e:
            print(f"❌ Error creating cloned model: {e}")
            return False

    def get_available_cloned_voices(self) -> List[Dict]:
        """Return the metadata dict of every cloned voice found on disk.

        Directories are visited in sorted order. A directory without
        ``metadata.json`` is ignored; an unreadable or malformed metadata file
        is reported and skipped so one bad entry cannot hide the others.
        """
        voices = []
        for speaker_dir in sorted(self.clone_dir.iterdir()):
            if not speaker_dir.is_dir():
                continue
            metadata_file = speaker_dir / "metadata.json"
            if not metadata_file.exists():
                continue
            try:
                with open(metadata_file, "r", encoding="utf-8") as f:
                    metadata = json.load(f)
            except (OSError, ValueError) as e:
                print(f"⚠️ Skipping unreadable metadata {metadata_file}: {e}")
                continue
            if isinstance(metadata, dict):
                voices.append(metadata)
        return voices

    def validate_audio_files(self, audio_files: List[str], min_duration: float = 2.0, max_duration: float = 30.0) -> Tuple[bool, str]:
        """Check that the uploaded samples are usable for voice cloning.

        Rules: 1-10 existing files, each mono and between ``min_duration`` and
        ``max_duration`` seconds long, with at least 10 seconds in total.

        Returns:
            ``(ok, message)`` where ``message`` is the user-facing explanation.
        """
        if not audio_files:
            return False, "Cần ít nhất 1 file audio"
        if len(audio_files) > 10:
            return False, "Tối đa 10 file audio"
        total_duration = 0
        for audio_file in audio_files:
            if not os.path.exists(audio_file):
                return False, f"File không tồn tại: {audio_file}"
            try:
                with sf.SoundFile(audio_file) as f:
                    duration = f.frames / f.samplerate
                    channels = f.channels
            except Exception as e:
                return False, f"Lỗi đọc file {audio_file}: {str(e)}"
            total_duration += duration
            if duration < min_duration:
                return False, f"File quá ngắn (<{min_duration}s): {os.path.basename(audio_file)}"
            if duration > max_duration:
                return False, f"File quá dài (>{max_duration}s): {os.path.basename(audio_file)}"
            if channels != 1:
                return False, f"Chỉ hỗ trợ mono audio: {os.path.basename(audio_file)}"
        if total_duration < 10.0:
            return False, f"Tổng thời lượng audio quá ngắn ({total_duration:.1f}s < 10s)"
        return True, f"✅ Đã xác thực {len(audio_files)} file, tổng {total_duration:.1f}s"
+# =========================================================
+# 4) CORE ENGINE WRAPPER (UPDATED)
+# =========================================================
 class TTSManager:
+    """Singleton-like manager for TTS operations với voice cloning support."""
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🔧 Initializing TTS on {self.device}...")
             raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
         self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
+        # Khởi tạo Voice Cloning Manager
+        self.clone_manager = VoiceCloningManager(self.ckpt_path, self.cfg_path, self.device)
         self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
         self.temp_dir.mkdir(parents=True, exist_ok=True)
+        # Combine speakers
+        self.all_speakers = self.get_all_speakers()
     def _get_model_dir(self):
         """Return whatever ``download_model()`` returns.

         NOTE(review): presumably the local directory holding the checkpoint
         and config; ``download_model`` is defined outside this view, confirm.
         """
         return download_model()
def get_all_speakers(self) -> List[str]:
    """Return the base speakers followed by every cloned voice.

    Cloned voices are listed under their ``speaker_name`` prefixed with
    ``"[CLONE] "`` so they can be told apart from base speakers.
    """
    base_speakers = self.tts.speakers
    tagged_clones = [
        f"[CLONE] {voice['speaker_name']}"
        for voice in self.clone_manager.get_available_cloned_voices()
    ]
    return base_speakers + tagged_clones
     def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
         """Synthesize ``text`` into a cached WAV file.

         Args:
             text: Text to speak; empty or whitespace-only input is rejected.
             speaker: Base speaker name, or ``"[CLONE] <name>"`` for a cloned voice.
             speed: Passed to the engine as ``length_scale``.
             noise_scale, noise_scale_w, sdp_ratio: Forwarded to the engine unchanged.

         Returns:
             ``(path, message)``; ``path`` is ``None`` when nothing was produced.
             Any exception is caught and reported through ``message``.
         """
         clone_tag = "[CLONE] "
         try:
             if not text or not text.strip():
                 return None, "⚠️ Empty input"
             is_cloned = speaker.startswith(clone_tag)
             # Strip only the leading tag, not every occurrence inside the name.
             actual_speaker = speaker[len(clone_tag):] if is_cloned else speaker
             # The key must cover the full text and every synthesis parameter;
             # hashing only a prefix and the length returns the wrong audio for
             # texts that share both. Text goes last because it may contain "|".
             key = md5_key(
                 speaker,
                 f"{speed:.2f}",
                 f"{noise_scale:.3f}",
                 f"{noise_scale_w:.3f}",
                 f"{sdp_ratio:.3f}",
                 text,
             )
             out_path = self.temp_dir / f"{key}.wav"
             if out_path.exists():
                 return str(out_path), "✅ Cached (From history)"
             # NOTE: cloned voices are a demo placeholder and are rendered with
             # the hard-coded base speaker "vi-male"; a real implementation
             # would load the voice's own model or embedding.
             model_speaker = "vi-male" if is_cloned else speaker
             audio, sr = self.tts.synthesize(
                 text=text, speaker=model_speaker, length_scale=speed,
                 noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
             )
             # Write to a scratch name first so a failed write can never be
             # mistaken for a valid cache entry on the next request.
             partial_path = self.temp_dir / f"{key}.partial.wav"
             sf.write(str(partial_path), audio, sr)
             os.replace(str(partial_path), str(out_path))
             if is_cloned:
                 return str(out_path), f"✅ Generated with cloned voice: {actual_speaker}"
             return str(out_path), "✅ Generated successfully"
         except Exception as e:
             return None, f"❌ Error: {str(e)}"
 # =========================================================
+# 5) MODEL LOGIC (PRESERVED & FIXED)
 # =========================================================
 def find_latest_checkpoint(model_dir, prefix="G"):
     pattern = os.path.join(model_dir, f"{prefix}*.pth")
     checkpoints = glob.glob(pattern)
         self.model.eval()
     def synthesize(self, text, speaker, **kwargs):
         norm_text = process_vietnamese_text(text)
         phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
         phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
         return audio, self.config["data"]["sampling_rate"]
 # =========================================================
+# 6) UI CONSTRUCTION (REFACTORED WITH VOICE CLONING)
 # =========================================================
 def create_ui(manager: TTSManager):
     def ui_header():
         """Return the static page header (title and credit line) as a ``gr.HTML`` component."""
         return gr.HTML("""
         <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
             <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem; letter-spacing: -0.02em;">
+                🎛️ CVNSS4.0 Vietnamese TTS Studio với Voice Cloning
             </h1>
             <div style="color: #94a3b8; font-size: 1rem; margin-top: 5px; font-weight: 400;">
+                Thiết kế bởi Long Ngo, 2026 • Phiên bản 2.0 với Voice Cloning • Dự án mã nguồn mở
             </div>
         </div>
         """)
def ui_status_render(text, speaker, speed, chunks, dur, msg, is_cloned=False):
    """Render the synthesis status card as an HTML string.

    Args:
        text: Input text; only its length is shown.
        speaker: Speaker label as selected in the UI.
        speed: Speed factor shown as ``<speed>x``.
        chunks: Number of sentence chunks the text splits into.
        dur: Duration of the generated audio in seconds.
        msg: Status message; a ``✅`` in it selects the success styling.
        is_cloned: Whether the speaker is a cloned voice.

    Returns:
        HTML markup for the status card.
    """
    import html  # local import keeps this block self-contained

    cloned_badge = " 🎭" if is_cloned else ""
    pill_class = "pill clone-pill" if is_cloned else "pill"
    if "✅" in msg:
        alert_class = "alertCloneSuccess" if is_cloned else "alertOk"
    else:
        alert_class = "alertWarn"
    # speaker and msg can carry user-typed names or exception text, so they
    # are escaped before being placed into markup.
    safe_speaker = html.escape(str(speaker))
    safe_msg = html.escape(str(msg))
    return f"""
    <div class="statusCard">
        <div style="margin-bottom:12px; font-weight:700; color:#38bdf8; font-size: 0.9rem; text-transform: uppercase;">
            📟 Trạng thái hoạt động
        </div>
        <div style="display:flex; flex-wrap:wrap; gap:8px;">
            <span class="{pill_class}">🎤 {safe_speaker}{cloned_badge}</span>
            <span class="pill">⚡ {speed}x</span>
            <span class="pill">📄 {len(text or "")} ký tự</span>
            <span class="pill">🧩 {chunks} đoạn</span>
            <span class="pill">⏱️ {dur:.2f}s</span>
        </div>
        <div class="alert {alert_class}">
            {safe_msg}
        </div>
    </div>
    """
def ui_clone_status_render(stage, progress, message, error=None):
    """Render the voice-cloning progress card as an HTML string.

    Args:
        stage: Short label of the current stage.
        progress: Percentage (0-100); the bar is omitted when it is not positive.
        message: Main status line; a ``✅`` in it selects the success styling.
        error: Optional detail shown in an extra warning box.

    Returns:
        HTML markup for the progress card.
    """
    import html  # local import keeps this block self-contained

    # stage/message/error embed the user-typed speaker name, uploaded file
    # names and exception text, so they are escaped before entering markup.
    safe_stage = html.escape(str(stage))
    safe_message = html.escape(str(message))

    progress_html = ""
    if progress > 0:
        progress_html = f"""
        <div class="progress-bar">
            <div class="progress-fill" style="width: {progress}%"></div>
        </div>
        <div style="text-align: center; font-size: 0.8rem; color: #94a3b8;">
            {progress}%
        </div>
        """

    error_html = ""
    if error:
        error_html = f"""
        <div class="alert alertWarn" style="margin-top: 10px;">
            ⚠️ {html.escape(str(error))}
        </div>
        """

    if "✅" in message:
        alert_class = "alertCloneSuccess"
    elif not error:
        alert_class = "alertClone"
    else:
        alert_class = "alertWarn"

    return f"""
    <div class="statusCard">
        <div style="margin-bottom:12px; font-weight:700; color:#8b5cf6; font-size: 0.9rem; text-transform: uppercase;">
            🎭 Voice Cloning Progress
        </div>
        <div style="margin-bottom:10px;">
            <span class="pill clone-pill">📊 {safe_stage}</span>
        </div>
        {progress_html}
        <div class="alert {alert_class}" style="margin-top: 15px;">
            {safe_message}
        </div>
        {error_html}
    </div>
    """
def process_basic(text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
    """Handle the quick-mode button: synthesize and build the status card.

    Returns:
        ``(audio_path, status_html)``; ``audio_path`` is ``None`` when nothing
        was generated.
    """
    # Gradio can hand over None for a cleared textbox, so test that first.
    if not text or not text.strip():
        return None, ui_status_render("", speaker, speed, 0, 0, "⚠️ Vui lòng nhập văn bản", False)
    chunks = split_sentences_vi(text, 200)
    audio_path, msg = manager.synthesize(text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio)
    dur = 0
    if audio_path and os.path.exists(audio_path):
        # Report the real length of the generated file, not the wall time.
        with sf.SoundFile(audio_path) as f:
            dur = f.frames / f.samplerate
    # With no speaker selected, manager.synthesize already reported the error
    # through msg; do not crash here while building the card.
    is_cloned = isinstance(speaker, str) and speaker.startswith("[CLONE] ")
    return audio_path, ui_status_render(text, speaker, speed, len(chunks), dur, msg, is_cloned)
def process_clone_voice(speaker_name, audio_files, base_speaker, progress=gr.Progress()):
    """Run the voice-cloning pipeline for the cloning tab.

    Steps: normalize and check the speaker name, validate the uploads, store
    the embedding, write the voice files, then refresh the speaker list.

    Args:
        speaker_name: Name typed by the user; normalized to lower-case with
            spaces replaced by underscores.
        audio_files: Uploaded reference recordings.
        base_speaker: Base speaker recorded for the cloned voice.
        progress: Gradio progress tracker.

    Returns:
        HTML of the cloning status card; every failure is reported there.
    """
    try:
        progress(0, desc="📁 Đang xác thực files...")
        # Check the speaker name
        if not speaker_name or not speaker_name.strip():
            return ui_clone_status_render("Lỗi", 0, "❌ Vui lòng nhập tên giọng nói", "Tên speaker không hợp lệ")
        speaker_name = speaker_name.strip().replace(" ", "_").lower()
        # The name becomes a directory name: refuse anything that could point
        # outside the clone directory.
        if speaker_name in {".", ".."} or any(ch in speaker_name for ch in ("/", "\\", "\x00")):
            return ui_clone_status_render("Lỗi", 0, "❌ Vui lòng nhập tên giọng nói", "Tên speaker không hợp lệ")
        # Check the uploads
        if not audio_files:
            return ui_clone_status_render("Lỗi", 0, "❌ Không có file audio", "Vui lòng upload ít nhất 1 file audio")
        # NOTE(review): depending on the Gradio version and the File component
        # settings, uploads arrive as path strings or as temp-file wrappers
        # exposing ``.name``; accept both. Confirm against the component.
        audio_files = [
            item if isinstance(item, str) else getattr(item, "name", item)
            for item in audio_files
        ]
        is_valid, validation_msg = manager.clone_manager.validate_audio_files(audio_files)
        if not is_valid:
            return ui_clone_status_render("Lỗi", 0, "❌ Validation failed", validation_msg)
        progress(0.2, desc="🎵 Đang trích xuất embedding...")
        embedding = manager.clone_manager.extract_voice_embeddings(audio_files, speaker_name)
        if embedding is None:
            return ui_clone_status_render("Lỗi", 30, "❌ Không thể trích xuất embedding", "Lỗi trong quá trình xử lý audio")
        progress(0.5, desc="🤖 Đang tạo model cloned voice...")
        success = manager.clone_manager.create_cloned_voice_model(speaker_name, base_speaker)
        if not success:
            return ui_clone_status_render("Lỗi", 70, "❌ Không thể tạo cloned voice", "Lỗi trong quá trình tạo model")
        progress(0.8, desc="💾 Đang cập nhật hệ thống...")
        # Refresh the combined speaker list so the new voice is selectable.
        manager.all_speakers = manager.get_all_speakers()
        progress(1.0, desc="✅ Hoàn thành!")
        return ui_clone_status_render(
            "Hoàn thành",
            100,
            f"✅ Đã tạo cloned voice: {speaker_name} từ {len(audio_files)} file audio. Bạn có thể chọn speaker '[CLONE] {speaker_name}' trong tab TTS."
        )
    except Exception as e:
        return ui_clone_status_render("Lỗi", 0, f"❌ Lỗi: {str(e)}", str(e))
+    def update_speaker_dropdown():
+        """Cập nhật dropdown speakers với cloned voices"""
+        return gr.Dropdown.update(choices=manager.get_all_speakers())
+    def list_cloned_voices():
+        """Hiển thị danh sách cloned voices"""
+        voices = manager.clone_manager.get_available_cloned_voices()
+        if not voices:
+            return gr.HTML.update(value="<div class='alert alertWarn'>Chưa có cloned voices nào. Hãy tạo voice mới trong tab '🎭 Clone Voice'.</div>")
+        html = "<div style='display: grid; gap: 10px;'>"
+        for voice in voices:
+            html += f"""
+            <div class="statusCard" style="padding: 15px;">
+                <div style="display: flex; justify-content: space-between; align-items: center;">
+                    <div>
+                        <strong style="color: #8b5cf6;">{voice.get('display_name', voice['speaker_name'])}</strong>
+                        <div style="font-size: 0.8rem; color: #94a3b8;">
+                            Type: {voice.get('type', 'cloned')} • Quality: {voice.get('quality', 'unknown')}
+                        </div>
+                    </div>
+                    <span class="pill clone-pill">🎭 Cloned</span>
+                </div>
+            </div>
+            """
+        html += "</div>"
+        return gr.HTML.update(value=html)
+    with gr.Blocks(theme=gr.themes.Base(), css=NEON_CSS, title="CVNSS TTS với Voice Cloning") as app:
         ui_header()
         with gr.Tabs():
             # --- TAB BASIC ---
             with gr.Tab("⚡ Chế độ Nhanh"):
                 with gr.Row():
                     with gr.Column(scale=2):
                         with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-basic"):
                             gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
                             txt_basic = gr.Textbox(
                                 label="",
                                 show_label=False,
+                                placeholder="Nhập nội dung tiếng Việt vào... (Ví dụ: Xin chào, bạn đã học qua CVNSS4.0 chưa?)",
                                 lines=6,
                                 elem_id="main-input-basic"
                             )
                             with gr.Row():
                                 spk_basic = gr.Dropdown(
+                                    choices=manager.get_all_speakers(),
+                                    value=manager.tts.speakers[0] if manager.tts.speakers else "",
+                                    label="",
+                                    elem_id="spk-basic"
                                 )
+                                speed_basic = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ", elem_id="speed-basic")
+                            with gr.Row():
+                                noise_scale_basic = gr.Slider(0.1, 1.0, value=0.5, step=0.05, label="Nhiễu (noise scale)", elem_id="noise-basic")
+                                noise_scale_w_basic = gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Nhiễu W (noise scale w)", elem_id="noise-w-basic")
+                                sdp_ratio_basic = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="SDP Ratio", elem_id="sdp-basic")
+                            btn_basic = gr.Button("🔊 Tổng hợp giọng nói", variant="primary", elem_id="btn-basic")
+                        status_basic = gr.HTML(
+                            ui_status_render("", manager.tts.speakers[0] if manager.tts.speakers else "", 1.0, 0, 0, "Chờ...", False),
+                            elem_id="status-basic"
+                        )
                     with gr.Column(scale=1):
+                        audio_basic = gr.Audio(label="Âm thanh kết quả", type="filepath", elem_id="audio-basic")
+                # Events
                 btn_basic.click(
+                    fn=process_basic,
+                    inputs=[txt_basic, spk_basic, speed_basic, noise_scale_basic, noise_scale_w_basic, sdp_ratio_basic],
+                    outputs=[audio_basic, status_basic]
                 )
+            # --- TAB VOICE CLONING ---
+            with gr.Tab("🎭 Clone Voice"):
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        with gr.Group(elem_classes=["panelNeon", "clonePanel"], elem_id="clone-panel"):
+                            gr.HTML('<div class="panelTitle" style="color: #8b5cf6;">🎭 Tạo Giọng Nói Cá Nhân</div>')
+                            with gr.Row():
+                                with gr.Column(scale=1):
+                                    speaker_name = gr.Textbox(
+                                        label="Tên giọng nói",
+                                        placeholder="vd: john_doe, my_voice, ...",
+                                        info="Tên không dấu, không ký tự đặc biệt"
+                                    )
+                                    base_speaker = gr.Dropdown(
+                                        choices=manager.tts.speakers,
+                                        value=manager.tts.speakers[0] if manager.tts.speakers else "",
+                                        label="Giọng nói cơ sở",
+                                        info="Chọn giọng gốc để fine-tune"
+                                    )
+                                with gr.Column(scale=2):
+                                    audio_files = gr.File(
+                                        label="Upload audio samples",
+                                        file_types=["audio"],
+                                        file_count="multiple",
+                                        type="filepath",
+                                        elem_id="clone-audio-upload"
+                                    )
+                                    gr.HTML("""
+                                    <div class="alert alertClone">
+                                        💡 <strong>Hướng dẫn:</strong><br/>
+                                        • Upload 3-10 file audio chất lượng tốt (định dạng WAV, MP3)<br/>
+                                        • Mỗi file dài 5-30 giây, giọng nói rõ ràng<br/>
+                                        • Tổng thời lượng ≥ 10 giây để có chất lượng tốt nhất<br/>
+                                        • File mono, sample rate 16kHz-44.1kHz
+                                    </div>
+                                    """)
+                            btn_clone = gr.Button(
+                                "🎭 Bắt đầu Clone Voice",
+                                variant="primary",
+                                elem_classes=["clone-btn"],
+                                elem_id="btn-clone-process"
+                            )
+                        clone_status = gr.HTML(
+                            ui_clone_status_render("Chờ...", 0, "Sẵn sàng tạo cloned voice"),
+                            elem_id="clone-status"
+                        )
+                    with gr.Column(scale=1):
+                        with gr.Group(elem_classes=["panelNeon"], elem_id="clone-info-panel"):
+                            gr.HTML('<div class="panelTitle">📋 Cloned Voices</div>')
+                            btn_refresh = gr.Button("🔄 Làm mới danh sách", size="sm")
+                            cloned_list = gr.HTML(elem_id="cloned-voices-list")
+                # Voice Cloning Events
+                btn_clone.click(
+                    fn=process_clone_voice,
+                    inputs=[speaker_name, audio_files, base_speaker],
+                    outputs=[clone_status]
+                ).then(
+                    fn=update_speaker_dropdown,
+                    outputs=[spk_basic]
+                ).then(
+                    fn=list_cloned_voices,
+                    outputs=[cloned_list]
+                )
+                btn_refresh.click(
+                    fn=list_cloned_voices,
+                    outputs=[cloned_list]
+                )
+                # Initial load
+                app.load(
+                    fn=list_cloned_voices,
+                    outputs=[cloned_list]
+                )
+            # --- TAB ADVANCED SETTINGS ---
+            with gr.Tab("⚙️ Cài Đặt Nâng Cao"):
+                with gr.Group(elem_classes=["panelNeon"]):
+                    gr.HTML('<div class="panelTitle">⚙️ Cấu hình hệ thống</div>')
+                    with gr.Row():
+                        with gr.Column():
+                            gr.Markdown("### Voice Cloning Settings")
+                            min_duration = gr.Slider(1.0, 10.0, value=2.0, step=0.5, label="Độ dài tối thiểu mỗi file (s)")
+                            max_duration = gr.Slider(10.0, 60.0, value=30.0, step=5.0, label="Độ dài tối đa mỗi file (s)")
+                            min_total_duration = gr.Slider(5.0, 60.0, value=10.0, step=5.0, label="Tổng độ dài tối thiểu (s)")
+                        with gr.Column():
+                            gr.Markdown("### Cache Management")
+                            btn_clear_cache = gr.Button("🗑️ Xóa cache", variant="secondary")
+                            cache_info = gr.HTML("", elem_id="cache-info")
+                    def clear_cache():
+                        cache_dir = manager.temp_dir
+                        if cache_dir.exists():
+                            count = len(list(cache_dir.glob("*.wav")))
+                            shutil.rmtree(cache_dir)
+                            manager.temp_dir.mkdir(parents=True, exist_ok=True)
+                            return f"<div class='alert alertOk'>✅ Đã xóa {count} file cache</div>"
+                        return "<div class='alert alertWarn'>⚠️ Không có cache để xóa</div>"
+                    btn_clear_cache.click(
+                        fn=clear_cache,
+                        outputs=[cache_info]
+                    )
+        # Global events
+        app.load(
+            fn=update_speaker_dropdown,
+            outputs=[spk_basic]
+        )
     return app
 # =========================================================
+# 7) MAIN ENTRY POINT
 # =========================================================
+def main():
+    print("🚀 Khởi động CVNSS4.0 TTS với Voice Cloning...")
     try:
+        # Khởi tạo manager
         manager = TTSManager()
+        # Tạo UI
         app = create_ui(manager)
+        # Khởi chạy
+        print("✅ Hệ thống đã sẵn sàng!")
+        print(f"📊 Tổng số speakers: {len(manager.all_speakers)}")
+        print(f"🎭 Cloned voices: {len([s for s in manager.all_speakers if s.startswith('[CLONE]')])}")
+        print("🌐 Server đang chạy tại: http://localhost:7860")
+        return app
     except Exception as e:
+        print(f"❌ Lỗi khởi động: {e}")
+        import traceback
+        traceback.print_exc()
+        # Fallback UI nếu có lỗi
+        with gr.Blocks(css=NEON_CSS, title="CVNSS TTS - Error") as app:
+            gr.HTML(f"""
+            <div style="padding: 40px; text-align: center;">
+                <h1 style="color: #ef4444;">❌ Lỗi khởi động hệ thống</h1>
+                <div style="background: rgba(239, 68, 68, 0.1); padding: 20px; border-radius: 10px; margin: 20px 0;">
+                    <code>{str(e)}</code>
+                </div>
+                <p>Vui lòng kiểm tra log để biết thêm chi tiết.</p>
+            </div>
+            """)
+        return app
+if __name__ == "__main__":
+    app = main()
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True,
+        show_error=True
+    )