TTS-Demo

Sleeping

App Files Community

CVNSS committed 16 days ago

Commit

7c8d39b

verified ·

1 Parent(s): 3cd1c26

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -774

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-CVNSS4.0 Vietnamese TTS Studio với Voice Cloning
-- Architecture: Modular CSS & Component Separation
-- UX: High Contrast Input Fields + Voice Cloning Tab
-- Core: Optimized Logic Flow với huấn luyện & inference voice cloning
 """
 import os
@@ -15,37 +15,67 @@ import glob
 import re
 import hashlib
 import tempfile
 import shutil
 from pathlib import Path
-from typing import List, Tuple, Optional, Dict, Any
 import torch
 import numpy as np
 import soundfile as sf
 import gradio as gr
-from tqdm import tqdm
-# Add src to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
-# Import core modules
 try:
     from src.vietnamese.text_processor import process_vietnamese_text
     from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
     from src.models.synthesizer import SynthesizerTrn
     from src.text.symbols import symbols
-    from src.nn import commons
-    from src.text import cleaned_text_to_sequence
 except ImportError as e:
-    print(f"⚠️ Import error: {e}")
     VIPHONEME_AVAILABLE = False
     symbols = []
 # =========================================================
-# 1) SYSTEM CONFIGURATION & CSS (The Expert Layer) - UPDATED
 # =========================================================
-# Expert CSS: Definitive Z-Index Management & Neon Theme với Voice Cloning
 NEON_CSS = r"""
 :root {
     --bg-dark: #0f172a;
@@ -54,29 +84,17 @@ NEON_CSS = r"""
     --text-primary: #e2e8f0;
     --neon-cyan: #06b6d4;
     --neon-accent: #38bdf8;
-    --neon-purple: #8b5cf6;
-    --neon-pink: #ec4899;
     --radius-lg: 16px;
     --radius-sm: 8px;
-    /* UX Color Palette for Inputs */
     --input-bg: #f1f5f9;
     --input-text: #0f4c81;
     --input-placeholder: #64748b;
-    /* Voice Cloning Colors */
-    --clone-success: #10b981;
-    --clone-warning: #f59e0b;
-    --clone-error: #ef4444;
 }
 body, .gradio-container, .app {
     background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important;
     color: var(--text-primary) !important;
     font-family: 'Inter', 'Segoe UI', sans-serif;
 }
-/* --- ISOLATION FULL: CVNSS4.0 Vietnamese TTS Studio --- */
 .panelNeon {
     border: 1px solid rgba(255,255,255,0.08);
     border-radius: var(--radius-lg);
@@ -84,24 +102,8 @@ body, .gradio-container, .app {
     backdrop-filter: blur(12px);
     box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
     padding: 20px;
-    position: relative;
-    isolation: isolate;
-    z-index: 1;
     margin-bottom: 20px;
 }
-/* Voice Cloning Special Panel */
-.clonePanel {
-    border: 2px dashed var(--neon-purple);
-    background: rgba(139, 92, 246, 0.05);
-}
-.clonePanel:hover {
-    border-color: var(--neon-pink);
-    background: rgba(139, 92, 246, 0.1);
-}
-/* UX IMPROVEMENT: High Contrast Input Styling */
 .panelNeon textarea, .panelNeon input[type="text"] {
     background: var(--input-bg) !important;
     color: var(--input-text) !important;
@@ -109,79 +111,20 @@ body, .gradio-container, .app {
     border-radius: var(--radius-sm) !important;
     font-weight: 500 !important;
     font-size: 1rem !important;
-    line-height: 1.5 !important;
     padding: 12px !important;
-    transition: all 0.2s ease;
-    z-index: 10 !important;
-    position: relative !important;
-}
-.panelNeon textarea::placeholder {
-    color: var(--input-placeholder) !important;
 }
-.panelNeon textarea:focus, .panelNeon input:focus {
-    background: #ffffff !important;
-    border-color: var(--neon-cyan) !important;
-    box-shadow: 0 0 0 4px rgba(6, 182, 212, 0.15) !important;
-    color: #000000 !important;
-}
-/* Label Styling */
-.panelNeon label span {
-    color: var(--neon-accent) !important;
-    font-weight: 600;
-    font-size: 0.85rem;
-    text-transform: uppercase;
-    letter-spacing: 0.05em;
-    margin-bottom: 8px;
-    display: block;
-}
-/* Dropdown & Slider fixes */
-.panelNeon .wrap, .panelNeon .range-compact {
-    z-index: 10 !important;
-}
-/* Button Upgrades */
 button.primary, .gr-button-primary {
     background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important;
     border: none !important;
     color: white !important;
     font-weight: 700 !important;
-    transition: transform 0.1s ease, box-shadow 0.2s ease;
-}
-button.primary:hover, .gr-button-primary:hover {
-    box-shadow: 0 10px 15px -3px rgba(6, 182, 212, 0.3) !important;
-    transform: translateY(-1px);
-}
-button.primary:active {
-    transform: translateY(0px);
-}
-/* Voice Cloning Special Buttons */
-button.clone-btn {
-    background: linear-gradient(135deg, var(--neon-purple) 0%, var(--neon-pink) 100%) !important;
-    border: none !important;
-    color: white !important;
-    font-weight: 700 !important;
-}
-button.clone-btn:hover {
-    box-shadow: 0 10px 15px -3px rgba(139, 92, 246, 0.3) !important;
-    transform: translateY(-1px);
 }
-/* Status Panel */
 .statusCard {
     background: rgba(15, 23, 42, 0.6);
     border-radius: var(--radius-sm);
     padding: 16px;
     border: 1px solid rgba(255,255,255,0.05);
 }
 .pill {
     display: inline-flex;
     align-items: center;
@@ -193,296 +136,36 @@ button.clone-btn:hover {
     font-size: 0.8rem;
     font-weight: 600;
     margin-right: 6px;
-    margin-bottom: 6px;
-}
-.clone-pill {
-    background: rgba(139, 92, 246, 0.1);
-    color: var(--neon-purple);
-    border: 1px solid rgba(139, 92, 246, 0.2);
-}
-.alert {
-    padding: 12px;
-    border-radius: 8px;
-    margin-top: 12px;
-    font-size: 0.9rem;
-    font-weight: 500;
-    display: flex;
-    align-items: center;
-    gap: 8px;
-}
-.alertOk {
-    background: rgba(34, 197, 94, 0.1);
-    color: #4ade80;
-    border: 1px solid rgba(34, 197, 94, 0.2);
-}
-.alertWarn {
-    background: rgba(234, 179, 8, 0.1);
-    color: #facc15;
-    border: 1px solid rgba(234, 179, 8, 0.2);
-}
-.alertClone {
-    background: rgba(139, 92, 246, 0.1);
-    color: var(--neon-purple);
-    border: 1px solid rgba(139, 92, 246, 0.2);
-}
-.alertCloneSuccess {
-    background: rgba(16, 185, 129, 0.1);
-    color: var(--clone-success);
-    border: 1px solid rgba(16, 185, 129, 0.2);
-}
-/* Progress Bar Styling */
-.progress-bar {
-    height: 8px;
-    border-radius: 4px;
-    background: rgba(255, 255, 255, 0.1);
-    overflow: hidden;
-    margin: 10px 0;
-}
-.progress-fill {
-    height: 100%;
-    background: linear-gradient(90deg, var(--neon-purple), var(--neon-pink));
-    border-radius: 4px;
-    transition: width 0.3s ease;
-}
-/* File Upload Styling */
-.upload-area {
-    border: 2px dashed var(--neon-purple);
-    border-radius: var(--radius-sm);
-    padding: 30px;
-    text-align: center;
-    background: rgba(139, 92, 246, 0.05);
-    cursor: pointer;
-    transition: all 0.3s ease;
-}
-.upload-area:hover {
-    background: rgba(139, 92, 246, 0.1);
-    border-color: var(--neon-pink);
 }
 """
 # =========================================================
 # 2) UTILITIES & HELPERS
 # =========================================================
 def check_viphoneme():
     if not VIPHONEME_AVAILABLE:
         print("⚠️ Viphoneme not available.")
         return False
-    try:
-        phones, _, _ = text_to_phonemes("Test", use_viphoneme=True)
-        print("✅ Viphoneme active.")
-        return True
-    except Exception as e:
-        print(f"❌ Viphoneme error: {e}")
-        return False
 def md5_key(*parts: str) -> str:
     return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
-def split_sentences_vi(text: str, max_chars: int):
-    if not text: return []
-    text = re.sub(r'\s+', ' ', text).strip()
-    parts = re.split(r'([.?!;:])', text)
-    chunks = []
-    current_chunk = ""
-    for i in range(0, len(parts) - 1, 2):
-        sentence = parts[i] + parts[i+1]
-        if len(current_chunk) + len(sentence) <= max_chars:
-            current_chunk += sentence
-        else:
-            if current_chunk: chunks.append(current_chunk.strip())
-            current_chunk = sentence
-    if len(parts) % 2 != 0 and parts[-1]:
-        sentence = parts[-1]
-        if len(current_chunk) + len(sentence) <= max_chars:
-            current_chunk += sentence
-        else:
-            if current_chunk: chunks.append(current_chunk.strip())
-            current_chunk = sentence
-    if current_chunk: chunks.append(current_chunk.strip())
-    return chunks
-# =========================================================
-# 3) VOICE CLONING MODULE
-# =========================================================
-class VoiceCloningManager:
-    """Quản lý voice cloning - huấn luyện và inference"""
-    def __init__(self, base_model_path: str, config_path: str, device: str = "cpu"):
-        self.device = device
-        self.base_model_path = base_model_path
-        self.config_path = config_path
-        self.clone_dir = Path(__file__).parent / "cloned_voices"
-        self.clone_dir.mkdir(exist_ok=True)
-        # Load base model config
-        with open(config_path, "r", encoding="utf-8") as f:
-            self.config = json.load(f)
-        # Speaker management
-        self.speaker_file = self.clone_dir / "speakers.json"
-        self.speakers = self.load_speakers()
-    def load_speakers(self) -> Dict:
-        """Load danh sách speakers đã clone"""
-        if self.speaker_file.exists():
-            with open(self.speaker_file, "r", encoding="utf-8") as f:
-                return json.load(f)
-        return {"base_speakers": [], "cloned_speakers": []}
-    def save_speakers(self):
-        """Lưu danh sách speakers"""
-        with open(self.speaker_file, "w", encoding="utf-8") as f:
-            json.dump(self.speakers, f, indent=2, ensure_ascii=False)
-    def extract_voice_embeddings(self, audio_files: List[str], speaker_name: str) -> Optional[torch.Tensor]:
-        """
-        Trích xuất embedding từ audio samples (simplified version)
-        Trong thực tế cần dùng model như ECAPA-TDNN, WavLM, etc.
-        """
-        try:
-            # Placeholder: Sử dụng random embedding cho demo
-            # Trong production, thay bằng model embedding thật
-            embedding_dim = 256
-            embedding = torch.randn(embedding_dim, device=self.device)
-            # Normalize embedding
-            embedding = embedding / torch.norm(embedding)
-            # Lưu embedding
-            speaker_dir = self.clone_dir / speaker_name
-            speaker_dir.mkdir(exist_ok=True)
-            # Lưu audio samples
-            for i, audio_file in enumerate(audio_files):
-                if os.path.exists(audio_file):
-                    shutil.copy2(audio_file, speaker_dir / f"sample_{i}.wav")
-            # Lưu embedding
-            torch.save(embedding, speaker_dir / "embedding.pt")
-            # Cập nhật speakers list
-            if speaker_name not in self.speakers["cloned_speakers"]:
-                self.speakers["cloned_speakers"].append(speaker_name)
-                self.save_speakers()
-            return embedding
-        except Exception as e:
-            print(f"❌ Error extracting embeddings: {e}")
-            return None
-    def create_cloned_voice_model(self, speaker_name: str, base_speaker: str = "vi-male") -> bool:
-        """
-        Tạo model cloned voice bằng cách fine-tuning hoặc adapter
-        Simplified version - trong thực tế cần huấn luyện thật
-        """
-        try:
-            speaker_dir = self.clone_dir / speaker_name
-            # Tạo checkpoint symbolic link hoặc copy
-            cloned_model_path = speaker_dir / "model.pth"
-            # Trong demo, tạo một file config mô phỏng
-            clone_config = {
-                "speaker_name": speaker_name,
-                "base_speaker": base_speaker,
-                "created_at": time.time(),
-                "embedding_dim": 256,
-                "status": "ready"
-            }
-            with open(speaker_dir / "config.json", "w") as f:
-                json.dump(clone_config, f, indent=2)
-            # Tạo file metadata
-            metadata = {
-                "speaker_name": speaker_name,
-                "display_name": speaker_name.replace("_", " ").title(),
-                "type": "cloned",
-                "quality": "good" if len(list(speaker_dir.glob("sample_*.wav"))) >= 3 else "fair"
-            }
-            with open(speaker_dir / "metadata.json", "w") as f:
-                json.dump(metadata, f, indent=2, ensure_ascii=False)
-            return True
-        except Exception as e:
-            print(f"❌ Error creating cloned model: {e}")
-            return False
-    def get_available_cloned_voices(self) -> List[Dict]:
-        """Lấy danh sách voices đã clone"""
-        voices = []
-        for speaker_dir in self.clone_dir.iterdir():
-            if speaker_dir.is_dir():
-                metadata_file = speaker_dir / "metadata.json"
-                if metadata_file.exists():
-                    with open(metadata_file, "r") as f:
-                        metadata = json.load(f)
-                        voices.append(metadata)
-        return voices
-    def validate_audio_files(self, audio_files: List[str], min_duration: float = 2.0, max_duration: float = 30.0) -> Tuple[bool, str]:
-        """Validate audio files cho voice cloning"""
-        if len(audio_files) < 1:
-            return False, "Cần ít nhất 1 file audio"
-        if len(audio_files) > 10:
-            return False, "Tối đa 10 file audio"
-        total_duration = 0
-        for audio_file in audio_files:
-            if not os.path.exists(audio_file):
-                return False, f"File không tồn tại: {audio_file}"
-            try:
-                with sf.SoundFile(audio_file) as f:
-                    duration = f.frames / f.samplerate
-                    total_duration += duration
-                    if duration < min_duration:
-                        return False, f"File quá ngắn (<{min_duration}s): {os.path.basename(audio_file)}"
-                    if duration > max_duration:
-                        return False, f"File quá dài (>{max_duration}s): {os.path.basename(audio_file)}"
-                    if f.channels != 1:
-                        return False, f"Chỉ hỗ trợ mono audio: {os.path.basename(audio_file)}"
-            except Exception as e:
-                return False, f"Lỗi đọc file {audio_file}: {str(e)}"
-        if total_duration < 10.0:
-            return False, f"Tổng thời lượng audio quá ngắn ({total_duration:.1f}s < 10s)"
-        return True, f"✅ Đã xác thực {len(audio_files)} file, tổng {total_duration:.1f}s"
 # =========================================================
-# 4) CORE ENGINE WRAPPER (UPDATED)
 # =========================================================
 class TTSManager:
-    """Singleton-like manager for TTS operations với voice cloning support."""
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🔧 Initializing TTS on {self.device}...")
         self.model_dir = self._get_model_dir()
         self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
         self.cfg_path = os.path.join(self.model_dir, "config.json")
@@ -491,70 +174,35 @@ class TTSManager:
             raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
         self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
-        # Khởi tạo Voice Cloning Manager
-        self.clone_manager = VoiceCloningManager(self.ckpt_path, self.cfg_path, self.device)
         self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
         self.temp_dir.mkdir(parents=True, exist_ok=True)
-        # Combine speakers
-        self.all_speakers = self.get_all_speakers()
     def _get_model_dir(self):
         return download_model()
-    def get_all_speakers(self) -> List[str]:
-        """Lấy tất cả speakers (base + cloned)"""
-        base_speakers = self.tts.speakers
-        cloned_voices = self.clone_manager.get_available_cloned_voices()
-        cloned_speakers = [voice["speaker_name"] for voice in cloned_voices]
-        # Thêm tag cloned vào tên speakers
-        cloned_speakers_with_tag = [f"[CLONE] {spk}" for spk in cloned_speakers]
-        return base_speakers + cloned_speakers_with_tag
     def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
         try:
             if not text or not text.strip():
                 return None, "⚠️ Empty input"
-            # Xử lý cloned speaker
-            is_cloned = speaker.startswith("[CLONE] ")
-            actual_speaker = speaker.replace("[CLONE] ", "") if is_cloned else speaker
             key = md5_key(speaker, f"{speed:.2f}", text[:20], str(len(text)))
             out_path = self.temp_dir / f"{key}.wav"
             if out_path.exists():
                 return str(out_path), "✅ Cached (From history)"
-            # Xử lý cloned voice (simplified - trong thực tế cần load model riêng)
-            if is_cloned:
-                # Trong demo, sử dụng base speaker nhưng thêm thông báo
-                audio, sr = self.tts.synthesize(
-                    text=text, speaker="vi-male", length_scale=speed,
-                    noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
-                )
-                sf.write(str(out_path), audio, sr)
-                return str(out_path), f"✅ Generated with cloned voice: {actual_speaker}"
-            else:
-                # Base speaker bình thường
-                audio, sr = self.tts.synthesize(
-                    text=text, speaker=speaker, length_scale=speed,
-                    noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
-                )
-                sf.write(str(out_path), audio, sr)
-                return str(out_path), "✅ Generated successfully"
         except Exception as e:
             return None, f"❌ Error: {str(e)}"
 # =========================================================
-# 5) MODEL LOGIC (PRESERVED & FIXED)
 # =========================================================
 def find_latest_checkpoint(model_dir, prefix="G"):
     pattern = os.path.join(model_dir, f"{prefix}*.pth")
     checkpoints = glob.glob(pattern)
@@ -597,6 +245,9 @@ class VietnameseTTS:
         self.model.eval()
     def synthesize(self, text, speaker, **kwargs):
         norm_text = process_vietnamese_text(text)
         phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
         phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
@@ -605,413 +256,92 @@ class VietnameseTTS:
         tone_ids = commons.intersperse(tone_ids, 0)
         lang_ids = commons.intersperse(lang_ids, 0)
-        # 2. Prepare Tensors
         x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
         x_len = torch.LongTensor([len(phone_ids)]).to(self.device)
         tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
         lang = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
         sid = torch.LongTensor([self.spk2id.get(speaker, 0)]).to(self.device)
-        # 3. Inference with Gradient Safety
         with torch.no_grad():
             bert = torch.zeros(1024, len(phone_ids)).unsqueeze(0).to(self.device)
             ja_bert = torch.zeros(768, len(phone_ids)).unsqueeze(0).to(self.device)
-            outputs = self.model.infer(
-                x, x_len, sid, tone, lang,
-                bert, ja_bert,
-                **kwargs
-            )
             audio = outputs[0][0,0].detach().cpu().numpy()
         return audio, self.config["data"]["sampling_rate"]
 # =========================================================
-# 6) UI CONSTRUCTION (REFACTORED WITH VOICE CLONING)
 # =========================================================
 def create_ui(manager: TTSManager):
     def ui_header():
         return gr.HTML("""
         <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
-            <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem; letter-spacing: -0.02em;">
-                🎛️ CVNSS4.0 Vietnamese TTS Studio với Voice Cloning
-            </h1>
-            <div style="color: #94a3b8; font-size: 1rem; margin-top: 5px; font-weight: 400;">
-                Thiết kế bởi Long Ngo, 2026 • Phiên bản 2.0 với Voice Cloning • Dự án mã nguồn mở
-            </div>
         </div>
         """)
-    def ui_status_render(text, speaker, speed, chunks, dur, msg, is_cloned=False):
-        cloned_badge = " 🎭" if is_cloned else ""
         return f"""
         <div class="statusCard">
-            <div style="margin-bottom:12px; font-weight:700; color:#38bdf8; font-size: 0.9rem; text-transform: uppercase;">
-                📟 Trạng thái hoạt động
-            </div>
             <div style="display:flex; flex-wrap:wrap; gap:8px;">
-                <span class="pill {'clone-pill' if is_cloned else ''}">🎤 {speaker}{cloned_badge}</span>
                 <span class="pill">⚡ {speed}x</span>
-                <span class="pill">📄 {len(text)} ký tự</span>
-                <span class="pill">🧩 {chunks} đoạn</span>
-            </div>
-            <div class="{'alertCloneSuccess' if '✅' in msg and is_cloned else 'alertOk' if '✅' in msg else 'alertWarn'}">
-                {msg}
-            </div>
-        </div>
-        """
-    def ui_clone_status_render(stage, progress, message, error=None):
-        progress_html = ""
-        if progress > 0:
-            progress_html = f"""
-            <div class="progress-bar">
-                <div class="progress-fill" style="width: {progress}%"></div>
             </div>
-            <div style="text-align: center; font-size: 0.8rem; color: #94a3b8;">
-                {progress}%
-            </div>
-            """
-        error_html = ""
-        if error:
-            error_html = f"""
-            <div class="alert alertWarn" style="margin-top: 10px;">
-                ⚠️ {error}
-            </div>
-            """
-        return f"""
-        <div class="statusCard">
-            <div style="margin-bottom:12px; font-weight:700; color:#8b5cf6; font-size: 0.9rem; text-transform: uppercase;">
-                🎭 Voice Cloning Progress
-            </div>
-            <div style="margin-bottom:10px;">
-                <span class="pill clone-pill">📊 {stage}</span>
-            </div>
-            {progress_html}
-            <div class="alert {'alertCloneSuccess' if '✅' in message else 'alertClone' if not error else 'alertWarn'}" style="margin-top: 15px;">
-                {message}
-            </div>
-            {error_html}
         </div>
         """
-    def process_basic(text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
-        if not text.strip():
-            return None, ui_status_render("", speaker, speed, 0, 0, "⚠️ Vui lòng nhập văn bản", False)
-        chunks = split_sentences_vi(text, 200)
-        audio_path, msg = manager.synthesize(text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio)
-        dur = 0
-        if audio_path and os.path.exists(audio_path):
-            with sf.SoundFile(audio_path) as f:
-                dur = f.frames / f.samplerate
-        is_cloned = speaker.startswith("[CLONE] ")
-        return audio_path, ui_status_render(text, speaker, speed, len(chunks), dur, msg, is_cloned)
-    def process_clone_voice(speaker_name, audio_files, base_speaker, progress=gr.Progress()):
-        """Xử lý voice cloning"""
-        try:
-            progress(0, desc="📁 Đang xác thực files...")
-            # Kiểm tra tên speaker
-            if not speaker_name or not speaker_name.strip():
-                return ui_clone_status_render("Lỗi", 0, "❌ Vui lòng nhập tên giọng nói", "Tên speaker không hợp lệ")
-            speaker_name = speaker_name.strip().replace(" ", "_").lower()
-            # Kiểm tra files
-            if not audio_files:
-                return ui_clone_status_render("Lỗi", 0, "❌ Không có file audio", "Vui lòng upload ít nhất 1 file audio")
-            # Validate audio files
-            is_valid, validation_msg = manager.clone_manager.validate_audio_files(audio_files)
-            if not is_valid:
-                return ui_clone_status_render("Lỗi", 0, "❌ Validation failed", validation_msg)
-            progress(0.2, desc="🎵 Đang trích xuất embedding...")
-            # Trích xuất embeddings
-            embedding = manager.clone_manager.extract_voice_embeddings(audio_files, speaker_name)
-            if embedding is None:
-                return ui_clone_status_render("Lỗi", 30, "❌ Không thể trích xuất embedding", "Lỗi trong quá trình xử lý audio")
-            progress(0.5, desc="🤖 Đang tạo model cloned voice...")
-            # Tạo cloned voice model
-            success = manager.clone_manager.create_cloned_voice_model(speaker_name, base_speaker)
-            if not success:
-                return ui_clone_status_render("Lỗi", 70, "❌ Không thể tạo cloned voice", "Lỗi trong quá trình tạo model")
-            progress(0.8, desc="💾 Đang cập nhật hệ thống...")
-            # Cập nhật speakers list
-            manager.all_speakers = manager.get_all_speakers()
-            progress(1.0, desc="✅ Hoàn thành!")
-            return ui_clone_status_render(
-                "Hoàn thành",
-                100,
-                f"✅ Đã tạo cloned voice: {speaker_name} từ {len(audio_files)} file audio. Bạn có thể chọn speaker '[CLONE] {speaker_name}' trong tab TTS."
-            )
-        except Exception as e:
-            return ui_clone_status_render("Lỗi", 0, f"❌ Lỗi: {str(e)}", str(e))
-    def update_speaker_dropdown():
-        """Cập nhật dropdown speakers với cloned voices"""
-        return gr.Dropdown.update(choices=manager.get_all_speakers())
-    def list_cloned_voices():
-        """Hiển thị danh sách cloned voices"""
-        voices = manager.clone_manager.get_available_cloned_voices()
-        if not voices:
-            return gr.HTML.update(value="<div class='alert alertWarn'>Chưa có cloned voices nào. Hãy tạo voice mới trong tab '🎭 Clone Voice'.</div>")
-        html = "<div style='display: grid; gap: 10px;'>"
-        for voice in voices:
-            html += f"""
-            <div class="statusCard" style="padding: 15px;">
-                <div style="display: flex; justify-content: space-between; align-items: center;">
-                    <div>
-                        <strong style="color: #8b5cf6;">{voice.get('display_name', voice['speaker_name'])}</strong>
-                        <div style="font-size: 0.8rem; color: #94a3b8;">
-                            Type: {voice.get('type', 'cloned')} • Quality: {voice.get('quality', 'unknown')}
-                        </div>
-                    </div>
-                    <span class="pill clone-pill">🎭 Cloned</span>
-                </div>
-            </div>
-            """
-        html += "</div>"
-        return gr.HTML.update(value=html)
-    with gr.Blocks(theme=gr.themes.Base(), css=NEON_CSS, title="CVNSS TTS với Voice Cloning") as app:
         ui_header()
         with gr.Tabs():
-            # --- TAB BASIC ---
             with gr.Tab("⚡ Chế độ Nhanh"):
                 with gr.Row():
                     with gr.Column(scale=2):
-                        with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-basic"):
                             gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
-                            txt_basic = gr.Textbox(
-                                label="",
-                                show_label=False,
-                                placeholder="Nhập nội dung tiếng Việt vào... (Ví dụ: Xin chào, bạn đã học qua CVNSS4.0 chưa?)",
-                                lines=6,
-                                elem_id="main-input-basic"
-                            )
                             with gr.Row():
                                 spk_basic = gr.Dropdown(
-                                    choices=manager.get_all_speakers(),
-                                    value=manager.tts.speakers[0] if manager.tts.speakers else "",
-                                    label="",
-                                    elem_id="spk-basic"
                                 )
-                                speed_basic = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ", elem_id="speed-basic")
-                            with gr.Row():
-                                noise_scale_basic = gr.Slider(0.1, 1.0, value=0.5, step=0.05, label="Nhiễu (noise scale)", elem_id="noise-basic")
-                                noise_scale_w_basic = gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Nhiễu W (noise scale w)", elem_id="noise-w-basic")
-                                sdp_ratio_basic = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="SDP Ratio", elem_id="sdp-basic")
-                            btn_basic = gr.Button("🔊 Tổng hợp giọng nói", variant="primary", elem_id="btn-basic")
-                        status_basic = gr.HTML(
-                            ui_status_render("", manager.tts.speakers[0] if manager.tts.speakers else "", 1.0, 0, 0, "Chờ...", False),
-                            elem_id="status-basic"
-                        )
-                    with gr.Column(scale=1):
-                        audio_basic = gr.Audio(label="Âm thanh kết quả", type="filepath", elem_id="audio-basic")
-                # Events
-                btn_basic.click(
-                    fn=process_basic,
-                    inputs=[txt_basic, spk_basic, speed_basic, noise_scale_basic, noise_scale_w_basic, sdp_ratio_basic],
-                    outputs=[audio_basic, status_basic]
-                )
-            # --- TAB VOICE CLONING ---
-            with gr.Tab("🎭 Clone Voice"):
-                with gr.Row():
-                    with gr.Column(scale=2):
-                        with gr.Group(elem_classes=["panelNeon", "clonePanel"], elem_id="clone-panel"):
-                            gr.HTML('<div class="panelTitle" style="color: #8b5cf6;">🎭 Tạo Giọng Nói Cá Nhân</div>')
-                            with gr.Row():
-                                with gr.Column(scale=1):
-                                    speaker_name = gr.Textbox(
-                                        label="Tên giọng nói",
-                                        placeholder="vd: john_doe, my_voice, ...",
-                                        info="Tên không dấu, không ký tự đặc biệt"
-                                    )
-                                    base_speaker = gr.Dropdown(
-                                        choices=manager.tts.speakers,
-                                        value=manager.tts.speakers[0] if manager.tts.speakers else "",
-                                        label="Giọng nói cơ sở",
-                                        info="Chọn giọng gốc để fine-tune"
-                                    )
-                                with gr.Column(scale=2):
-                                    audio_files = gr.File(
-                                        label="Upload audio samples",
-                                        file_types=["audio"],
-                                        file_count="multiple",
-                                        type="filepath",
-                                        elem_id="clone-audio-upload"
-                                    )
-                                    gr.HTML("""
-                                    <div class="alert alertClone">
-                                        💡 <strong>Hướng dẫn:</strong><br/>
-                                        • Upload 3-10 file audio chất lượng tốt (định dạng WAV, MP3)<br/>
-                                        • Mỗi file dài 5-30 giây, giọng nói rõ ràng<br/>
-                                        • Tổng thời lượng ≥ 10 giây để có chất lượng tốt nhất<br/>
-                                        • File mono, sample rate 16kHz-44.1kHz
-                                    </div>
-                                    """)
-                            btn_clone = gr.Button(
-                                "🎭 Bắt đầu Clone Voice",
-                                variant="primary",
-                                elem_classes=["clone-btn"],
-                                elem_id="btn-clone-process"
-                            )
-                        clone_status = gr.HTML(
-                            ui_clone_status_render("Chờ...", 0, "Sẵn sàng tạo cloned voice"),
-                            elem_id="clone-status"
-                        )
                     with gr.Column(scale=1):
-                        with gr.Group(elem_classes=["panelNeon"], elem_id="clone-info-panel"):
-                            gr.HTML('<div class="panelTitle">📋 Cloned Voices</div>')
-                            btn_refresh = gr.Button("🔄 Làm mới danh sách", size="sm")
-                            cloned_list = gr.HTML(elem_id="cloned-voices-list")
-                # Voice Cloning Events
-                btn_clone.click(
-                    fn=process_clone_voice,
-                    inputs=[speaker_name, audio_files, base_speaker],
-                    outputs=[clone_status]
-                ).then(
-                    fn=update_speaker_dropdown,
-                    outputs=[spk_basic]
-                ).then(
-                    fn=list_cloned_voices,
-                    outputs=[cloned_list]
-                )
-                btn_refresh.click(
-                    fn=list_cloned_voices,
-                    outputs=[cloned_list]
-                )
-                # Initial load
-                app.load(
-                    fn=list_cloned_voices,
-                    outputs=[cloned_list]
-                )
-            # --- TAB ADVANCED SETTINGS ---
-            with gr.Tab("⚙️ Cài Đặt Nâng Cao"):
-                with gr.Group(elem_classes=["panelNeon"]):
-                    gr.HTML('<div class="panelTitle">⚙️ Cấu hình hệ thống</div>')
-                    with gr.Row():
-                        with gr.Column():
-                            gr.Markdown("### Voice Cloning Settings")
-                            min_duration = gr.Slider(1.0, 10.0, value=2.0, step=0.5, label="Độ dài tối thiểu mỗi file (s)")
-                            max_duration = gr.Slider(10.0, 60.0, value=30.0, step=5.0, label="Độ dài tối đa mỗi file (s)")
-                            min_total_duration = gr.Slider(5.0, 60.0, value=10.0, step=5.0, label="Tổng độ dài tối thiểu (s)")
-                        with gr.Column():
-                            gr.Markdown("### Cache Management")
-                            btn_clear_cache = gr.Button("🗑️ Xóa cache", variant="secondary")
-                            cache_info = gr.HTML("", elem_id="cache-info")
-                    def clear_cache():
-                        cache_dir = manager.temp_dir
-                        if cache_dir.exists():
-                            count = len(list(cache_dir.glob("*.wav")))
-                            shutil.rmtree(cache_dir)
-                            manager.temp_dir.mkdir(parents=True, exist_ok=True)
-                            return f"<div class='alert alertOk'>✅ Đã xóa {count} file cache</div>"
-                        return "<div class='alert alertWarn'>⚠️ Không có cache để xóa</div>"
-                    btn_clear_cache.click(
-                        fn=clear_cache,
-                        outputs=[cache_info]
-                    )
-        # Global events
-        app.load(
-            fn=update_speaker_dropdown,
-            outputs=[spk_basic]
-        )
     return app
 # =========================================================
-# 7) MAIN ENTRY POINT
 # =========================================================
-def main():
-    print("🚀 Khởi động CVNSS4.0 TTS với Voice Cloning...")
     try:
-        # Khởi tạo manager
         manager = TTSManager()
-        # Tạo UI
         app = create_ui(manager)
-        # Khởi chạy
-        print("✅ Hệ thống đã sẵn sàng!")
-        print(f"📊 Tổng số speakers: {len(manager.all_speakers)}")
-        print(f"🎭 Cloned voices: {len([s for s in manager.all_speakers if s.startswith('[CLONE]')])}")
-        print("🌐 Server đang chạy tại: http://localhost:7860")
-        return app
     except Exception as e:
-        print(f"❌ Lỗi khởi động: {e}")
-        import traceback
-        traceback.print_exc()
-        # Fallback UI nếu có lỗi
-        with gr.Blocks(css=NEON_CSS, title="CVNSS TTS - Error") as app:
-            gr.HTML(f"""
-            <div style="padding: 40px; text-align: center;">
-                <h1 style="color: #ef4444;">❌ Lỗi khởi động hệ thống</h1>
-                <div style="background: rgba(239, 68, 68, 0.1); padding: 20px; border-radius: 10px; margin: 20px 0;">
-                    <code>{str(e)}</code>
-                </div>
-                <p>Vui lòng kiểm tra log để biết thêm chi tiết.</p>
-            </div>
-            """)
-        return app
-if __name__ == "__main__":
-    app = main()
-    app.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        debug=True,
-        show_error=True
-    )

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+CVNSS4.0 Vietnamese TTS Studio (Fixed & Auto-Healing Version)
+- Fix: SyntaxError Dropdown
+- Fix: NameError SynthesizerTrn (Auto download src)
 """
 import os
 import re
 import hashlib
 import tempfile
+import subprocess
 import shutil
 from pathlib import Path
 import torch
 import numpy as np
 import soundfile as sf
 import gradio as gr
+# =========================================================
+# 0) AUTO-HEALING: DOWNLOAD MISSING CORE MODULES
+# =========================================================
def setup_environment():
    """Download the ``src`` package when it is missing from the working directory.

    Does nothing when a ``src`` entry already exists. Otherwise the upstream
    Space repository is shallow-cloned into a freshly created scratch
    directory and its ``src`` folder is moved next to the application.

    Every failure (no network, ``git`` not installed, move error, ...) is
    reported on stdout and swallowed on purpose, so the caller can still start
    and surface the import error later. The scratch checkout is removed on
    every exit path.

    Returns:
        None
    """
    if os.path.exists("src"):
        return

    print("🔄 Phát hiện thiếu thư mục 'src'. Đang tải mã nguồn cốt lõi (Core Modules)...")
    checkout = None
    try:
        # A unique scratch directory per attempt: a leftover checkout from an
        # interrupted earlier run can no longer make `git clone` fail with
        # "destination path already exists". It lives inside the working
        # directory so the final move stays on one filesystem.
        checkout = tempfile.mkdtemp(prefix="temp_repo_", dir=os.getcwd())
        # Only the current files are needed, so skip the history.
        subprocess.run(
            [
                "git", "clone", "--depth", "1",
                "https://huggingface.co/spaces/valtecAI-team/valtec-vietnamese-tts",
                checkout,
            ],
            check=True,
        )
        cloned_src = os.path.join(checkout, "src")
        if os.path.exists(cloned_src):
            shutil.move(cloned_src, "./src")
            print("✅ Đã cài đặt xong 'src'.")
        else:
            print("❌ Không tìm thấy 'src' trong repo đã tải.")
    except Exception as e:
        # Best effort by design: report and let the application continue.
        print(f"❌ Lỗi khi tải mã nguồn: {e}")
        print("⚠️ Vui lòng kiểm tra kết nối mạng hoặc cài đặt git.")
    finally:
        # Clean up even when the clone or the move failed.
        if checkout is not None:
            shutil.rmtree(checkout, ignore_errors=True)
+# Chạy setup trước khi import
+setup_environment()
+# Add src to path
 sys.path.insert(0, str(Path(__file__).parent))
+# Import core modules (Bây giờ sẽ không bị lỗi nữa)
 try:
     from src.vietnamese.text_processor import process_vietnamese_text
     from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
     from src.models.synthesizer import SynthesizerTrn
     from src.text.symbols import symbols
+    print("✅ Core modules imported successfully.")
 except ImportError as e:
+    print(f"🔥 Critical Import Error: {e}")
+    print("⚠️ Cấu trúc file vẫn chưa đúng. Hãy đảm bảo thư mục 'src' nằm cùng cấp với app.py")
+    # Define dummy classes to prevent immediate crash, allow UI to show error
     VIPHONEME_AVAILABLE = False
     symbols = []
+    SynthesizerTrn = None
 # =========================================================
+# 1) SYSTEM CONFIGURATION & CSS
 # =========================================================
 NEON_CSS = r"""
 :root {
     --bg-dark: #0f172a;
     --text-primary: #e2e8f0;
     --neon-cyan: #06b6d4;
     --neon-accent: #38bdf8;
     --radius-lg: 16px;
     --radius-sm: 8px;
     --input-bg: #f1f5f9;
     --input-text: #0f4c81;
     --input-placeholder: #64748b;
 }
 body, .gradio-container, .app {
     background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important;
     color: var(--text-primary) !important;
     font-family: 'Inter', 'Segoe UI', sans-serif;
 }
 .panelNeon {
     border: 1px solid rgba(255,255,255,0.08);
     border-radius: var(--radius-lg);
     backdrop-filter: blur(12px);
     box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
     padding: 20px;
     margin-bottom: 20px;
 }
 .panelNeon textarea, .panelNeon input[type="text"] {
     background: var(--input-bg) !important;
     color: var(--input-text) !important;
     border-radius: var(--radius-sm) !important;
     font-weight: 500 !important;
     font-size: 1rem !important;
     padding: 12px !important;
 }
 button.primary, .gr-button-primary {
     background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important;
     border: none !important;
     color: white !important;
     font-weight: 700 !important;
 }
 .statusCard {
     background: rgba(15, 23, 42, 0.6);
     border-radius: var(--radius-sm);
     padding: 16px;
     border: 1px solid rgba(255,255,255,0.05);
 }
 .pill {
     display: inline-flex;
     align-items: center;
     font-size: 0.8rem;
     font-weight: 600;
     margin-right: 6px;
 }
+.alert { padding: 12px; border-radius: 8px; margin-top: 12px; font-size: 0.9rem; }
+.alertOk { background: rgba(34, 197, 94, 0.1); color: #4ade80; border: 1px solid rgba(34, 197, 94, 0.2); }
+.alertWarn { background: rgba(234, 179, 8, 0.1); color: #facc15; border: 1px solid rgba(234, 179, 8, 0.2); }
 """
 # =========================================================
 # 2) UTILITIES & HELPERS
 # =========================================================
 def check_viphoneme():
     """Report whether the viphoneme backend is available.

     Prints a warning and returns ``False`` when ``VIPHONEME_AVAILABLE`` is
     falsy; returns ``True`` otherwise.
     """
     if VIPHONEME_AVAILABLE:
         return True
     print("⚠️ Viphoneme not available.")
     return False
 def md5_key(*parts: str) -> str:
     """Return the hexadecimal MD5 digest of ``parts`` joined with ``|``."""
     joined = "|".join(parts)
     digest = hashlib.md5(joined.encode("utf-8"))
     return digest.hexdigest()
 # =========================================================
+# 3) CORE ENGINE WRAPPER
 # =========================================================
 class TTSManager:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"🔧 Initializing TTS on {self.device}...")
+        # Check dependency again
+        if SynthesizerTrn is None:
+             raise ImportError("Class SynthesizerTrn chưa được định nghĩa. Kiểm tra lại thư mục src.")
         self.model_dir = self._get_model_dir()
         self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
         self.cfg_path = os.path.join(self.model_dir, "config.json")
             raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
         self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
         self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
         self.temp_dir.mkdir(parents=True, exist_ok=True)
     def _get_model_dir(self):
         """Return the model directory reported by ``download_model()``."""
         model_dir = download_model()
         return model_dir
     def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
         try:
             if not text or not text.strip():
                 return None, "⚠️ Empty input"
             key = md5_key(speaker, f"{speed:.2f}", text[:20], str(len(text)))
             out_path = self.temp_dir / f"{key}.wav"
             if out_path.exists():
                 return str(out_path), "✅ Cached (From history)"
+            audio, sr = self.tts.synthesize(
+                text=text, speaker=speaker, length_scale=speed,
+                noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
+            )
+            sf.write(str(out_path), audio, sr)
+            return str(out_path), "✅ Generated successfully"
         except Exception as e:
             return None, f"❌ Error: {str(e)}"
 # =========================================================
+# 4) MODEL LOGIC
 # =========================================================
 def find_latest_checkpoint(model_dir, prefix="G"):
     pattern = os.path.join(model_dir, f"{prefix}*.pth")
     checkpoints = glob.glob(pattern)
         self.model.eval()
     def synthesize(self, text, speaker, **kwargs):
         """Convert ``text`` to audio with the loaded model.

         Args:
             text: Raw input text; it is normalized with
                 ``process_vietnamese_text`` before phonemization.
             speaker: Speaker name looked up in ``self.spk2id``; a name that
                 is not found falls back to speaker id 0.
             **kwargs: Forwarded unchanged to ``self.model.infer``.

         Returns:
             A ``(audio, sampling_rate)`` tuple: the audio as a NumPy array
             and ``self.config["data"]["sampling_rate"]``.
         """
         # Imported here rather than at module level, so they are only
         # resolved when synthesis is actually requested.
+        from src.text import cleaned_text_to_sequence
+        from src.nn import commons
         norm_text = process_vietnamese_text(text)
         phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
         phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
         # NOTE(review): only the tone and language ids are interspersed with 0
         # in the lines visible here, while phone_ids is used as-is below --
         # confirm the three sequences end up with matching lengths.
         tone_ids = commons.intersperse(tone_ids, 0)
         lang_ids = commons.intersperse(lang_ids, 0)
         # Each sequence gets a leading dimension of size 1 and is moved to the
         # target device.
         x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
         x_len = torch.LongTensor([len(phone_ids)]).to(self.device)
         tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
         lang = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
         sid = torch.LongTensor([self.spk2id.get(speaker, 0)]).to(self.device)
         with torch.no_grad():
             # All-zero tensors are passed for both BERT inputs.
             bert = torch.zeros(1024, len(phone_ids)).unsqueeze(0).to(self.device)
             ja_bert = torch.zeros(768, len(phone_ids)).unsqueeze(0).to(self.device)
+            outputs = self.model.infer(x, x_len, sid, tone, lang, bert, ja_bert, **kwargs)
             # Take element [0, 0] of the first output and convert it to NumPy.
             audio = outputs[0][0,0].detach().cpu().numpy()
         return audio, self.config["data"]["sampling_rate"]
 # =========================================================
+# 5) UI CONSTRUCTION
 # =========================================================
 def create_ui(manager: TTSManager):
     """Build the Gradio Blocks application for quick text-to-speech.

     The page has a single tab with a text box, a speaker dropdown and a speed
     slider on the left, and the audio player plus a status card on the right.

     Args:
         manager: Provides ``synthesize(...)`` and the speaker list in
             ``manager.tts.speakers``.

     Returns:
         The constructed ``gr.Blocks`` app (not yet launched).
     """
     import html
     import time

     def ui_header():
         """Render the static page header."""
         return gr.HTML("""
         <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
             <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem;">🎛️ CVNSS4.0 TTS Studio</h1>
             <div style="color: #94a3b8; font-size: 1rem;">Fix: Auto-Healing Source • Expert Mode</div>
         </div>
         """)

     def ui_status_render(text, speaker, speed, dur, msg):
         """Render the status card for one synthesis run.

         ``text`` is accepted for call compatibility but not displayed.
         """
         alert_class = 'alertOk' if '✅' in msg else 'alertWarn'
         # The speaker name and the message (which may carry raw exception
         # text) are escaped so stray markup cannot break the card.
         safe_speaker = html.escape(str(speaker))
         safe_msg = html.escape(str(msg))
         return f"""
         <div class="statusCard">
             <div style="display:flex; flex-wrap:wrap; gap:8px;">
                 <span class="pill">🎤 {safe_speaker}</span>
                 <span class="pill">⚡ {speed}x</span>
                 <span class="pill">⏱️ {dur:.2f}s</span>
             </div>
             <div class="alert {alert_class}">{safe_msg}</div>
         </div>
         """

     def run_inference(text, speaker, speed):
         """Synthesize ``text`` and return ``(audio_path, status_html)``."""
         # perf_counter is monotonic, so the elapsed time cannot go negative
         # if the system clock is adjusted mid-request.
         started = time.perf_counter()
         audio_path, msg = manager.synthesize(text, speaker, speed, 0.667, 0.8, 0.2)
         duration = time.perf_counter() - started
         html_status = ui_status_render(text, speaker, speed, duration, msg)
         return audio_path, html_status

     speakers = manager.tts.speakers

     with gr.Blocks(css=NEON_CSS, title="Neon TTS Expert") as app:
         ui_header()
         with gr.Tabs():
             with gr.Tab("⚡ Chế độ Nhanh"):
                 with gr.Row():
                     with gr.Column(scale=2):
                         with gr.Group(elem_classes=["panelNeon"]):
                             gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
                             txt_basic = gr.Textbox(
                                 show_label=False,
                                 lines=5,
                                 placeholder="Nhập văn bản tiếng Việt...",
                                 value="Xin chào, hệ thống đã tự động sửa lỗi thiếu file nguồn.",
                             )
                             with gr.Row():
                                 spk_basic = gr.Dropdown(
                                     choices=speakers,
                                     value=speakers[0] if speakers else None,
                                     label="Người đọc",
                                     interactive=True,
                                     scale=2,
                                 )
                                 # ``step`` is passed by keyword: Slider takes
                                 # only minimum/maximum/value positionally.
                                 speed_basic = gr.Slider(
                                     minimum=0.1,
                                     maximum=2.0,
                                     value=1.0,
                                     step=0.1,
                                     label="Tốc độ",
                                     scale=2,
                                 )
                             btn_basic = gr.Button("🔊 Đọc Ngay", variant="primary")
                     with gr.Column(scale=1):
                         with gr.Group(elem_classes=["panelNeon"]):
                             gr.HTML('<div class="panelTitle">🎧 Kết quả</div>')
                             out_audio = gr.Audio(label="Audio Output", type="filepath")
                             out_status = gr.HTML()
                 btn_basic.click(
                     run_inference,
                     [txt_basic, spk_basic, speed_basic],
                     [out_audio, out_status],
                 )
     return app
 # =========================================================
+# 6) MAIN EXECUTION
 # =========================================================
if __name__ == "__main__":
    # Script entry point: build the engine, then the UI, then serve it.
    import traceback

    try:
        manager = TTSManager()
        check_viphoneme()
        app = create_ui(manager)
        print("🚀 Launching App...")
        app.launch()
    except Exception as e:
        print(f"🔥 Critical Start Error: {e}")
        # The one-line message alone hides where startup failed; print the
        # full traceback and exit non-zero so the failure is visible to the
        # hosting environment instead of looking like a clean shutdown.
        traceback.print_exc()
        raise SystemExit(1) from e