CVNSS committed on
Commit
0bc2dfc
·
verified ·
1 Parent(s): 320f794

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +711 -256
app.py CHANGED
@@ -1,340 +1,795 @@
1
  #!/usr/bin/env python3
 
2
  # -*- coding: utf-8 -*-
3
 
 
 
4
  """
5
- 💎 CVNSS4.0 Vietnamese TTS Studio - Final Repair Edition
6
- - Fix: 'No module named imp' (Python 3.12+ Compatibility Patch)
7
- - Fix: Auto-download 'src' folder if missing
8
- - Fix: Auto-download Model checkpoints
9
- - Design: Azure Horizon (Ceramic White & Soft Blue)
10
- - Author: Refactored by 100-Year AI Expert
 
 
 
11
  """
12
 
 
 
13
  import os
 
14
  import sys
 
15
  import json
 
16
  import time
 
 
 
17
  import re
18
- import logging
 
 
19
  import tempfile
20
- import importlib
21
- import types
22
  from pathlib import Path
23
- from typing import Optional, List
24
 
25
- # --- 0. CRITICAL PATCH: FIX 'No module named imp' ---
26
- # Mã nguồn cũ dùng 'imp' (đã bị xóa ở Python 3.12). Ta tạo module giả để đánh lừa nó.
27
- try:
28
- import imp
29
- except ImportError:
30
- import types
31
- # Tạo module giả
32
- imp = types.ModuleType("imp")
33
- # Map các hàm quan trọng từ importlib sang imp
34
- imp.new_module = types.ModuleType
35
- imp.reload = importlib.reload
36
- sys.modules["imp"] = imp
37
- print("🔧 Đã vá lỗi 'imp' module cho Python 3.12+")
38
 
39
  import torch
 
40
  import numpy as np
 
41
  import soundfile as sf
 
42
  import gradio as gr
43
- from huggingface_hub import hf_hub_download, snapshot_download
44
 
45
- # --- 1. SETUP LOGGING & PATH ---
46
- logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
47
- logger = logging.getLogger("CVNSS_Studio")
48
 
49
- ROOT_DIR = Path(__file__).resolve().parent
50
- if str(ROOT_DIR) not in sys.path:
51
- sys.path.insert(0, str(ROOT_DIR))
52
 
53
- # --- 2. AUTO-HEALING: TẢI SOURCE CODE NẾU THIẾU ---
54
- def ensure_source_code():
55
- src_path = ROOT_DIR / "src"
56
- if not src_path.exists():
57
- logger.warning("⚠️ Không thấy thư mục 'src'. Đang tự động tải từ kho gốc Valtec...")
58
- try:
59
- # Tải folder src từ repo gốc về thư mục hiện tại
60
- snapshot_download(
61
- repo_id="valtecAI-team/valtec-vietnamese-tts",
62
- repo_type="space", # Repo gốc là Space
63
- allow_patterns=["src/*", "src/**/*"],
64
- local_dir=str(ROOT_DIR),
65
- token=None # Public repo không cần token
66
- )
67
- logger.info("✅ Đã tải xong mã nguồn 'src'.")
68
- except Exception as e:
69
- logger.error(f"❌ Không thể tải mã nguồn: {e}")
70
- raise RuntimeError("Không thể tải 'src'. Vui lòng kiểm tra kết nối mạng.")
71
 
72
- # Chạy hàm kiểm tra ngay lập tức
73
- try:
74
- ensure_source_code()
75
- except Exception:
76
- pass # Sẽ xử lý lỗi ở phần import bên dưới
77
 
78
- # --- 3. IMPORT CORE MODULES ---
79
  try:
 
80
  from src.vietnamese.text_processor import process_vietnamese_text
 
81
  from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
 
82
  from src.models.synthesizer import SynthesizerTrn
 
83
  from src.text.symbols import symbols
84
- from src.text import cleaned_text_to_sequence
85
- from src.nn import commons
86
- CORE_LOADED = True
87
- IMPORT_ERROR_MSG = ""
88
- except ImportError as e:
89
- logger.error(f" Lỗi Import Core: {e}")
90
- CORE_LOADED = False
91
- IMPORT_ERROR_MSG = str(e)
92
  VIPHONEME_AVAILABLE = False
 
93
  symbols = []
94
 
 
 
 
 
95
  # =========================================================
96
- # 4. ELEGANT CSS (AZURE HORIZON)
 
 
97
  # =========================================================
98
- ELEGANT_CSS = r"""
99
- @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;600;800&family=Roboto+Mono:wght@400;500&display=swap');
 
 
 
 
100
 
101
  :root {
102
- --primary-blue: #3b82f6;
103
- --text-dark: #1e293b;
104
- --surface-white: #ffffff;
105
- --bg-gradient: linear-gradient(135deg, #f8fafc 0%, #e0f2fe 100%);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  }
107
 
108
- body, .gradio-container {
109
- background: var(--bg-gradient) !important;
110
- font-family: 'Manrope', sans-serif !important;
111
- color: var(--text-dark) !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  }
113
 
114
- .elegant-card {
115
- background: var(--surface-white);
116
- border-radius: 20px;
117
- border: 1px solid rgba(255, 255, 255, 0.8);
118
- box-shadow: 0 10px 30px -10px rgba(59, 130, 246, 0.15);
119
- padding: 24px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  }
121
 
122
- .header-title {
123
- font-weight: 800; font-size: 2rem; color: #0f172a;
124
- letter-spacing: -0.03em;
 
 
 
125
  }
126
 
127
- button.primary-btn {
128
- background: var(--primary-blue) !important;
129
- color: white !important;
130
- border-radius: 12px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  border: none !important;
132
- font-weight: 600 !important;
133
- padding: 10px 20px;
134
- transition: 0.2s !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  }
136
- button.primary-btn:hover { transform: translateY(-2px); box-shadow: 0 10px 20px -5px rgba(59, 130, 246, 0.4); }
137
 
138
- .badge { display: inline-flex; align-items: center; padding: 4px 12px; border-radius: 99px; font-size: 0.8rem; font-weight: 600; margin-right: 5px;}
139
- .badge-success { background: #dcfce7; color: #15803d; }
140
- .badge-error { background: #fee2e2; color: #b91c1c; }
 
 
 
141
  """
142
 
 
 
143
  # =========================================================
144
- # 5. UTILITIES
 
 
145
  # =========================================================
146
- def split_text_smart(text: str, max_chars: int = 300) -> List[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  if not text: return []
 
148
  text = re.sub(r'\s+', ' ', text).strip()
149
- raw = re.split(r'([.?!;:])', text)
150
- sentences = []
151
- current = ""
152
- for part in raw:
153
- if part in ".?!;:":
154
- current += part
155
- sentences.append(current.strip())
156
- current = ""
157
- else:
158
- current += part
159
- if current: sentences.append(current.strip())
160
 
 
161
  chunks = []
162
- chunk = ""
163
- for sent in sentences:
164
- if len(chunk) + len(sent) < max_chars:
165
- chunk += " " + sent if chunk else sent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  else:
167
- if chunk: chunks.append(chunk)
168
- chunk = sent
169
- if chunk: chunks.append(chunk)
170
- return chunks if chunks else [text]
 
 
 
 
 
 
 
 
171
 
172
  # =========================================================
173
- # 6. ENGINE CORE
 
 
174
  # =========================================================
 
175
  class TTSManager:
 
 
 
176
  def __init__(self):
177
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
178
- self.net_g = None
179
- self.hps = None
180
- self.ready = False
181
- self.status_msg = "Đang khởi tạo..."
182
- self._initialize_model()
183
-
184
- def _initialize_model(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  try:
186
- # Tự động tải model về thư mục riêng
187
- model_dir = ROOT_DIR / "model_cache"
188
- model_dir.mkdir(exist_ok=True)
189
-
190
- # Repo chứa model
191
- repo_id = "valtecAI-team/valtec-vietnamese-tts"
192
-
193
- logger.info("⬇️ Đang tải Config & Model...")
194
- cfg_path = hf_hub_download(repo_id=repo_id, filename="config.json", local_dir=model_dir)
195
-
196
- # Tìm file G_*.pth mới nhất hoặc mặc định
197
- try:
198
- ckpt_path = hf_hub_download(repo_id=repo_id, filename="G_100000.pth", local_dir=model_dir)
199
- except:
200
- # Fallback nếu tên file khác
201
- ckpt_path = hf_hub_download(repo_id=repo_id, filename="G_0.pth", local_dir=model_dir)
202
-
203
- with open(cfg_path, "r", encoding="utf-8") as f:
204
- self.hps = json.load(f)
205
 
206
- self.spk2id = self.hps["data"]["spk2id"]
207
- self.speakers = sorted(list(self.spk2id.keys()))
208
-
209
- if CORE_LOADED:
210
- self.net_g = SynthesizerTrn(
211
- len(symbols),
212
- self.hps["data"]["filter_length"] // 2 + 1,
213
- self.hps["train"]["segment_size"] // self.hps["data"]["hop_length"],
214
- n_speakers=self.hps["data"]["n_speakers"],
215
- **self.hps["model"]
216
- ).to(self.device)
217
-
218
- ckpt = torch.load(ckpt_path, map_location=self.device)
219
- self.net_g.load_state_dict(ckpt['model'])
220
- self.net_g.eval()
221
- self.ready = True
222
- self.status_msg = f"✅ Sẵn sàng ({self.device})"
223
- logger.info("🚀 Engine đã khởi động thành công!")
224
- else:
225
- self.status_msg = "❌ Lỗi: Không load được mã nguồn (src)"
226
 
227
  except Exception as e:
228
- self.ready = False
229
- self.status_msg = f"❌ Lỗi Model: {str(e)}"
230
- logger.error(self.status_msg)
231
 
232
- def infer(self, text, spk, speed, ns, nsw, sdp):
233
- if not self.ready: raise RuntimeError(self.status_msg)
234
- if self.device.type == 'cuda': torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
- text_norm = process_vietnamese_text(text)
237
- phones, tones, _ = text_to_phonemes(text_norm, use_viphoneme=VIPHONEME_AVAILABLE)
238
  phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  with torch.no_grad():
241
- x = torch.LongTensor(commons.intersperse(phone_ids, 0)).unsqueeze(0).to(self.device)
242
- x_len = torch.LongTensor([x.size(1)]).to(self.device)
243
- tone = torch.LongTensor(commons.intersperse(tone_ids, 0)).unsqueeze(0).to(self.device)
244
- lang = torch.LongTensor(commons.intersperse(lang_ids, 0)).unsqueeze(0).to(self.device)
245
- sid = torch.LongTensor([self.spk2id.get(spk, 0)]).to(self.device)
246
 
247
- outputs = self.net_g.infer(x, x_len, sid, tone, lang, noise_scale=ns, noise_scale_w=nsw, length_scale=speed, sdp_ratio=sdp)
248
- return outputs[0][0, 0].data.cpu().float().numpy(), self.hps["data"]["sampling_rate"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  # =========================================================
251
- # 7. UI CONSTRUCTION
 
 
252
  # =========================================================
253
- def build_interface():
254
- manager = TTSManager()
255
-
256
- def run_inference(text, spk, speed, ns, nsw, sdp, is_long, chunk_size, pause, progress=gr.Progress()):
257
- if not manager.ready:
258
- # Thử load lại nếu trước đó thất bại
259
- if not CORE_LOADED:
260
- return None, f"<span class='badge badge-error'>Lỗi mã nguồn: {IMPORT_ERROR_MSG}</span>"
261
- return None, f"<span class='badge badge-error'>{manager.status_msg}</span>"
262
-
263
- if not text: return None, "⚠️ Chưa nhập nội dung"
264
 
265
- try:
266
- full_audio = None
267
- sr = 0
268
- if not is_long:
269
- full_audio, sr = manager.infer(text, spk, speed, ns, nsw, sdp)
270
- else:
271
- chunks = split_text_smart(text, chunk_size)
272
- segments = []
273
- sr = 22050
274
- for i, chunk in enumerate(chunks):
275
- progress((i)/len(chunks), desc=f"Đoạn {i+1}/{len(chunks)}")
276
- a, r = manager.infer(chunk, spk, speed, ns, nsw, sdp)
277
- sr = r
278
- segments.append(a)
279
- if pause > 0: segments.append(np.zeros(int(sr * pause / 1000)))
280
- if segments: full_audio = np.concatenate(segments)
281
-
282
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
283
- sf.write(fp.name, full_audio, sr)
284
- dur = len(full_audio)/sr
285
- return fp.name, f"<span class='badge badge-success'>Hoàn thành: {dur:.1f}s</span>"
286
- except Exception as e:
287
- return None, f"<span class='badge badge-error'>Lỗi: {str(e)}</span>"
288
 
289
- speaker_list = manager.speakers if manager.ready else ["Đang tải..."]
290
 
291
- with gr.Blocks(theme=gr.themes.Soft(), css=ELEGANT_CSS, title="CVNSS4.0 Studio") as app:
292
- gr.HTML(f"""
293
- <div style="margin-bottom: 20px;">
294
- <div class="header-title">CVNSS4.0 Studio</div>
295
- <div style="color: #64748b;">Long Ngo • Trần Tư Bình • Valtec TTS Core</div>
296
- <div style="margin-top:5px">Trạng thái: <b>{manager.status_msg}</b></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  </div>
 
298
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- if not CORE_LOADED:
301
- gr.HTML(f"""<div style="background:#fee2e2; color:#b91c1c; padding:10px; border-radius:8px;">
302
- <b>LỖI NGHIÊM TRỌNG:</b> Không tải được mã nguồn 'src'.<br>
303
- Hệ thống đã cố gắng tự tải nhưng thất bại. Chi tiết: {IMPORT_ERROR_MSG}
304
- </div>""")
305
 
306
  with gr.Tabs():
 
 
 
307
  with gr.Tab("⚡ Chế độ Nhanh"):
 
308
  with gr.Row():
309
- with gr.Column(scale=3, elem_classes="elegant-card"):
310
- txt_in = gr.Textbox(label="Văn bản", placeholder="Nhập văn bản tiếng Việt...", lines=3)
311
- with gr.Row():
312
- spk_drp = gr.Dropdown(speaker_list, value=speaker_list[0] if speaker_list else None, label="Giọng đọc")
313
- spd_sld = gr.Slider(0.5, 2.0, 1.0, label="Tốc độ")
314
- btn_run = gr.Button("🔊 Đọc Ngay", elem_classes="primary-btn")
315
- with gr.Column(scale=2, elem_classes="elegant-card"):
316
- out_aud = gr.Audio(label="Kết quả", type="filepath")
317
- out_html = gr.HTML()
318
- btn_run.click(lambda t,s,sp: run_inference(t,s,sp,0.667,0.8,0.2,False,0,0), [txt_in,spk_drp,spd_sld], [out_aud,out_html])
319
-
320
- with gr.Tab("💎 Chế độ Dài"):
321
- with gr.Row():
322
- with gr.Column(scale=3, elem_classes="elegant-card"):
323
- txt_long = gr.Textbox(label="Văn bản dài", lines=6)
324
- with gr.Accordion("Cấu hình", open=False):
325
- ns = gr.Slider(0.1, 1.5, 0.667, label="Noise Scale")
326
- nsw = gr.Slider(0.1, 1.5, 0.8, label="Noise Width")
327
- sdp = gr.Slider(0, 1, 0.2, label="SDP")
328
- chunk = gr.Slider(100, 1000, 300, label="Ngắt câu")
329
- pause = gr.Slider(0, 1000, 250, label="Nghỉ (ms)")
330
- btn_long = gr.Button("🚀 Xử lý", elem_classes="primary-btn")
331
- with gr.Column(scale=2, elem_classes="elegant-card"):
332
- out_long = gr.Audio(label="Audio", type="filepath")
333
- out_html_long = gr.HTML()
334
- btn_long.click(lambda t,s,sp,n,nw,sd,c,p: run_inference(t,s,sp,n,nw,sd,True,c,p), [txt_long,spk_drp,spd_sld,ns,nsw,sdp,chunk,pause], [out_long,out_html_long])
335
-
336
- return app
337
-
338
- if __name__ == "__main__":
339
- ui = build_interface()
340
- ui.queue().launch(server_name="0.0.0.0", show_error=True)
 
 
 
1
  #!/usr/bin/env python3
2
+
3
  # -*- coding: utf-8 -*-
4
 
5
+
6
+
7
  """
8
+
9
+ CVNSS4.0 Vietnamese TTS Studio
10
+
11
+ - Architecture: Modular CSS & Component Separation
12
+
13
+ - UX: High Contrast Input Fields
14
+
15
+ - Core: Optimized Logic Flow
16
+
17
  """
18
 
19
+
20
+
21
  import os
22
+
23
  import sys
24
+
25
  import json
26
+
27
  import time
28
+
29
+ import glob
30
+
31
  import re
32
+
33
+ import hashlib
34
+
35
  import tempfile
36
+
 
37
  from pathlib import Path
 
38
 
39
+
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  import torch
42
+
43
  import numpy as np
44
+
45
  import soundfile as sf
46
+
47
  import gradio as gr
 
48
 
 
 
 
49
 
 
 
 
50
 
51
+ # Add src to path for imports
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ sys.path.insert(0, str(Path(__file__).parent))
54
+
55
+
56
+
57
+ # Import core modules
58
 
 
59
  try:
60
+
61
  from src.vietnamese.text_processor import process_vietnamese_text
62
+
63
  from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
64
+
65
  from src.models.synthesizer import SynthesizerTrn
66
+
67
  from src.text.symbols import symbols
68
+
69
+ except ImportError:
70
+
71
+ # Fallback for environment setup if src is missing during init
72
+
73
+ print("⚠️ Core modules not found. Ensure 'src' directory exists.")
74
+
 
75
  VIPHONEME_AVAILABLE = False
76
+
77
  symbols = []
78
 
79
+
80
+
81
+
82
+
83
  # =========================================================
84
+
85
+ # 1) SYSTEM CONFIGURATION & CSS (The Expert Layer)
86
+
87
  # =========================================================
88
+
89
+
90
+
91
+ # Expert CSS: Definitive Z-Index Management & Neon Theme
92
+
93
+ NEON_CSS = r"""
94
 
95
  :root {
96
+
97
+ --bg-dark: #0f172a;
98
+
99
+ --bg-panel: rgba(30, 41, 59, 0.7);
100
+
101
+ --line: rgba(148, 163, 184, 0.1);
102
+
103
+ --text-primary: #e2e8f0;
104
+
105
+ --neon-cyan: #06b6d4;
106
+
107
+ --neon-accent: #38bdf8;
108
+
109
+ --radius-lg: 16px;
110
+
111
+ --radius-sm: 8px;
112
+
113
+
114
+
115
+ /* UX Color Palette for Inputs */
116
+
117
+ --input-bg: #f1f5f9; /* Light Blue-Grey for readability */
118
+
119
+ --input-text: #0f4c81; /* Classic Blue (Dark Blue) for high contrast */
120
+
121
+ --input-placeholder: #64748b;
122
+
123
+ }
124
+
125
+
126
+
127
+ body, .gradio-container, .app {
128
+
129
+ background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important;
130
+
131
+ color: var(--text-primary) !important;
132
+
133
+ font-family: 'Inter', 'Segoe UI', sans-serif;
134
+
135
  }
136
 
137
+
138
+
139
+ /* --- ISOLATION FULL: CVNSS4.0 Vietnamese TTS Studio --- */
140
+
141
+ .panelNeon {
142
+
143
+ border: 1px solid rgba(255,255,255,0.08);
144
+
145
+ border-radius: var(--radius-lg);
146
+
147
+ background: var(--bg-panel);
148
+
149
+ backdrop-filter: blur(12px);
150
+
151
+ box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
152
+
153
+ padding: 20px;
154
+
155
+ position: relative;
156
+
157
+ isolation: isolate;
158
+
159
+ z-index: 1;
160
+
161
+ margin-bottom: 20px;
162
+
163
  }
164
 
165
+
166
+
167
+ /* UX IMPROVEMENT: High Contrast Input Styling */
168
+
169
+ .panelNeon textarea, .panelNeon input[type="text"] {
170
+
171
+ background: var(--input-bg) !important;
172
+
173
+ color: var(--input-text) !important; /* DARK BLUE TEXT requested */
174
+
175
+ border: 2px solid transparent !important;
176
+
177
+ border-radius: var(--radius-sm) !important;
178
+
179
+ font-weight: 500 !important;
180
+
181
+ font-size: 1rem !important;
182
+
183
+ line-height: 1.5 !important;
184
+
185
+ padding: 12px !important;
186
+
187
+ transition: all 0.2s ease;
188
+
189
+ z-index: 10 !important;
190
+
191
+ position: relative !important;
192
+
193
  }
194
 
195
+
196
+
197
+ .panelNeon textarea::placeholder {
198
+
199
+ color: var(--input-placeholder) !important;
200
+
201
  }
202
 
203
+
204
+
205
+ .panelNeon textarea:focus, .panelNeon input:focus {
206
+
207
+ background: #ffffff !important;
208
+
209
+ border-color: var(--neon-cyan) !important;
210
+
211
+ box-shadow: 0 0 0 4px rgba(6, 182, 212, 0.15) !important;
212
+
213
+ color: #000000 !important; /* Even darker on focus */
214
+
215
+ }
216
+
217
+
218
+
219
+ /* Label Styling */
220
+
221
+ .panelNeon label span {
222
+
223
+ color: var(--neon-accent) !important;
224
+
225
+ font-weight: 600;
226
+
227
+ font-size: 0.85rem;
228
+
229
+ text-transform: uppercase;
230
+
231
+ letter-spacing: 0.05em;
232
+
233
+ margin-bottom: 8px;
234
+
235
+ display: block;
236
+
237
+ }
238
+
239
+
240
+
241
+ /* Dropdown & Slider fixes */
242
+
243
+ .panelNeon .wrap, .panelNeon .range-compact {
244
+
245
+ z-index: 10 !important;
246
+
247
+ }
248
+
249
+
250
+
251
+ /* Button Upgrades */
252
+
253
+ button.primary, .gr-button-primary {
254
+
255
+ background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important;
256
+
257
  border: none !important;
258
+
259
+ color: white !important;
260
+
261
+ font-weight: 700 !important;
262
+
263
+ transition: transform 0.1s ease, box-shadow 0.2s ease;
264
+
265
+ }
266
+
267
+ button.primary:hover, .gr-button-primary:hover {
268
+
269
+ box-shadow: 0 10px 15px -3px rgba(6, 182, 212, 0.3) !important;
270
+
271
+ transform: translateY(-1px);
272
+
273
+ }
274
+
275
+ button.primary:active {
276
+
277
+ transform: translateY(0px);
278
+
279
+ }
280
+
281
+
282
+
283
+ /* Status Panel */
284
+
285
+ .statusCard {
286
+
287
+ background: rgba(15, 23, 42, 0.6);
288
+
289
+ border-radius: var(--radius-sm);
290
+
291
+ padding: 16px;
292
+
293
+ border: 1px solid rgba(255,255,255,0.05);
294
+
295
+ }
296
+
297
+ .pill {
298
+
299
+ display: inline-flex;
300
+
301
+ align-items: center;
302
+
303
+ padding: 4px 12px;
304
+
305
+ border-radius: 99px;
306
+
307
+ background: rgba(56, 189, 248, 0.1);
308
+
309
+ color: #38bdf8;
310
+
311
+ border: 1px solid rgba(56, 189, 248, 0.2);
312
+
313
+ font-size: 0.8rem;
314
+
315
+ font-weight: 600;
316
+
317
+ margin-right: 6px;
318
+
319
+ margin-bottom: 6px;
320
+
321
  }
 
322
 
323
+ .alert { padding: 12px; border-radius: 8px; margin-top: 12px; font-size: 0.9rem; font-weight: 500; display: flex; align-items: center; gap: 8px;}
324
+
325
+ .alertOk { background: rgba(34, 197, 94, 0.1); color: #4ade80; border: 1px solid rgba(34, 197, 94, 0.2); }
326
+
327
+ .alertWarn { background: rgba(234, 179, 8, 0.1); color: #facc15; border: 1px solid rgba(234, 179, 8, 0.2); }
328
+
329
  """
330
 
331
+
332
+
333
  # =========================================================
334
+
335
+ # 2) UTILITIES & HELPERS
336
+
337
  # =========================================================
338
+
339
def check_viphoneme():
    """Probe the viphoneme backend and report whether it is usable.

    Prints a one-line status message in every case.

    Returns:
        bool: ``False`` when ``VIPHONEME_AVAILABLE`` is falsy or the probe
        call to ``text_to_phonemes`` raises; ``True`` when the probe succeeds.
    """
    if VIPHONEME_AVAILABLE:
        try:
            # Smoke test: the values are discarded, but the 3-way unpack is
            # kept so an unexpected return shape also counts as a failure.
            _phones, _tones, _rest = text_to_phonemes("Test", use_viphoneme=True)
            print("✅ Viphoneme active.")
            usable = True
        except Exception as e:
            print(f"❌ Viphoneme error: {e}")
            usable = False
        return usable

    print("⚠️ Viphoneme not available.")
    return False
360
+
361
+
362
+
363
def md5_key(*parts: str) -> str:
    """Build a hex MD5 digest that identifies a combination of values.

    The parts are joined with ``|`` and hashed as UTF-8. The digest is used
    as a cache/file-name key, not for any security purpose.

    Args:
        *parts: Values that make up the key. Non-string values (numbers,
            ``None``) are converted with ``str()`` instead of raising
            ``TypeError`` from ``str.join``; string parts hash exactly as
            before.

    Returns:
        The 32-character hexadecimal digest.
    """
    joined = "|".join(str(part) for part in parts)
    return hashlib.md5(joined.encode("utf-8")).hexdigest()
366
+
367
+
368
+
369
def split_sentences_vi(text: str, max_chars: int):
    """Split text into chunks of at most ``max_chars`` characters.

    Whitespace runs are collapsed to single spaces, the text is cut after
    each of ``. ? ! ; :`` (the delimiter stays attached to its sentence),
    and consecutive sentences are packed greedily into chunks.

    A single sentence that is longer than ``max_chars`` is wrapped at
    whitespace (over-long words are broken) so that no chunk exceeds the
    limit; previously such a sentence was emitted as one oversize chunk.

    Args:
        text: Input text; empty or whitespace-only input yields ``[]``.
        max_chars: Maximum chunk length in characters. For values below 1
            no wrapping is attempted and every sentence becomes its own
            chunk.

    Returns:
        list[str]: Stripped, non-empty chunks in reading order.
    """
    import textwrap  # local import: textwrap is not imported at file level

    if not text:
        return []
    text = re.sub(r'\s+', ' ', text).strip()

    # The capture group keeps delimiters, giving
    # [body, delim, body, delim, ..., tail]; the tail has no delimiter.
    parts = re.split(r'([.?!;:])', text)
    bodies = parts[0::2]
    delimiters = parts[1::2] + [""]
    sentences = [
        body + mark for body, mark in zip(bodies, delimiters) if body or mark
    ]

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        stripped = sentence.strip()
        if 0 < max_chars < len(stripped):
            # The sentence alone cannot fit: close the running chunk, emit
            # all wrapped pieces but the last, and keep the last piece open
            # so following short sentences can still be appended to it.
            if current_chunk:
                chunks.append(current_chunk.strip())
            pieces = textwrap.wrap(
                stripped, width=max_chars, break_on_hyphens=False
            )
            chunks.extend(pieces[:-1])
            current_chunk = pieces[-1]
        elif len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
424
+
425
+
426
 
427
  # =========================================================
428
+
429
+ # 3) CORE ENGINE WRAPPER
430
+
431
  # =========================================================
432
+
433
class TTSManager:
    """Singleton-like manager for TTS operations.

    Resolves the model directory, picks the newest generator checkpoint,
    builds the ``VietnameseTTS`` engine and keeps an on-disk cache of
    rendered WAV files under the system temp directory.
    """

    def __init__(self):
        """Locate the model files and load the engine.

        Raises:
            FileNotFoundError: If no generator checkpoint is found in the
                model directory.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🔧 Initializing TTS on {self.device}...")

        self.model_dir = self._get_model_dir()
        self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
        self.cfg_path = os.path.join(self.model_dir, "config.json")

        if not self.ckpt_path:
            raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")

        self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
        self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def _get_model_dir(self):
        """Return the local model directory as provided by ``download_model``."""
        return download_model()

    def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
        """Render ``text`` to a WAV file, reusing a cached file when possible.

        Args:
            text: Text to speak; empty/whitespace-only input is rejected.
            speaker: Speaker name forwarded to the engine.
            speed: Forwarded to the engine as ``length_scale``.
            noise_scale: Forwarded to the engine unchanged.
            noise_scale_w: Forwarded to the engine unchanged.
            sdp_ratio: Forwarded to the engine unchanged.

        Returns:
            tuple: ``(path, message)`` where ``path`` is the WAV file path as
            a string, or ``None`` on empty input or failure. Errors are never
            raised to the caller; they are reported in ``message``.
        """
        try:
            if not text or not text.strip():
                return None, "⚠️ Empty input"

            # The key must cover the complete request. The previous key used
            # only the first 20 characters plus the length and ignored the
            # noise/SDP settings, so different requests could share a file.
            key = md5_key(
                str(speaker),
                f"{speed:.4f}",
                f"{noise_scale:.4f}",
                f"{noise_scale_w:.4f}",
                f"{sdp_ratio:.4f}",
                text,
            )
            out_path = self.temp_dir / f"{key}.wav"

            if out_path.exists():
                return str(out_path), "✅ Cached (From history)"

            audio, sr = self.tts.synthesize(
                text=text, speaker=speaker, length_scale=speed,
                noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
            )

            # Write under a temporary name and move it into place so that an
            # interrupted write can never be served later as a cache hit.
            # The name keeps the .wav suffix so the audio format is inferred
            # exactly as before.
            tmp_path = out_path.with_name(f"{key}.{os.getpid()}.part.wav")
            try:
                sf.write(str(tmp_path), audio, sr)
                os.replace(str(tmp_path), str(out_path))
            finally:
                if tmp_path.exists():
                    tmp_path.unlink()

            # The status card picks its "ok" styling by looking for the check
            # mark in the message, so the success message must contain it.
            return str(out_path), "✅ Generated successfully"

        except Exception as e:
            # Capture full traceback if needed, but return clean msg
            return None, f"❌ Error: {str(e)}"
512
+
513
+
514
+
515
+ # =========================================================
516
+
517
+ # 4) MODEL LOGIC (PRESERVED & FIXED)
518
+
519
+ # =========================================================
520
+
521
def find_latest_checkpoint(model_dir, prefix="G"):
    """Return the checkpoint in ``model_dir`` with the highest step number.

    Files are selected with the glob ``<prefix>*.pth``. The step number is
    read from names of the form ``<prefix>_<digits>.pth`` or
    ``<prefix><digits>.pth``. The previous pattern had no room for the
    underscore, so names such as ``G_100000.pth`` never matched and all
    candidates compared equal.

    Args:
        model_dir: Directory to search.
        prefix: File-name prefix of the checkpoint family.

    Returns:
        The path of the newest checkpoint, or ``None`` when no file matches
        the glob. Files without a parsable step number rank as step 0.
    """
    candidates = glob.glob(os.path.join(model_dir, f"{prefix}*.pth"))
    if not candidates:
        return None

    # Match on the file name only so digits elsewhere in the path cannot be
    # mistaken for a step number; escape the prefix in case it contains
    # regex metacharacters.
    step_pattern = re.compile(rf"{re.escape(prefix)}_?(\d+)\.pth")

    def step_of(path):
        found = step_pattern.fullmatch(os.path.basename(path))
        return int(found.group(1)) if found else 0

    return max(candidates, key=step_of)
532
+
533
+
534
+
535
def download_model():
    """Return the local model directory, downloading the model if needed.

    The directory is ``<cache>/valtec_tts/models/vits-vietnamese`` where
    ``<cache>`` is ``%LOCALAPPDATA%`` (default ``~/AppData/Local``) when
    ``os.name`` is ``"nt"`` and ``$XDG_CACHE_HOME`` (default ``~/.cache``)
    otherwise. The download is skipped when the directory already holds
    ``config.json`` and at least one ``G_*.pth`` file.

    Returns:
        str: Path of the model directory.
    """
    from huggingface_hub import snapshot_download

    hf_repo = "valtecAI-team/valtec-tts-pretrained"

    home = Path.home()
    if os.name == "nt":
        cache_root = os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")
    else:
        cache_root = os.environ.get("XDG_CACHE_HOME", home / ".cache")

    model_dir = Path(cache_root).joinpath("valtec_tts", "models", "vits-vietnamese")

    already_present = model_dir.joinpath("config.json").exists() and any(
        model_dir.glob("G_*.pth")
    )
    if not already_present:
        print(f"⬇️ Downloading {hf_repo}...")
        snapshot_download(repo_id=hf_repo, local_dir=str(model_dir))

    return str(model_dir)
560
+
561
+
562
+
563
class VietnameseTTS:
    """Wrap a ``SynthesizerTrn`` model configured from a JSON config file."""

    def __init__(self, ckpt, cfg, device="cpu"):
        """Read the config, record the speaker table and load the weights.

        Args:
            ckpt: Path of the checkpoint file passed to ``torch.load``.
            cfg: Path of the JSON config; ``data.spk2id`` supplies the
                speaker-name to id mapping.
            device: Device the model and input tensors are moved to.
        """
        self.device = device
        with open(cfg, "r", encoding="utf-8") as handle:
            self.config = json.load(handle)
        self.spk2id = self.config["data"]["spk2id"]
        self.speakers = list(self.spk2id.keys())
        self._load(ckpt)

    def _load(self, ckpt):
        """Build the model from the config and load the checkpoint weights."""
        data_cfg = self.config["data"]
        spec_channels = data_cfg["filter_length"] // 2 + 1
        segment_frames = self.config["train"]["segment_size"] // data_cfg["hop_length"]

        network = SynthesizerTrn(
            len(symbols),
            spec_channels,
            segment_frames,
            n_speakers=data_cfg["n_speakers"],
            **self.config["model"]
        )
        self.model = network.to(self.device)

        state = torch.load(ckpt, map_location=self.device)["model"]
        # Remove any "module." fragment from the stored parameter names
        # (presumably left by a DataParallel-style wrapper; confirm against
        # the training code). strict=False tolerates key mismatches.
        renamed = {}
        for name, tensor in state.items():
            renamed[name.replace("module.", "")] = tensor
        self.model.load_state_dict(renamed, strict=False)
        self.model.eval()

    def synthesize(self, text, speaker, **kwargs):
        """Convert ``text`` to audio for the given speaker.

        Args:
            text: Raw input text.
            speaker: Speaker name; names missing from ``spk2id`` use id 0.
            **kwargs: Passed through unchanged to ``self.model.infer``.

        Returns:
            tuple: ``(audio, sampling_rate)`` where ``audio`` is a NumPy
            array taken from ``outputs[0][0, 0]`` and ``sampling_rate`` is
            ``config["data"]["sampling_rate"]``.
        """
        from src.text import cleaned_text_to_sequence
        from src.nn import commons

        # 1. Text processing: normalise, phonemise, map to id sequences.
        normalized = process_vietnamese_text(text)
        phones, tones, _ = text_to_phonemes(normalized, use_viphoneme=VIPHONEME_AVAILABLE)
        phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")

        # Insert a 0 between neighbouring ids in every sequence.
        phone_ids = commons.intersperse(phone_ids, 0)
        tone_ids = commons.intersperse(tone_ids, 0)
        lang_ids = commons.intersperse(lang_ids, 0)
        n_tokens = len(phone_ids)

        # 2. Tensors: each id sequence gets a leading dimension of size 1.
        def as_batch(ids):
            return torch.LongTensor(ids).unsqueeze(0).to(self.device)

        x = as_batch(phone_ids)
        x_len = torch.LongTensor([n_tokens]).to(self.device)
        tone = as_batch(tone_ids)
        lang = as_batch(lang_ids)
        speaker_id = self.spk2id.get(speaker, 0)
        sid = torch.LongTensor([speaker_id]).to(self.device)

        # 3. Inference without gradient tracking.
        with torch.no_grad():
            # Zero-filled tensors passed in the bert / ja_bert positions
            # (presumably feature inputs this pipeline does not use;
            # confirm against SynthesizerTrn.infer).
            bert = torch.zeros(1024, n_tokens).unsqueeze(0).to(self.device)
            ja_bert = torch.zeros(768, n_tokens).unsqueeze(0).to(self.device)

            outputs = self.model.infer(
                x, x_len, sid, tone, lang,
                bert, ja_bert,
                **kwargs
            )

            # detach() before cpu()/numpy() so the conversion never fails
            # with "Can't call numpy() on Tensor that requires grad".
            waveform = outputs[0][0, 0]
            audio = waveform.detach().cpu().numpy()

        return audio, self.config["data"]["sampling_rate"]
676
+
677
+
678
 
679
  # =========================================================
680
+
681
+ # 5) UI CONSTRUCTION (REFACTORED)
682
+
683
  # =========================================================
 
 
 
 
 
 
 
 
 
 
 
684
 
685
+ def create_ui(manager: TTSManager):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
 
 
687
 
688
+
689
+ def ui_header():
690
+
691
+ return gr.HTML("""
692
+
693
+ <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
694
+
695
+ <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem; letter-spacing: -0.02em;">
696
+
697
+ 🎛️ CVNSS4.0 Vietnamese TTS Studio
698
+
699
+ </h1>
700
+
701
+ <div style="color: #94a3b8; font-size: 1rem; margin-top: 5px; font-weight: 400;">
702
+
703
+ Thiết kế bởi Long Ngo, 2026 • Phiên bản 1.0.1 Demo • Dự án mã nguồn mở, cố vấn Thầy Trần Tư Bình
704
+
705
+ </div>
706
+
707
  </div>
708
+
709
  """)
710
+
711
+
712
+
713
+ def ui_status_render(text, speaker, speed, chunks, dur, msg):
714
+
715
+ return f"""
716
+
717
+ <div class="statusCard">
718
+
719
+ <div style="margin-bottom:12px; font-weight:700; color:#38bdf8; font-size: 0.9rem; text-transform: uppercase;">
720
+
721
+ 📟 Trạng thái hoạt động
722
+
723
+ </div>
724
+
725
+ <div style="display:flex; flex-wrap:wrap; gap:8px;">
726
+
727
+ <span class="pill">🎤 {speaker}</span>
728
+
729
+ <span class="pill">⚡ {speed}x</span>
730
+
731
+ <span class="pill">📄 {len(text)} ký tự</span>
732
+
733
+ <span class="pill">🧩 {chunks} đoạn</span>
734
+
735
+ </div>
736
+
737
+ <div class="alert {'alertOk' if '✅' in msg else 'alertWarn'}">
738
+
739
+ {msg}
740
+
741
+ </div>
742
+
743
+ </div>
744
+
745
+ """
746
+
747
+
748
+
749
+ with gr.Blocks(theme=gr.themes.Base(), css=NEON_CSS, title="Neon TTS Expert") as app:
750
+
751
+ ui_header()
752
+
753
 
 
 
 
 
 
754
 
755
  with gr.Tabs():
756
+
757
+ # --- TAB BASIC ---
758
+
759
  with gr.Tab("⚡ Chế độ Nhanh"):
760
+
761
  with gr.Row():
762
+
763
+ # INPUT COLUMN
764
+
765
+ with gr.Column(scale=2):
766
+
767
+ # REFACTOR: Using a specific ID for the container to target with CSS isolation
768
+
769
+ with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-basic"):
770
+
771
+ gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
772
+
773
+
774
+
775
+ # THE FIX: Pure Textbox with updated styling (Dark Blue text)
776
+
777
+ txt_basic = gr.Textbox(
778
+
779
+ label="",
780
+
781
+ show_label=False,
782
+
783
+ placeholder="Nhập nội dung tiếng Việt vào... ( dụ: Xin chào, bạn đã học qua CVNSS4.0 chưa?)",
784
+
785
+ lines=6,
786
+
787
+ elem_id="main-input-basic"
788
+
789
+ )
790
+
791
+
792
+
793
+ with gr.Row():
794
+
795
+ spk_basic = gr.Dropdown(choices=manager.tts.spea