CVNSS committed on
Commit
d0acdd5
·
verified ·
1 Parent(s): 5a15d93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -264
app.py CHANGED
@@ -2,11 +2,10 @@
2
  # -*- coding: utf-8 -*-
3
 
4
  """
5
- 💎 CVNSS4.0 Vietnamese TTS Studio - Azure Horizon Edition
6
- - Architecture: Singleton Pattern & Lazy Loading
7
- - Design System: Soft UI (Light Theme) & Ceramic Typography
8
- - Author: Long Ngo | Refactored by 100-Year AI Expert
9
- - Advisor: Trần Tư Bình
10
  """
11
 
12
  import os
@@ -14,33 +13,34 @@ import sys
14
  import json
15
  import time
16
  import re
17
- import hashlib
18
- import tempfile
19
  import logging
20
- import gc
 
21
  from pathlib import Path
22
- from typing import Optional, Tuple, List
23
 
24
  import torch
25
  import numpy as np
26
  import soundfile as sf
27
  import gradio as gr
 
28
 
29
- # --- 1. LOGGING & PATH SETUP ---
30
  logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
31
  logger = logging.getLogger("CVNSS_Studio")
32
 
33
- # Auto-detect root path
34
- try:
35
- ROOT_DIR = Path(__file__).resolve().parent
36
- except NameError:
37
- ROOT_DIR = Path.cwd()
38
-
39
  if str(ROOT_DIR) not in sys.path:
40
  sys.path.insert(0, str(ROOT_DIR))
41
 
42
- # --- 2. CORE MODULE LOADER ---
 
43
  try:
 
 
 
 
44
  from src.vietnamese.text_processor import process_vietnamese_text
45
  from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
46
  from src.models.synthesizer import SynthesizerTrn
@@ -48,27 +48,27 @@ try:
48
  from src.text import cleaned_text_to_sequence
49
  from src.nn import commons
50
  CORE_LOADED = True
51
- except ImportError as e:
52
- logger.error(f"❌ Core modules missing: {e}")
 
53
  CORE_LOADED = False
 
54
  VIPHONEME_AVAILABLE = False
55
  symbols = []
56
 
57
  # =========================================================
58
- # 3. ELEGANT LIGHT THEME CSS (The "Azure Horizon" Style)
59
  # =========================================================
60
  ELEGANT_CSS = r"""
61
  @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;600;800&family=Roboto+Mono:wght@400;500&display=swap');
62
 
63
  :root {
64
  --primary-blue: #3b82f6;
65
- --soft-blue: #eff6ff;
66
  --text-dark: #1e293b;
67
  --text-gray: #64748b;
68
  --surface-white: #ffffff;
69
  --bg-gradient: linear-gradient(135deg, #f8fafc 0%, #e0f2fe 100%);
70
- --shadow-soft: 0 10px 30px -10px rgba(59, 130, 246, 0.15);
71
- --radius-xl: 24px;
72
  }
73
 
74
  body, .gradio-container {
@@ -77,100 +77,43 @@ body, .gradio-container {
77
  color: var(--text-dark) !important;
78
  }
79
 
80
- /* --- CARDS & PANELS --- */
81
  .elegant-card {
82
  background: var(--surface-white);
83
  border-radius: var(--radius-xl);
84
  border: 1px solid rgba(255, 255, 255, 0.8);
85
- box-shadow: var(--shadow-soft);
86
- padding: 30px;
87
- transition: transform 0.2s ease, box-shadow 0.2s ease;
88
- }
89
-
90
- .elegant-card:hover {
91
- transform: translateY(-2px);
92
- box-shadow: 0 20px 40px -10px rgba(59, 130, 246, 0.2);
93
  }
94
 
95
- /* --- TYPOGRAPHY --- */
96
  .header-title {
97
- font-weight: 800;
98
- font-size: 2.2rem;
99
- background: linear-gradient(to right, #0f172a, #3b82f6);
100
- -webkit-background-clip: text;
101
- -webkit-text-fill-color: transparent;
102
  letter-spacing: -0.03em;
103
- margin-bottom: 0.5rem;
104
- }
105
-
106
- .header-subtitle {
107
- color: var(--text-gray);
108
- font-size: 0.95rem;
109
- font-weight: 500;
110
- }
111
-
112
- /* --- INPUTS --- */
113
- textarea, input {
114
- background: #f1f5f9 !important;
115
- border: 2px solid transparent !important;
116
- border-radius: 12px !important;
117
- color: var(--text-dark) !important;
118
- font-size: 1rem !important;
119
- transition: all 0.3s ease !important;
120
- }
121
-
122
- textarea:focus, input:focus {
123
- background: #ffffff !important;
124
- border-color: var(--primary-blue) !important;
125
- box-shadow: 0 0 0 4px rgba(59, 130, 246, 0.1) !important;
126
  }
127
 
128
- /* --- BUTTONS --- */
129
  button.primary-btn {
130
  background: var(--primary-blue) !important;
131
  color: white !important;
132
- font-weight: 600 !important;
133
  border-radius: 12px !important;
134
  border: none !important;
135
- padding: 12px 24px !important;
136
- box-shadow: 0 4px 12px rgba(59, 130, 246, 0.3) !important;
137
- transition: all 0.2s ease !important;
138
- }
139
- button.primary-btn:hover {
140
- background: #2563eb !important;
141
- box-shadow: 0 6px 16px rgba(59, 130, 246, 0.4) !important;
142
- transform: translateY(-1px);
143
- }
144
- button.primary-btn:active { transform: translateY(0); }
145
-
146
- /* --- STATUS BADGES --- */
147
- .badge {
148
- display: inline-flex;
149
- align-items: center;
150
- padding: 6px 16px;
151
- border-radius: 99px;
152
- font-size: 0.85rem;
153
- font-weight: 600;
154
- font-family: 'Roboto Mono', monospace;
155
  }
156
- .badge-success { background: #dcfce7; color: #15803d; border: 1px solid #bbf7d0; }
157
- .badge-error { background: #fee2e2; color: #b91c1c; border: 1px solid #fecaca; }
158
- .badge-info { background: #e0f2fe; color: #0369a1; border: 1px solid #bae6fd; }
159
-
160
- /* --- TABS --- */
161
- .tabs { border-bottom: none !important; }
162
- .tab-nav button { font-weight: 600 !important; color: var(--text-gray) !important; }
163
- .tab-nav button.selected { color: var(--primary-blue) !important; border-bottom: 3px solid var(--primary-blue) !important; }
164
  """
165
 
166
  # =========================================================
167
- # 4. INTELLIGENT UTILITIES
168
  # =========================================================
169
  def split_text_smart(text: str, max_chars: int = 300) -> List[str]:
170
- """Cắt câu thông minh, giữ nguyên ngữ điệu tiếng Việt."""
171
  if not text: return []
172
  text = re.sub(r'\s+', ' ', text).strip()
173
- # Tách câu dựa trên dấu chấm câu nhưng giữ lại dấu
174
  raw = re.split(r'([.?!;:])', text)
175
  sentences = []
176
  current = ""
@@ -182,8 +125,7 @@ def split_text_smart(text: str, max_chars: int = 300) -> List[str]:
182
  else:
183
  current += part
184
  if current: sentences.append(current.strip())
185
-
186
- # Ghép lại thành chunk
187
  chunks = []
188
  chunk = ""
189
  for sent in sentences:
@@ -196,49 +138,89 @@ def split_text_smart(text: str, max_chars: int = 300) -> List[str]:
196
  return chunks if chunks else [text]
197
 
198
  # =========================================================
199
- # 5. ENGINE CORE (Singleton)
200
  # =========================================================
201
  class TTSManager:
202
- """Quản lý mô hình VITS với tối ưu hóa bộ nhớ."""
203
- def __init__(self, model_path):
204
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
205
- self.model_dir = Path(model_path)
206
- self.config_path = self.model_dir / "config.json"
207
- self._load_model()
208
-
209
- def _load_model(self):
210
- # Find latest checkpoint
211
- ckpts = sorted(list(self.model_dir.glob("G_*.pth")),
212
- key=lambda x: int(re.search(r'G_(\d+)', x.name).group(1)) if re.search(r'G_(\d+)', x.name) else 0,
213
- reverse=True)
214
- if not ckpts: raise FileNotFoundError("Không tìm thấy checkpoint G_*.pth")
215
 
216
- with open(self.config_path, "r", encoding="utf-8") as f:
217
- self.hps = json.load(f)
218
-
219
- self.spk2id = self.hps["data"]["spk2id"]
220
- self.speakers = sorted(list(self.spk2id.keys()))
221
-
222
- self.net_g = SynthesizerTrn(
223
- len(symbols),
224
- self.hps["data"]["filter_length"] // 2 + 1,
225
- self.hps["train"]["segment_size"] // self.hps["data"]["hop_length"],
226
- n_speakers=self.hps["data"]["n_speakers"],
227
- **self.hps["model"]
228
- ).to(self.device)
229
-
230
- ckpt = torch.load(ckpts[0], map_location=self.device)
231
- self.net_g.load_state_dict(ckpt['model'])
232
- self.net_g.eval()
233
- logger.info(f"✅ Model loaded on {self.device}")
234
-
235
- def infer(self, text, spk, speed, noise_scale, noise_width, sdp_ratio):
236
- if not text: return None, 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
- # Clean memory
239
  if self.device.type == 'cuda': torch.cuda.empty_cache()
240
 
241
- # Processing
242
  text_norm = process_vietnamese_text(text)
243
  phones, tones, _ = text_to_phonemes(text_norm, use_viphoneme=VIPHONEME_AVAILABLE)
244
  phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
@@ -251,173 +233,127 @@ class TTSManager:
251
  sid = torch.LongTensor([self.spk2id.get(spk, 0)]).to(self.device)
252
 
253
  outputs = self.net_g.infer(x, x_len, sid, tone, lang,
254
- noise_scale=noise_scale, noise_scale_w=noise_width,
255
- length_scale=speed, sdp_ratio=sdp_ratio)
256
 
257
- # Safe Detach
258
  audio = outputs[0][0, 0].data.cpu().float().numpy()
259
- del x, x_len, tone, lang, sid, outputs
260
-
261
- return audio, self.hps["data"]["sampling_rate"]
262
 
263
  # =========================================================
264
- # 6. MODEL FETCHING
265
- # =========================================================
266
- def setup_engine():
267
- cache_dir = Path.home() / ".cache" / "cvnss_vits"
268
- model_dir = cache_dir / "vits-vietnamese"
269
-
270
- if not (model_dir / "config.json").exists():
271
- print("⬇️ Đang tải mô hình CVNSS4.0 từ Server...")
272
- from huggingface_hub import snapshot_download
273
- snapshot_download(repo_id="valtecAI-team/valtec-tts-pretrained", local_dir=str(model_dir))
274
-
275
- return TTSManager(model_dir)
276
-
277
- # =========================================================
278
- # 7. UI CONSTRUCTION (Clean & Bright)
279
  # =========================================================
280
  def build_interface():
281
- engine = None
282
- if CORE_LOADED:
283
- try:
284
- engine = setup_engine()
285
- except Exception as e:
286
- logger.error(str(e))
287
 
288
- def run_inference(text, spk, speed, ns, nsw, sdp, is_long=False, chunk_size=300, pause=250, progress=gr.Progress()):
289
- if not engine: return None, "<span class='badge badge-error'>❌ Engine chưa sẵn sàng</span>"
 
290
 
 
 
291
  start_time = time.time()
292
-
293
  try:
 
 
 
294
  if not is_long:
295
- audio, sr = engine.infer(text, spk, speed, ns, nsw, sdp)
296
- full_audio = audio
297
  else:
298
  chunks = split_text_smart(text, chunk_size)
299
  segments = []
300
- sr = 22050
301
- silence = np.zeros(int(sr * pause / 1000))
302
 
303
  for i, chunk in enumerate(chunks):
304
- progress((i)/len(chunks), desc=f"Đang đọc: {chunk[:15]}...")
305
- a, r = engine.infer(chunk, spk, speed, ns, nsw, sdp)
306
- if a is not None:
307
- sr = r
308
- segments.append(a)
309
- segments.append(silence)
310
- full_audio = np.concatenate(segments) if segments else None
 
 
311
 
312
- if full_audio is None: return None, "<span class='badge badge-error'>❌ Không có âm thanh</span>"
313
 
 
314
  proc_time = time.time() - start_time
315
  dur = len(full_audio) / sr
316
-
317
- # Output HTML for Light Theme
318
- html = f"""
319
- <div style="display: flex; gap: 10px; margin-top: 10px;">
320
- <span class="badge badge-success">✅ Hoàn tất</span>
321
- <span class="badge badge-info">⏱️ Xử lý: {proc_time:.2f}s</span>
322
- <span class="badge badge-info">🔊 Độ dài: {dur:.1f}s</span>
323
- </div>
324
- """
325
-
326
- # Save to temp file needed for Gradio
327
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
328
  sf.write(fp.name, full_audio, sr)
329
- return fp.name, html
330
-
331
- except Exception as ex:
332
- return None, f"<span class='badge badge-error'>❌ Lỗi: {str(ex)}</span>"
333
 
334
- # --- UI LAYOUT ---
335
- speakers = engine.speakers if engine else ["Đang tải..."]
 
 
 
336
 
337
- with gr.Blocks(theme=gr.themes.Soft(), css=ELEGANT_CSS, title="CVNSS4.0 Studio") as app:
338
- # HEADER
339
  with gr.Row():
340
- with gr.Column(scale=1):
341
- gr.HTML("""
342
- <div style="padding: 20px 0;">
343
  <div class="header-title">CVNSS4.0 Studio</div>
344
- <div class="header-subtitle">
345
- Tác giả: <b>Long Ngo</b>Cố vấn: <b>Thầy Trần Bình</b><br>
346
- Công nghệ lõi: VITS 4.0 • Grandmaster Edition
347
  </div>
348
  </div>
349
  """)
350
-
 
 
 
 
 
 
 
 
 
351
  with gr.Tabs():
352
- # TAB 1: QUICK MODE
353
  with gr.Tab("⚡ Chế độ Nhanh"):
354
  with gr.Row():
355
- # Input Panel
356
- with gr.Column(scale=5, elem_classes="elegant-card"):
357
- txt_input = gr.Textbox(
358
- label="Nhập văn bản (Tiếng Việt)",
359
- placeholder="Xin chào, hôm nay là một ngày tuyệt vời...",
360
- lines=4,
361
- show_label=True
362
- )
363
  with gr.Row():
364
- spk_drop = gr.Dropdown(speakers, value=speakers[0], label="Giọng đọc", scale=2)
365
- spd_slider = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Tốc độ", scale=3)
366
-
367
- btn_fast = gr.Button("🔊 Đọc Ngay", elem_classes="primary-btn")
368
-
369
- # Output Panel
370
- with gr.Column(scale=4, elem_classes="elegant-card"):
371
- out_audio = gr.Audio(label="Kết quả", type="filepath", interactive=False)
372
- out_status = gr.HTML()
373
-
374
- # Event Binding
375
- btn_fast.click(
376
- fn=lambda t, s, sp: run_inference(t, s, sp, 0.667, 0.8, 0.2, False),
377
- inputs=[txt_input, spk_drop, spd_slider],
378
- outputs=[out_audio, out_status]
379
- )
380
-
381
- # TAB 2: PRO MODE
382
- with gr.Tab("💎 Chế độ Chuyên sâu"):
383
  with gr.Row():
384
- with gr.Column(scale=5, elem_classes="elegant-card"):
385
- txt_pro = gr.Textbox(label="Văn bản dài", placeholder="Dán nội dung bài báo hoặc truyện vào đây...", lines=8)
386
-
387
- with gr.Accordion("🛠️ Cấu hình Nâng cao", open=False):
388
- with gr.Row():
389
- ns = gr.Slider(0.1, 1.5, 0.667, label="Độ biến thiên (Noise Scale)")
390
- nsw = gr.Slider(0.1, 1.5, 0.8, label="Độ rộng âm (Noise Width)")
391
- with gr.Row():
392
- sdp = gr.Slider(0, 1, 0.2, label="Ngẫu nhiên (SDP)")
393
- chunk = gr.Slider(100, 1000, 300, step=50, label="Ngắt đoạn (Ký tự)")
394
- pause = gr.Slider(0, 1000, 250, label="Nghỉ câu (ms)")
395
-
396
- btn_pro = gr.Button("🚀 Xử lý Văn bản Dài", elem_classes="primary-btn")
397
-
398
- with gr.Column(scale=4, elem_classes="elegant-card"):
399
- out_audio_pro = gr.Audio(label="Audio Tổng hợp", type="filepath")
400
- out_status_pro = gr.HTML()
401
-
402
- btn_pro.click(
403
- fn=lambda t, s, sp, n, nw, sd, c, p: run_inference(t, s, sp, n, nw, sd, True, c, p),
404
- inputs=[txt_pro, spk_drop, spd_slider, ns, nsw, sdp, chunk, pause],
405
- outputs=[out_audio_pro, out_status_pro]
406
- )
407
 
408
  return app
409
 
410
- # =========================================================
411
- # 8. LAUNCHER (Fix lỗi Space Init)
412
- # =========================================================
413
  if __name__ == "__main__":
414
- try:
415
- if not (ROOT_DIR / "src").exists():
416
- print("⚠️ CẢNH BÁO: Chưa tìm thấy thư mục 'src'. Vui lòng upload đầy đủ mã nguồn!")
417
-
418
- ui = build_interface()
419
- # Launch với settings tối ưu cho Hugging Face Spaces
420
- ui.queue(max_size=10).launch(server_name="0.0.0.0", show_error=True)
421
-
422
- except Exception as e:
423
- print(f"❌ Lỗi khởi động: {e}")
 
2
  # -*- coding: utf-8 -*-
3
 
4
  """
5
+ 💎 CVNSS4.0 Vietnamese TTS Studio - Công nghệ giọng nói
6
+ - Compatibility: Valtec Source Structure
7
+ - Author: Long Ngo, 2026 | Phiên bản 1.0.1
8
+ - Advisor: Thầy Trần Bình
 
9
  """
10
 
11
  import os
 
13
  import json
14
  import time
15
  import re
 
 
16
  import logging
17
+ import tempfile
18
+ import shutil
19
  from pathlib import Path
20
+ from typing import Optional, List
21
 
22
  import torch
23
  import numpy as np
24
  import soundfile as sf
25
  import gradio as gr
26
+ from huggingface_hub import hf_hub_download
27
 
28
+ # --- 1. ROBUST LOGGING & PATH SETUP ---
29
  logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
30
  logger = logging.getLogger("CVNSS_Studio")
31
 
32
+ # Định vị thư mục gốc chính xác
33
+ ROOT_DIR = Path(__file__).resolve().parent
 
 
 
 
34
  if str(ROOT_DIR) not in sys.path:
35
  sys.path.insert(0, str(ROOT_DIR))
36
 
37
+ # --- 2. IMPORT HANDLER (CRITICAL FIX) ---
38
+ # Chúng ta sẽ thử import, nếu thiếu src sẽ báo lỗi rõ ràng
39
  try:
40
+ # Kiểm tra xem folder src có tồn tại không
41
+ if not (ROOT_DIR / "src").exists():
42
+ raise ImportError("Thư mục 'src' không tồn tại. Vui lòng upload folder src từ repo gốc!")
43
+
44
  from src.vietnamese.text_processor import process_vietnamese_text
45
  from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
46
  from src.models.synthesizer import SynthesizerTrn
 
48
  from src.text import cleaned_text_to_sequence
49
  from src.nn import commons
50
  CORE_LOADED = True
51
+ IMPORT_ERROR_MSG = ""
52
+ except Exception as e:
53
+ logger.error(f"❌ Core load failed: {e}")
54
  CORE_LOADED = False
55
+ IMPORT_ERROR_MSG = str(e)
56
  VIPHONEME_AVAILABLE = False
57
  symbols = []
58
 
59
  # =========================================================
60
+ # 3. ELEGANT CSS (AZURE HORIZON)
61
  # =========================================================
62
  ELEGANT_CSS = r"""
63
  @import url('https://fonts.googleapis.com/css2?family=Manrope:wght@400;600;800&family=Roboto+Mono:wght@400;500&display=swap');
64
 
65
  :root {
66
  --primary-blue: #3b82f6;
 
67
  --text-dark: #1e293b;
68
  --text-gray: #64748b;
69
  --surface-white: #ffffff;
70
  --bg-gradient: linear-gradient(135deg, #f8fafc 0%, #e0f2fe 100%);
71
+ --radius-xl: 20px;
 
72
  }
73
 
74
  body, .gradio-container {
 
77
  color: var(--text-dark) !important;
78
  }
79
 
 
80
  .elegant-card {
81
  background: var(--surface-white);
82
  border-radius: var(--radius-xl);
83
  border: 1px solid rgba(255, 255, 255, 0.8);
84
+ box-shadow: 0 10px 30px -10px rgba(59, 130, 246, 0.15);
85
+ padding: 24px;
 
 
 
 
 
 
86
  }
87
 
 
88
  .header-title {
89
+ font-weight: 800;
90
+ font-size: 2rem;
91
+ color: #0f172a;
 
 
92
  letter-spacing: -0.03em;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  }
94
 
 
95
  button.primary-btn {
96
  background: var(--primary-blue) !important;
97
  color: white !important;
 
98
  border-radius: 12px !important;
99
  border: none !important;
100
+ font-weight: 600 !important;
101
+ transition: 0.2s !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
103
+ button.primary-btn:hover { transform: translateY(-2px); box-shadow: 0 10px 20px -5px rgba(59, 130, 246, 0.4); }
104
+
105
+ .badge { display: inline-flex; align-items: center; padding: 4px 12px; border-radius: 99px; font-size: 0.8rem; font-weight: 600; margin-right: 5px;}
106
+ .badge-success { background: #dcfce7; color: #15803d; }
107
+ .badge-error { background: #fee2e2; color: #b91c1c; }
108
+ .badge-warn { background: #fef9c3; color: #854d0e; }
 
 
109
  """
110
 
111
  # =========================================================
112
+ # 4. UTILITIES & LOGIC
113
  # =========================================================
114
  def split_text_smart(text: str, max_chars: int = 300) -> List[str]:
 
115
  if not text: return []
116
  text = re.sub(r'\s+', ' ', text).strip()
 
117
  raw = re.split(r'([.?!;:])', text)
118
  sentences = []
119
  current = ""
 
125
  else:
126
  current += part
127
  if current: sentences.append(current.strip())
128
+
 
129
  chunks = []
130
  chunk = ""
131
  for sent in sentences:
 
138
  return chunks if chunks else [text]
139
 
140
  # =========================================================
141
+ # 5. ENGINE CORE (Auto-Downloading)
142
  # =========================================================
143
  class TTSManager:
144
+ def __init__(self):
 
145
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
146
+ self.net_g = None
147
+ self.hps = None
148
+ self.ready = False
149
+ self.status_msg = "Khởi tạo..."
 
 
 
 
 
 
150
 
151
+ # Tự động load ngay khi init
152
+ self._initialize_model()
153
+
154
+ def _download_file_if_missing(self, repo_id, filename, local_dir):
155
+ target_path = local_dir / filename
156
+ if not target_path.exists():
157
+ logger.info(f"⬇️ Đang tải {filename}...")
158
+ try:
159
+ # Tải về file tạm rồi move vào đúng chỗ để tránh lỗi cache
160
+ file_path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)
161
+ return Path(file_path)
162
+ except Exception as e:
163
+ logger.error(f"Không tải được {filename}: {e}")
164
+ return None
165
+ return target_path
166
+
167
+ def _initialize_model(self):
168
+ try:
169
+ # 1. Định nghĩa thư mục chứa model cục bộ (để kiểm soát chắc chắn)
170
+ model_dir = ROOT_DIR / "model_cache"
171
+ model_dir.mkdir(exist_ok=True)
172
+
173
+ repo_id = "valtecAI-team/valtec-vietnamese-tts" # Repo gốc bạn cung cấp
174
+
175
+ # 2. Tải Config
176
+ cfg_path = self._download_file_if_missing(repo_id, "config.json", model_dir)
177
+
178
+ # 3. Tải Model (G_100000.pth hoặc file G mới nhất)
179
+ # Ở đây ta hardcode file G_100000.pth vì repo valtec thường dùng tên này hoặc tương tự
180
+ # Bạn có thể đổi tên file nếu repo update
181
+ ckpt_path = self._download_file_if_missing(repo_id, "G_100000.pth", model_dir)
182
+
183
+ if not cfg_path or not ckpt_path:
184
+ self.status_msg = "❌ Không tải được file model. Kiểm tra kết nối mạng."
185
+ return
186
+
187
+ # 4. Load Config
188
+ with open(cfg_path, "r", encoding="utf-8") as f:
189
+ self.hps = json.load(f)
190
+
191
+ self.spk2id = self.hps["data"]["spk2id"]
192
+ self.speakers = sorted(list(self.spk2id.keys()))
193
+
194
+ # 5. Load Network
195
+ if CORE_LOADED:
196
+ self.net_g = SynthesizerTrn(
197
+ len(symbols),
198
+ self.hps["data"]["filter_length"] // 2 + 1,
199
+ self.hps["train"]["segment_size"] // self.hps["data"]["hop_length"],
200
+ n_speakers=self.hps["data"]["n_speakers"],
201
+ **self.hps["model"]
202
+ ).to(self.device)
203
+
204
+ ckpt = torch.load(ckpt_path, map_location=self.device)
205
+ self.net_g.load_state_dict(ckpt['model'])
206
+ self.net_g.eval()
207
+ self.ready = True
208
+ self.status_msg = f"✅ Sẵn sàng ({self.device})"
209
+ logger.info("Engine Ready!")
210
+ else:
211
+ self.status_msg = "❌ Lỗi Import Core (src folder missing)"
212
+
213
+ except Exception as e:
214
+ self.ready = False
215
+ self.status_msg = f"❌ Lỗi Init: {str(e)}"
216
+ logger.error(self.status_msg)
217
+
218
+ def infer(self, text, spk, speed, ns, nsw, sdp):
219
+ if not self.ready:
220
+ raise RuntimeError(f"Engine chưa sẵn sàng: {self.status_msg}")
221
 
 
222
  if self.device.type == 'cuda': torch.cuda.empty_cache()
223
 
 
224
  text_norm = process_vietnamese_text(text)
225
  phones, tones, _ = text_to_phonemes(text_norm, use_viphoneme=VIPHONEME_AVAILABLE)
226
  phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
 
233
  sid = torch.LongTensor([self.spk2id.get(spk, 0)]).to(self.device)
234
 
235
  outputs = self.net_g.infer(x, x_len, sid, tone, lang,
236
+ noise_scale=ns, noise_scale_w=nsw,
237
+ length_scale=speed, sdp_ratio=sdp)
238
 
 
239
  audio = outputs[0][0, 0].data.cpu().float().numpy()
240
+ return audio, self.hps["data"]["sampling_rate"]
 
 
241
 
242
  # =========================================================
243
+ # 6. UI CONSTRUCTION
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  # =========================================================
245
  def build_interface():
246
+ # Khởi tạo Manager
247
+ manager = TTSManager()
 
 
 
 
248
 
249
+ def run_inference(text, spk, speed, ns, nsw, sdp, is_long, chunk_size, pause, progress=gr.Progress()):
250
+ if not manager.ready:
251
+ return None, f"<span class='badge badge-error'>{manager.status_msg}</span><br><small>{IMPORT_ERROR_MSG}</small>"
252
 
253
+ if not text: return None, "⚠️ Chưa nhập nội dung"
254
+
255
  start_time = time.time()
 
256
  try:
257
+ full_audio = None
258
+ sr = 0
259
+
260
  if not is_long:
261
+ full_audio, sr = manager.infer(text, spk, speed, ns, nsw, sdp)
 
262
  else:
263
  chunks = split_text_smart(text, chunk_size)
264
  segments = []
265
+ # Dummy sr, will be updated
266
+ sr = 22050
267
 
268
  for i, chunk in enumerate(chunks):
269
+ progress((i)/len(chunks), desc=f"Đoạn {i+1}/{len(chunks)}")
270
+ a, r = manager.infer(chunk, spk, speed, ns, nsw, sdp)
271
+ sr = r
272
+ segments.append(a)
273
+ if pause > 0:
274
+ segments.append(np.zeros(int(sr * pause / 1000)))
275
+
276
+ if segments:
277
+ full_audio = np.concatenate(segments)
278
 
279
+ if full_audio is None: return None, " Lỗi tạo âm thanh"
280
 
281
+ # Export
282
  proc_time = time.time() - start_time
283
  dur = len(full_audio) / sr
 
 
 
 
 
 
 
 
 
 
 
284
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
285
  sf.write(fp.name, full_audio, sr)
286
+ return fp.name, f"<span class='badge badge-success'>Hoàn thành: {dur:.1f}s</span>"
 
 
 
287
 
288
+ except Exception as e:
289
+ return None, f"<span class='badge badge-error'>Lỗi: {str(e)}</span>"
290
+
291
+ # --- LAYOUT ---
292
+ speaker_list = manager.speakers if manager.ready else ["Chưa tải model"]
293
 
294
+ with gr.Blocks(theme=gr.themes.Soft(), css=ELEGANT_CSS, title="CVNSS4.0 Auto-Fix") as app:
 
295
  with gr.Row():
296
+ with gr.Column():
297
+ gr.HTML(f"""
298
+ <div style="margin-bottom: 20px;">
299
  <div class="header-title">CVNSS4.0 Studio</div>
300
+ <div style="color: #64748b; font-size: 0.9rem;">
301
+ Long Ngo • Trần Bình Valtec TTS Core<br>
302
+ Trạng thái Engine: <b>{manager.status_msg}</b>
303
  </div>
304
  </div>
305
  """)
306
+ if not CORE_LOADED:
307
+ gr.HTML(f"""
308
+ <div style="background: #fee2e2; color: #b91c1c; padding: 10px; border-radius: 8px; margin-bottom: 10px;">
309
+ <b>⚠️ CẢNH BÁO QUAN TRỌNG:</b><br>
310
+ Không tìm thấy thư mục <code>src</code>. Engine không thể chạy.<br>
311
+ Vui lòng đảm bảo bạn đã upload thư mục <code>src</code> từ repo Valtec lên tab Files của Space.
312
+ <br><i>Chi tiết lỗi: {IMPORT_ERROR_MSG}</i>
313
+ </div>
314
+ """)
315
+
316
  with gr.Tabs():
317
+ # Tab Nhanh
318
  with gr.Tab("⚡ Chế độ Nhanh"):
319
  with gr.Row():
320
+ with gr.Column(scale=3, elem_classes="elegant-card"):
321
+ txt_input = gr.Textbox(label="Văn bản", placeholder="Nhập gì đó đi...", lines=3)
 
 
 
 
 
 
322
  with gr.Row():
323
+ spk_drp = gr.Dropdown(speaker_list, value=speaker_list[0] if speaker_list else None, label="Giọng")
324
+ spd_sld = gr.Slider(0.5, 2.0, 1.0, label="Tốc độ")
325
+ btn_run = gr.Button("🔊 Đọc Ngay", elem_classes="primary-btn")
326
+
327
+ with gr.Column(scale=2, elem_classes="elegant-card"):
328
+ out_aud = gr.Audio(label="Kết quả", type="filepath")
329
+ out_html = gr.HTML()
330
+
331
+ btn_run.click(lambda t, s, sp: run_inference(t, s, sp, 0.667, 0.8, 0.2, False, 0, 0),
332
+ [txt_input, spk_drp, spd_sld], [out_aud, out_html])
333
+
334
+ # Tab Chuyên sâu
335
+ with gr.Tab("💎 Chế độ Dài"):
 
 
 
 
 
 
336
  with gr.Row():
337
+ with gr.Column(scale=3, elem_classes="elegant-card"):
338
+ txt_long = gr.Textbox(label="Văn bản dài", lines=6)
339
+ with gr.Accordion("Cấu hình", open=False):
340
+ ns = gr.Slider(0.1, 1.5, 0.667, label="Noise Scale")
341
+ nsw = gr.Slider(0.1, 1.5, 0.8, label="Noise Width")
342
+ sdp = gr.Slider(0, 1, 0.2, label="SDP")
343
+ chunk = gr.Slider(100, 1000, 300, label="Ngắt câu ( tự)")
344
+ pause = gr.Slider(0, 1000, 250, label="Nghỉ (ms)")
345
+ btn_long = gr.Button("🚀 Xử lý", elem_classes="primary-btn")
346
+
347
+ with gr.Column(scale=2, elem_classes="elegant-card"):
348
+ out_long = gr.Audio(label="Audio", type="filepath")
349
+ out_html_long = gr.HTML()
350
+
351
+ btn_long.click(lambda t, s, sp, n, nw, sd, c, p: run_inference(t, s, sp, n, nw, sd, True, c, p),
352
+ [txt_long, spk_drp, spd_sld, ns, nsw, sdp, chunk, pause],
353
+ [out_long, out_html_long])
 
 
 
 
 
 
354
 
355
  return app
356
 
 
 
 
357
  if __name__ == "__main__":
358
+ ui = build_interface()
359
+ ui.queue().launch(server_name="0.0.0.0", show_error=True)