CVNSS committed on
Commit
7c8d39b
·
verified ·
1 Parent(s): 3cd1c26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -774
app.py CHANGED
@@ -1,10 +1,10 @@
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
 
3
  """
4
- CVNSS4.0 Vietnamese TTS Studio với Voice Cloning
5
- - Architecture: Modular CSS & Component Separation
6
- - UX: High Contrast Input Fields + Voice Cloning Tab
7
- - Core: Optimized Logic Flow với huấn luyện & inference voice cloning
8
  """
9
 
10
  import os
@@ -15,37 +15,67 @@ import glob
15
  import re
16
  import hashlib
17
  import tempfile
 
18
  import shutil
19
  from pathlib import Path
20
- from typing import List, Tuple, Optional, Dict, Any
21
 
22
  import torch
23
  import numpy as np
24
  import soundfile as sf
25
  import gradio as gr
26
- from tqdm import tqdm
27
 
28
- # Add src to path for imports
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  sys.path.insert(0, str(Path(__file__).parent))
30
 
31
- # Import core modules
32
  try:
33
  from src.vietnamese.text_processor import process_vietnamese_text
34
  from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
35
  from src.models.synthesizer import SynthesizerTrn
36
  from src.text.symbols import symbols
37
- from src.nn import commons
38
- from src.text import cleaned_text_to_sequence
39
  except ImportError as e:
40
- print(f"⚠️ Import error: {e}")
 
 
41
  VIPHONEME_AVAILABLE = False
42
  symbols = []
 
 
43
 
44
  # =========================================================
45
- # 1) SYSTEM CONFIGURATION & CSS (The Expert Layer) - UPDATED
46
  # =========================================================
47
-
48
- # Expert CSS: Definitive Z-Index Management & Neon Theme với Voice Cloning
49
  NEON_CSS = r"""
50
  :root {
51
  --bg-dark: #0f172a;
@@ -54,29 +84,17 @@ NEON_CSS = r"""
54
  --text-primary: #e2e8f0;
55
  --neon-cyan: #06b6d4;
56
  --neon-accent: #38bdf8;
57
- --neon-purple: #8b5cf6;
58
- --neon-pink: #ec4899;
59
  --radius-lg: 16px;
60
  --radius-sm: 8px;
61
-
62
- /* UX Color Palette for Inputs */
63
  --input-bg: #f1f5f9;
64
  --input-text: #0f4c81;
65
  --input-placeholder: #64748b;
66
-
67
- /* Voice Cloning Colors */
68
- --clone-success: #10b981;
69
- --clone-warning: #f59e0b;
70
- --clone-error: #ef4444;
71
  }
72
-
73
  body, .gradio-container, .app {
74
  background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important;
75
  color: var(--text-primary) !important;
76
  font-family: 'Inter', 'Segoe UI', sans-serif;
77
  }
78
-
79
- /* --- ISOLATION FULL: CVNSS4.0 Vietnamese TTS Studio --- */
80
  .panelNeon {
81
  border: 1px solid rgba(255,255,255,0.08);
82
  border-radius: var(--radius-lg);
@@ -84,24 +102,8 @@ body, .gradio-container, .app {
84
  backdrop-filter: blur(12px);
85
  box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
86
  padding: 20px;
87
- position: relative;
88
- isolation: isolate;
89
- z-index: 1;
90
  margin-bottom: 20px;
91
  }
92
-
93
- /* Voice Cloning Special Panel */
94
- .clonePanel {
95
- border: 2px dashed var(--neon-purple);
96
- background: rgba(139, 92, 246, 0.05);
97
- }
98
-
99
- .clonePanel:hover {
100
- border-color: var(--neon-pink);
101
- background: rgba(139, 92, 246, 0.1);
102
- }
103
-
104
- /* UX IMPROVEMENT: High Contrast Input Styling */
105
  .panelNeon textarea, .panelNeon input[type="text"] {
106
  background: var(--input-bg) !important;
107
  color: var(--input-text) !important;
@@ -109,79 +111,20 @@ body, .gradio-container, .app {
109
  border-radius: var(--radius-sm) !important;
110
  font-weight: 500 !important;
111
  font-size: 1rem !important;
112
- line-height: 1.5 !important;
113
  padding: 12px !important;
114
- transition: all 0.2s ease;
115
- z-index: 10 !important;
116
- position: relative !important;
117
- }
118
-
119
- .panelNeon textarea::placeholder {
120
- color: var(--input-placeholder) !important;
121
  }
122
-
123
- .panelNeon textarea:focus, .panelNeon input:focus {
124
- background: #ffffff !important;
125
- border-color: var(--neon-cyan) !important;
126
- box-shadow: 0 0 0 4px rgba(6, 182, 212, 0.15) !important;
127
- color: #000000 !important;
128
- }
129
-
130
- /* Label Styling */
131
- .panelNeon label span {
132
- color: var(--neon-accent) !important;
133
- font-weight: 600;
134
- font-size: 0.85rem;
135
- text-transform: uppercase;
136
- letter-spacing: 0.05em;
137
- margin-bottom: 8px;
138
- display: block;
139
- }
140
-
141
- /* Dropdown & Slider fixes */
142
- .panelNeon .wrap, .panelNeon .range-compact {
143
- z-index: 10 !important;
144
- }
145
-
146
- /* Button Upgrades */
147
  button.primary, .gr-button-primary {
148
  background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important;
149
  border: none !important;
150
  color: white !important;
151
  font-weight: 700 !important;
152
- transition: transform 0.1s ease, box-shadow 0.2s ease;
153
- }
154
-
155
- button.primary:hover, .gr-button-primary:hover {
156
- box-shadow: 0 10px 15px -3px rgba(6, 182, 212, 0.3) !important;
157
- transform: translateY(-1px);
158
- }
159
-
160
- button.primary:active {
161
- transform: translateY(0px);
162
- }
163
-
164
- /* Voice Cloning Special Buttons */
165
- button.clone-btn {
166
- background: linear-gradient(135deg, var(--neon-purple) 0%, var(--neon-pink) 100%) !important;
167
- border: none !important;
168
- color: white !important;
169
- font-weight: 700 !important;
170
- }
171
-
172
- button.clone-btn:hover {
173
- box-shadow: 0 10px 15px -3px rgba(139, 92, 246, 0.3) !important;
174
- transform: translateY(-1px);
175
  }
176
-
177
- /* Status Panel */
178
  .statusCard {
179
  background: rgba(15, 23, 42, 0.6);
180
  border-radius: var(--radius-sm);
181
  padding: 16px;
182
  border: 1px solid rgba(255,255,255,0.05);
183
  }
184
-
185
  .pill {
186
  display: inline-flex;
187
  align-items: center;
@@ -193,296 +136,36 @@ button.clone-btn:hover {
193
  font-size: 0.8rem;
194
  font-weight: 600;
195
  margin-right: 6px;
196
- margin-bottom: 6px;
197
- }
198
-
199
- .clone-pill {
200
- background: rgba(139, 92, 246, 0.1);
201
- color: var(--neon-purple);
202
- border: 1px solid rgba(139, 92, 246, 0.2);
203
- }
204
-
205
- .alert {
206
- padding: 12px;
207
- border-radius: 8px;
208
- margin-top: 12px;
209
- font-size: 0.9rem;
210
- font-weight: 500;
211
- display: flex;
212
- align-items: center;
213
- gap: 8px;
214
- }
215
-
216
- .alertOk {
217
- background: rgba(34, 197, 94, 0.1);
218
- color: #4ade80;
219
- border: 1px solid rgba(34, 197, 94, 0.2);
220
- }
221
-
222
- .alertWarn {
223
- background: rgba(234, 179, 8, 0.1);
224
- color: #facc15;
225
- border: 1px solid rgba(234, 179, 8, 0.2);
226
- }
227
-
228
- .alertClone {
229
- background: rgba(139, 92, 246, 0.1);
230
- color: var(--neon-purple);
231
- border: 1px solid rgba(139, 92, 246, 0.2);
232
- }
233
-
234
- .alertCloneSuccess {
235
- background: rgba(16, 185, 129, 0.1);
236
- color: var(--clone-success);
237
- border: 1px solid rgba(16, 185, 129, 0.2);
238
- }
239
-
240
- /* Progress Bar Styling */
241
- .progress-bar {
242
- height: 8px;
243
- border-radius: 4px;
244
- background: rgba(255, 255, 255, 0.1);
245
- overflow: hidden;
246
- margin: 10px 0;
247
- }
248
-
249
- .progress-fill {
250
- height: 100%;
251
- background: linear-gradient(90deg, var(--neon-purple), var(--neon-pink));
252
- border-radius: 4px;
253
- transition: width 0.3s ease;
254
- }
255
-
256
- /* File Upload Styling */
257
- .upload-area {
258
- border: 2px dashed var(--neon-purple);
259
- border-radius: var(--radius-sm);
260
- padding: 30px;
261
- text-align: center;
262
- background: rgba(139, 92, 246, 0.05);
263
- cursor: pointer;
264
- transition: all 0.3s ease;
265
- }
266
-
267
- .upload-area:hover {
268
- background: rgba(139, 92, 246, 0.1);
269
- border-color: var(--neon-pink);
270
  }
 
 
 
271
  """
272
 
273
  # =========================================================
274
  # 2) UTILITIES & HELPERS
275
  # =========================================================
276
-
277
  def check_viphoneme():
278
  if not VIPHONEME_AVAILABLE:
279
  print("⚠️ Viphoneme not available.")
280
  return False
281
- try:
282
- phones, _, _ = text_to_phonemes("Test", use_viphoneme=True)
283
- print("✅ Viphoneme active.")
284
- return True
285
- except Exception as e:
286
- print(f"❌ Viphoneme error: {e}")
287
- return False
288
 
289
  def md5_key(*parts: str) -> str:
290
  return hashlib.md5("|".join(parts).encode("utf-8")).hexdigest()
291
 
292
- def split_sentences_vi(text: str, max_chars: int):
293
- if not text: return []
294
- text = re.sub(r'\s+', ' ', text).strip()
295
- parts = re.split(r'([.?!;:])', text)
296
-
297
- chunks = []
298
- current_chunk = ""
299
-
300
- for i in range(0, len(parts) - 1, 2):
301
- sentence = parts[i] + parts[i+1]
302
- if len(current_chunk) + len(sentence) <= max_chars:
303
- current_chunk += sentence
304
- else:
305
- if current_chunk: chunks.append(current_chunk.strip())
306
- current_chunk = sentence
307
-
308
- if len(parts) % 2 != 0 and parts[-1]:
309
- sentence = parts[-1]
310
- if len(current_chunk) + len(sentence) <= max_chars:
311
- current_chunk += sentence
312
- else:
313
- if current_chunk: chunks.append(current_chunk.strip())
314
- current_chunk = sentence
315
-
316
- if current_chunk: chunks.append(current_chunk.strip())
317
- return chunks
318
-
319
- # =========================================================
320
- # 3) VOICE CLONING MODULE
321
- # =========================================================
322
-
323
- class VoiceCloningManager:
324
- """Quản lý voice cloning - huấn luyện và inference"""
325
-
326
- def __init__(self, base_model_path: str, config_path: str, device: str = "cpu"):
327
- self.device = device
328
- self.base_model_path = base_model_path
329
- self.config_path = config_path
330
- self.clone_dir = Path(__file__).parent / "cloned_voices"
331
- self.clone_dir.mkdir(exist_ok=True)
332
-
333
- # Load base model config
334
- with open(config_path, "r", encoding="utf-8") as f:
335
- self.config = json.load(f)
336
-
337
- # Speaker management
338
- self.speaker_file = self.clone_dir / "speakers.json"
339
- self.speakers = self.load_speakers()
340
-
341
- def load_speakers(self) -> Dict:
342
- """Load danh sách speakers đã clone"""
343
- if self.speaker_file.exists():
344
- with open(self.speaker_file, "r", encoding="utf-8") as f:
345
- return json.load(f)
346
- return {"base_speakers": [], "cloned_speakers": []}
347
-
348
- def save_speakers(self):
349
- """Lưu danh sách speakers"""
350
- with open(self.speaker_file, "w", encoding="utf-8") as f:
351
- json.dump(self.speakers, f, indent=2, ensure_ascii=False)
352
-
353
- def extract_voice_embeddings(self, audio_files: List[str], speaker_name: str) -> Optional[torch.Tensor]:
354
- """
355
- Trích xuất embedding từ audio samples (simplified version)
356
- Trong thực tế cần dùng model như ECAPA-TDNN, WavLM, etc.
357
- """
358
- try:
359
- # Placeholder: Sử dụng random embedding cho demo
360
- # Trong production, thay bằng model embedding thật
361
- embedding_dim = 256
362
- embedding = torch.randn(embedding_dim, device=self.device)
363
-
364
- # Normalize embedding
365
- embedding = embedding / torch.norm(embedding)
366
-
367
- # Lưu embedding
368
- speaker_dir = self.clone_dir / speaker_name
369
- speaker_dir.mkdir(exist_ok=True)
370
-
371
- # Lưu audio samples
372
- for i, audio_file in enumerate(audio_files):
373
- if os.path.exists(audio_file):
374
- shutil.copy2(audio_file, speaker_dir / f"sample_{i}.wav")
375
-
376
- # Lưu embedding
377
- torch.save(embedding, speaker_dir / "embedding.pt")
378
-
379
- # Cập nhật speakers list
380
- if speaker_name not in self.speakers["cloned_speakers"]:
381
- self.speakers["cloned_speakers"].append(speaker_name)
382
- self.save_speakers()
383
-
384
- return embedding
385
-
386
- except Exception as e:
387
- print(f"❌ Error extracting embeddings: {e}")
388
- return None
389
-
390
- def create_cloned_voice_model(self, speaker_name: str, base_speaker: str = "vi-male") -> bool:
391
- """
392
- Tạo model cloned voice bằng cách fine-tuning hoặc adapter
393
- Simplified version - trong thực tế cần huấn luyện thật
394
- """
395
- try:
396
- speaker_dir = self.clone_dir / speaker_name
397
-
398
- # Tạo checkpoint symbolic link hoặc copy
399
- cloned_model_path = speaker_dir / "model.pth"
400
-
401
- # Trong demo, tạo một file config mô phỏng
402
- clone_config = {
403
- "speaker_name": speaker_name,
404
- "base_speaker": base_speaker,
405
- "created_at": time.time(),
406
- "embedding_dim": 256,
407
- "status": "ready"
408
- }
409
-
410
- with open(speaker_dir / "config.json", "w") as f:
411
- json.dump(clone_config, f, indent=2)
412
-
413
- # Tạo file metadata
414
- metadata = {
415
- "speaker_name": speaker_name,
416
- "display_name": speaker_name.replace("_", " ").title(),
417
- "type": "cloned",
418
- "quality": "good" if len(list(speaker_dir.glob("sample_*.wav"))) >= 3 else "fair"
419
- }
420
-
421
- with open(speaker_dir / "metadata.json", "w") as f:
422
- json.dump(metadata, f, indent=2, ensure_ascii=False)
423
-
424
- return True
425
-
426
- except Exception as e:
427
- print(f"❌ Error creating cloned model: {e}")
428
- return False
429
-
430
- def get_available_cloned_voices(self) -> List[Dict]:
431
- """Lấy danh sách voices đã clone"""
432
- voices = []
433
- for speaker_dir in self.clone_dir.iterdir():
434
- if speaker_dir.is_dir():
435
- metadata_file = speaker_dir / "metadata.json"
436
- if metadata_file.exists():
437
- with open(metadata_file, "r") as f:
438
- metadata = json.load(f)
439
- voices.append(metadata)
440
- return voices
441
-
442
- def validate_audio_files(self, audio_files: List[str], min_duration: float = 2.0, max_duration: float = 30.0) -> Tuple[bool, str]:
443
- """Validate audio files cho voice cloning"""
444
- if len(audio_files) < 1:
445
- return False, "Cần ít nhất 1 file audio"
446
- if len(audio_files) > 10:
447
- return False, "Tối đa 10 file audio"
448
-
449
- total_duration = 0
450
- for audio_file in audio_files:
451
- if not os.path.exists(audio_file):
452
- return False, f"File không tồn tại: {audio_file}"
453
-
454
- try:
455
- with sf.SoundFile(audio_file) as f:
456
- duration = f.frames / f.samplerate
457
- total_duration += duration
458
-
459
- if duration < min_duration:
460
- return False, f"File quá ngắn (<{min_duration}s): {os.path.basename(audio_file)}"
461
- if duration > max_duration:
462
- return False, f"File quá dài (>{max_duration}s): {os.path.basename(audio_file)}"
463
-
464
- if f.channels != 1:
465
- return False, f"Chỉ hỗ trợ mono audio: {os.path.basename(audio_file)}"
466
-
467
- except Exception as e:
468
- return False, f"Lỗi đọc file {audio_file}: {str(e)}"
469
-
470
- if total_duration < 10.0:
471
- return False, f"Tổng thời lượng audio quá ngắn ({total_duration:.1f}s < 10s)"
472
-
473
- return True, f"✅ Đã xác thực {len(audio_files)} file, tổng {total_duration:.1f}s"
474
-
475
  # =========================================================
476
- # 4) CORE ENGINE WRAPPER (UPDATED)
477
  # =========================================================
478
-
479
  class TTSManager:
480
- """Singleton-like manager for TTS operations với voice cloning support."""
481
-
482
  def __init__(self):
483
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
484
  print(f"🔧 Initializing TTS on {self.device}...")
485
 
 
 
 
 
486
  self.model_dir = self._get_model_dir()
487
  self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
488
  self.cfg_path = os.path.join(self.model_dir, "config.json")
@@ -491,70 +174,35 @@ class TTSManager:
491
  raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
492
 
493
  self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
494
-
495
- # Khởi tạo Voice Cloning Manager
496
- self.clone_manager = VoiceCloningManager(self.ckpt_path, self.cfg_path, self.device)
497
-
498
  self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
499
  self.temp_dir.mkdir(parents=True, exist_ok=True)
500
-
501
- # Combine speakers
502
- self.all_speakers = self.get_all_speakers()
503
 
504
  def _get_model_dir(self):
505
  return download_model()
506
-
507
- def get_all_speakers(self) -> List[str]:
508
- """Lấy tất cả speakers (base + cloned)"""
509
- base_speakers = self.tts.speakers
510
- cloned_voices = self.clone_manager.get_available_cloned_voices()
511
- cloned_speakers = [voice["speaker_name"] for voice in cloned_voices]
512
-
513
- # Thêm tag cloned vào tên speakers
514
- cloned_speakers_with_tag = [f"[CLONE] {spk}" for spk in cloned_speakers]
515
-
516
- return base_speakers + cloned_speakers_with_tag
517
 
518
  def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
519
  try:
520
  if not text or not text.strip():
521
  return None, "⚠️ Empty input"
522
-
523
- # Xử lý cloned speaker
524
- is_cloned = speaker.startswith("[CLONE] ")
525
- actual_speaker = speaker.replace("[CLONE] ", "") if is_cloned else speaker
526
-
527
  key = md5_key(speaker, f"{speed:.2f}", text[:20], str(len(text)))
528
  out_path = self.temp_dir / f"{key}.wav"
529
 
530
  if out_path.exists():
531
  return str(out_path), "✅ Cached (From history)"
532
-
533
- # Xử lý cloned voice (simplified - trong thực tế cần load model riêng)
534
- if is_cloned:
535
- # Trong demo, sử dụng base speaker nhưng thêm thông báo
536
- audio, sr = self.tts.synthesize(
537
- text=text, speaker="vi-male", length_scale=speed,
538
- noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
539
- )
540
- sf.write(str(out_path), audio, sr)
541
- return str(out_path), f"✅ Generated with cloned voice: {actual_speaker}"
542
- else:
543
- # Base speaker bình thường
544
- audio, sr = self.tts.synthesize(
545
- text=text, speaker=speaker, length_scale=speed,
546
- noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
547
- )
548
- sf.write(str(out_path), audio, sr)
549
- return str(out_path), "✅ Generated successfully"
550
 
 
 
 
 
 
 
551
  except Exception as e:
552
  return None, f"❌ Error: {str(e)}"
553
 
554
  # =========================================================
555
- # 5) MODEL LOGIC (PRESERVED & FIXED)
556
  # =========================================================
557
-
558
  def find_latest_checkpoint(model_dir, prefix="G"):
559
  pattern = os.path.join(model_dir, f"{prefix}*.pth")
560
  checkpoints = glob.glob(pattern)
@@ -597,6 +245,9 @@ class VietnameseTTS:
597
  self.model.eval()
598
 
599
  def synthesize(self, text, speaker, **kwargs):
 
 
 
600
  norm_text = process_vietnamese_text(text)
601
  phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
602
  phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
@@ -605,413 +256,92 @@ class VietnameseTTS:
605
  tone_ids = commons.intersperse(tone_ids, 0)
606
  lang_ids = commons.intersperse(lang_ids, 0)
607
 
608
- # 2. Prepare Tensors
609
  x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
610
  x_len = torch.LongTensor([len(phone_ids)]).to(self.device)
611
  tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
612
  lang = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
613
  sid = torch.LongTensor([self.spk2id.get(speaker, 0)]).to(self.device)
614
 
615
- # 3. Inference with Gradient Safety
616
  with torch.no_grad():
617
  bert = torch.zeros(1024, len(phone_ids)).unsqueeze(0).to(self.device)
618
  ja_bert = torch.zeros(768, len(phone_ids)).unsqueeze(0).to(self.device)
619
-
620
- outputs = self.model.infer(
621
- x, x_len, sid, tone, lang,
622
- bert, ja_bert,
623
- **kwargs
624
- )
625
-
626
  audio = outputs[0][0,0].detach().cpu().numpy()
627
 
628
  return audio, self.config["data"]["sampling_rate"]
629
 
630
  # =========================================================
631
- # 6) UI CONSTRUCTION (REFACTORED WITH VOICE CLONING)
632
  # =========================================================
633
-
634
  def create_ui(manager: TTSManager):
635
-
636
  def ui_header():
637
  return gr.HTML("""
638
  <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
639
- <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem; letter-spacing: -0.02em;">
640
- 🎛️ CVNSS4.0 Vietnamese TTS Studio với Voice Cloning
641
- </h1>
642
- <div style="color: #94a3b8; font-size: 1rem; margin-top: 5px; font-weight: 400;">
643
- Thiết kế bởi Long Ngo, 2026 • Phiên bản 2.0 với Voice Cloning • Dự án mã nguồn mở
644
- </div>
645
  </div>
646
  """)
647
-
648
- def ui_status_render(text, speaker, speed, chunks, dur, msg, is_cloned=False):
649
- cloned_badge = " 🎭" if is_cloned else ""
650
  return f"""
651
  <div class="statusCard">
652
- <div style="margin-bottom:12px; font-weight:700; color:#38bdf8; font-size: 0.9rem; text-transform: uppercase;">
653
- 📟 Trạng thái hoạt động
654
- </div>
655
  <div style="display:flex; flex-wrap:wrap; gap:8px;">
656
- <span class="pill {'clone-pill' if is_cloned else ''}">🎤 {speaker}{cloned_badge}</span>
657
  <span class="pill">⚡ {speed}x</span>
658
- <span class="pill">📄 {len(text)} ký tự</span>
659
- <span class="pill">🧩 {chunks} đoạn</span>
660
- </div>
661
- <div class="{'alertCloneSuccess' if '✅' in msg and is_cloned else 'alertOk' if '✅' in msg else 'alertWarn'}">
662
- {msg}
663
- </div>
664
- </div>
665
- """
666
-
667
- def ui_clone_status_render(stage, progress, message, error=None):
668
- progress_html = ""
669
- if progress > 0:
670
- progress_html = f"""
671
- <div class="progress-bar">
672
- <div class="progress-fill" style="width: {progress}%"></div>
673
  </div>
674
- <div style="text-align: center; font-size: 0.8rem; color: #94a3b8;">
675
- {progress}%
676
- </div>
677
- """
678
-
679
- error_html = ""
680
- if error:
681
- error_html = f"""
682
- <div class="alert alertWarn" style="margin-top: 10px;">
683
- ⚠️ {error}
684
- </div>
685
- """
686
-
687
- return f"""
688
- <div class="statusCard">
689
- <div style="margin-bottom:12px; font-weight:700; color:#8b5cf6; font-size: 0.9rem; text-transform: uppercase;">
690
- 🎭 Voice Cloning Progress
691
- </div>
692
- <div style="margin-bottom:10px;">
693
- <span class="pill clone-pill">📊 {stage}</span>
694
- </div>
695
- {progress_html}
696
- <div class="alert {'alertCloneSuccess' if '✅' in message else 'alertClone' if not error else 'alertWarn'}" style="margin-top: 15px;">
697
- {message}
698
- </div>
699
- {error_html}
700
  </div>
701
  """
702
 
703
- def process_basic(text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
704
- if not text.strip():
705
- return None, ui_status_render("", speaker, speed, 0, 0, "⚠️ Vui lòng nhập văn bản", False)
706
-
707
- chunks = split_sentences_vi(text, 200)
708
- audio_path, msg = manager.synthesize(text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio)
709
-
710
- dur = 0
711
- if audio_path and os.path.exists(audio_path):
712
- with sf.SoundFile(audio_path) as f:
713
- dur = f.frames / f.samplerate
714
-
715
- is_cloned = speaker.startswith("[CLONE] ")
716
- return audio_path, ui_status_render(text, speaker, speed, len(chunks), dur, msg, is_cloned)
717
-
718
- def process_clone_voice(speaker_name, audio_files, base_speaker, progress=gr.Progress()):
719
- """Xử lý voice cloning"""
720
- try:
721
- progress(0, desc="📁 Đang xác thực files...")
722
-
723
- # Kiểm tra tên speaker
724
- if not speaker_name or not speaker_name.strip():
725
- return ui_clone_status_render("Lỗi", 0, "❌ Vui lòng nhập tên giọng nói", "Tên speaker không hợp lệ")
726
-
727
- speaker_name = speaker_name.strip().replace(" ", "_").lower()
728
-
729
- # Kiểm tra files
730
- if not audio_files:
731
- return ui_clone_status_render("Lỗi", 0, "❌ Không có file audio", "Vui lòng upload ít nhất 1 file audio")
732
-
733
- # Validate audio files
734
- is_valid, validation_msg = manager.clone_manager.validate_audio_files(audio_files)
735
- if not is_valid:
736
- return ui_clone_status_render("Lỗi", 0, "❌ Validation failed", validation_msg)
737
-
738
- progress(0.2, desc="🎵 Đang trích xuất embedding...")
739
-
740
- # Trích xuất embeddings
741
- embedding = manager.clone_manager.extract_voice_embeddings(audio_files, speaker_name)
742
- if embedding is None:
743
- return ui_clone_status_render("Lỗi", 30, "❌ Không thể trích xuất embedding", "Lỗi trong quá trình xử lý audio")
744
-
745
- progress(0.5, desc="🤖 Đang tạo model cloned voice...")
746
-
747
- # Tạo cloned voice model
748
- success = manager.clone_manager.create_cloned_voice_model(speaker_name, base_speaker)
749
- if not success:
750
- return ui_clone_status_render("Lỗi", 70, "❌ Không thể tạo cloned voice", "Lỗi trong quá trình tạo model")
751
-
752
- progress(0.8, desc="💾 Đang cập nhật hệ thống...")
753
-
754
- # Cập nhật speakers list
755
- manager.all_speakers = manager.get_all_speakers()
756
-
757
- progress(1.0, desc="✅ Hoàn thành!")
758
-
759
- return ui_clone_status_render(
760
- "Hoàn thành",
761
- 100,
762
- f"✅ Đã tạo cloned voice: {speaker_name} từ {len(audio_files)} file audio. Bạn có thể chọn speaker '[CLONE] {speaker_name}' trong tab TTS."
763
- )
764
-
765
- except Exception as e:
766
- return ui_clone_status_render("Lỗi", 0, f"❌ Lỗi: {str(e)}", str(e))
767
-
768
- def update_speaker_dropdown():
769
- """Cập nhật dropdown speakers với cloned voices"""
770
- return gr.Dropdown.update(choices=manager.get_all_speakers())
771
-
772
- def list_cloned_voices():
773
- """Hiển thị danh sách cloned voices"""
774
- voices = manager.clone_manager.get_available_cloned_voices()
775
- if not voices:
776
- return gr.HTML.update(value="<div class='alert alertWarn'>Chưa có cloned voices nào. Hãy tạo voice mới trong tab '🎭 Clone Voice'.</div>")
777
-
778
- html = "<div style='display: grid; gap: 10px;'>"
779
- for voice in voices:
780
- html += f"""
781
- <div class="statusCard" style="padding: 15px;">
782
- <div style="display: flex; justify-content: space-between; align-items: center;">
783
- <div>
784
- <strong style="color: #8b5cf6;">{voice.get('display_name', voice['speaker_name'])}</strong>
785
- <div style="font-size: 0.8rem; color: #94a3b8;">
786
- Type: {voice.get('type', 'cloned')} • Quality: {voice.get('quality', 'unknown')}
787
- </div>
788
- </div>
789
- <span class="pill clone-pill">🎭 Cloned</span>
790
- </div>
791
- </div>
792
- """
793
- html += "</div>"
794
- return gr.HTML.update(value=html)
795
-
796
- with gr.Blocks(theme=gr.themes.Base(), css=NEON_CSS, title="CVNSS TTS với Voice Cloning") as app:
797
  ui_header()
798
-
799
  with gr.Tabs():
800
- # --- TAB BASIC ---
801
  with gr.Tab("⚡ Chế độ Nhanh"):
802
  with gr.Row():
803
  with gr.Column(scale=2):
804
- with gr.Group(elem_classes=["panelNeon"], elem_id="input-panel-basic"):
805
  gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
806
-
807
- txt_basic = gr.Textbox(
808
- label="",
809
- show_label=False,
810
- placeholder="Nhập nội dung tiếng Việt vào... (Ví dụ: Xin chào, bạn đã học qua CVNSS4.0 chưa?)",
811
- lines=6,
812
- elem_id="main-input-basic"
813
- )
814
 
815
  with gr.Row():
 
816
  spk_basic = gr.Dropdown(
817
- choices=manager.get_all_speakers(),
818
- value=manager.tts.speakers[0] if manager.tts.speakers else "",
819
- label="",
820
- elem_id="spk-basic"
 
821
  )
822
- speed_basic = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ", elem_id="speed-basic")
823
-
824
- with gr.Row():
825
- noise_scale_basic = gr.Slider(0.1, 1.0, value=0.5, step=0.05, label="Nhiễu (noise scale)", elem_id="noise-basic")
826
- noise_scale_w_basic = gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Nhiễu W (noise scale w)", elem_id="noise-w-basic")
827
- sdp_ratio_basic = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="SDP Ratio", elem_id="sdp-basic")
828
 
829
- btn_basic = gr.Button("🔊 Tổng hợp giọng nói", variant="primary", elem_id="btn-basic")
830
-
831
- status_basic = gr.HTML(
832
- ui_status_render("", manager.tts.speakers[0] if manager.tts.speakers else "", 1.0, 0, 0, "Chờ...", False),
833
- elem_id="status-basic"
834
- )
835
-
836
- with gr.Column(scale=1):
837
- audio_basic = gr.Audio(label="Âm thanh kết quả", type="filepath", elem_id="audio-basic")
838
-
839
- # Events
840
- btn_basic.click(
841
- fn=process_basic,
842
- inputs=[txt_basic, spk_basic, speed_basic, noise_scale_basic, noise_scale_w_basic, sdp_ratio_basic],
843
- outputs=[audio_basic, status_basic]
844
- )
845
-
846
- # --- TAB VOICE CLONING ---
847
- with gr.Tab("🎭 Clone Voice"):
848
- with gr.Row():
849
- with gr.Column(scale=2):
850
- with gr.Group(elem_classes=["panelNeon", "clonePanel"], elem_id="clone-panel"):
851
- gr.HTML('<div class="panelTitle" style="color: #8b5cf6;">🎭 Tạo Giọng Nói Cá Nhân</div>')
852
-
853
- with gr.Row():
854
- with gr.Column(scale=1):
855
- speaker_name = gr.Textbox(
856
- label="Tên giọng nói",
857
- placeholder="vd: john_doe, my_voice, ...",
858
- info="Tên không dấu, không ký tự đặc biệt"
859
- )
860
-
861
- base_speaker = gr.Dropdown(
862
- choices=manager.tts.speakers,
863
- value=manager.tts.speakers[0] if manager.tts.speakers else "",
864
- label="Giọng nói cơ sở",
865
- info="Chọn giọng gốc để fine-tune"
866
- )
867
-
868
- with gr.Column(scale=2):
869
- audio_files = gr.File(
870
- label="Upload audio samples",
871
- file_types=["audio"],
872
- file_count="multiple",
873
- type="filepath",
874
- elem_id="clone-audio-upload"
875
- )
876
-
877
- gr.HTML("""
878
- <div class="alert alertClone">
879
- 💡 <strong>Hướng dẫn:</strong><br/>
880
- • Upload 3-10 file audio chất lượng tốt (định dạng WAV, MP3)<br/>
881
- • Mỗi file dài 5-30 giây, giọng nói rõ ràng<br/>
882
- • Tổng thời lượng ≥ 10 giây để có chất lượng tốt nhất<br/>
883
- • File mono, sample rate 16kHz-44.1kHz
884
- </div>
885
- """)
886
-
887
- btn_clone = gr.Button(
888
- "🎭 Bắt đầu Clone Voice",
889
- variant="primary",
890
- elem_classes=["clone-btn"],
891
- elem_id="btn-clone-process"
892
- )
893
-
894
- clone_status = gr.HTML(
895
- ui_clone_status_render("Chờ...", 0, "Sẵn sàng tạo cloned voice"),
896
- elem_id="clone-status"
897
- )
898
-
899
  with gr.Column(scale=1):
900
- with gr.Group(elem_classes=["panelNeon"], elem_id="clone-info-panel"):
901
- gr.HTML('<div class="panelTitle">📋 Cloned Voices</div>')
902
-
903
- btn_refresh = gr.Button("🔄 Làm mới danh sách", size="sm")
904
- cloned_list = gr.HTML(elem_id="cloned-voices-list")
905
-
906
- # Voice Cloning Events
907
- btn_clone.click(
908
- fn=process_clone_voice,
909
- inputs=[speaker_name, audio_files, base_speaker],
910
- outputs=[clone_status]
911
- ).then(
912
- fn=update_speaker_dropdown,
913
- outputs=[spk_basic]
914
- ).then(
915
- fn=list_cloned_voices,
916
- outputs=[cloned_list]
917
- )
918
-
919
- btn_refresh.click(
920
- fn=list_cloned_voices,
921
- outputs=[cloned_list]
922
- )
923
-
924
- # Initial load
925
- app.load(
926
- fn=list_cloned_voices,
927
- outputs=[cloned_list]
928
- )
929
-
930
- # --- TAB ADVANCED SETTINGS ---
931
- with gr.Tab("⚙️ Cài Đặt Nâng Cao"):
932
- with gr.Group(elem_classes=["panelNeon"]):
933
- gr.HTML('<div class="panelTitle">⚙️ Cấu hình hệ thống</div>')
934
-
935
- with gr.Row():
936
- with gr.Column():
937
- gr.Markdown("### Voice Cloning Settings")
938
- min_duration = gr.Slider(1.0, 10.0, value=2.0, step=0.5, label="Độ dài tối thiểu mỗi file (s)")
939
- max_duration = gr.Slider(10.0, 60.0, value=30.0, step=5.0, label="Độ dài tối đa mỗi file (s)")
940
- min_total_duration = gr.Slider(5.0, 60.0, value=10.0, step=5.0, label="Tổng độ dài tối thiểu (s)")
941
-
942
- with gr.Column():
943
- gr.Markdown("### Cache Management")
944
- btn_clear_cache = gr.Button("🗑️ Xóa cache", variant="secondary")
945
- cache_info = gr.HTML("", elem_id="cache-info")
946
-
947
- def clear_cache():
948
- cache_dir = manager.temp_dir
949
- if cache_dir.exists():
950
- count = len(list(cache_dir.glob("*.wav")))
951
- shutil.rmtree(cache_dir)
952
- manager.temp_dir.mkdir(parents=True, exist_ok=True)
953
- return f"<div class='alert alertOk'>✅ Đã xóa {count} file cache</div>"
954
- return "<div class='alert alertWarn'>⚠️ Không có cache để xóa</div>"
955
-
956
- btn_clear_cache.click(
957
- fn=clear_cache,
958
- outputs=[cache_info]
959
- )
960
-
961
- # Global events
962
- app.load(
963
- fn=update_speaker_dropdown,
964
- outputs=[spk_basic]
965
- )
966
-
967
  return app
968
 
969
  # =========================================================
970
- # 7) MAIN ENTRY POINT
971
  # =========================================================
972
-
973
- def main():
974
- print("🚀 Khởi động CVNSS4.0 TTS với Voice Cloning...")
975
-
976
  try:
977
- # Khởi tạo manager
978
  manager = TTSManager()
979
-
980
- # Tạo UI
981
  app = create_ui(manager)
982
-
983
- # Khởi chạy
984
- print("✅ Hệ thống đã sẵn sàng!")
985
- print(f"📊 Tổng số speakers: {len(manager.all_speakers)}")
986
- print(f"🎭 Cloned voices: {len([s for s in manager.all_speakers if s.startswith('[CLONE]')])}")
987
- print("🌐 Server đang chạy tại: http://localhost:7860")
988
-
989
- return app
990
-
991
  except Exception as e:
992
- print(f" Lỗi khởi động: {e}")
993
- import traceback
994
- traceback.print_exc()
995
-
996
- # Fallback UI nếu có lỗi
997
- with gr.Blocks(css=NEON_CSS, title="CVNSS TTS - Error") as app:
998
- gr.HTML(f"""
999
- <div style="padding: 40px; text-align: center;">
1000
- <h1 style="color: #ef4444;">❌ Lỗi khởi động hệ thống</h1>
1001
- <div style="background: rgba(239, 68, 68, 0.1); padding: 20px; border-radius: 10px; margin: 20px 0;">
1002
- <code>{str(e)}</code>
1003
- </div>
1004
- <p>Vui lòng kiểm tra log để biết thêm chi tiết.</p>
1005
- </div>
1006
- """)
1007
- return app
1008
-
1009
- if __name__ == "__main__":
1010
- app = main()
1011
- app.launch(
1012
- server_name="0.0.0.0",
1013
- server_port=7860,
1014
- share=False,
1015
- debug=True,
1016
- show_error=True
1017
- )
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
+
4
  """
5
+ CVNSS4.0 Vietnamese TTS Studio (Fixed & Auto-Healing Version)
6
+ - Fix: SyntaxError Dropdown
7
+ - Fix: NameError SynthesizerTrn (Auto download src)
 
8
  """
9
 
10
  import os
 
15
  import re
16
  import hashlib
17
  import tempfile
18
+ import subprocess
19
  import shutil
20
  from pathlib import Path
 
21
 
22
  import torch
23
  import numpy as np
24
  import soundfile as sf
25
  import gradio as gr
 
26
 
27
+ # =========================================================
28
+ # 0) AUTO-HEALING: DOWNLOAD MISSING CORE MODULES
29
+ # =========================================================
30
def setup_environment(base_dir=None):
    """Make sure the ``src`` package directory exists, cloning it if missing.

    The directory is looked up next to this file (or under ``base_dir`` when
    given) rather than in the current working directory, because the imports
    that follow resolve ``src`` from this file's folder.

    When ``src`` is absent, the upstream Hugging Face Space is shallow-cloned
    into a private scratch directory and its ``src`` folder is moved into
    place. The scratch directory is always removed, so a failed attempt can
    never block the next one.

    Args:
        base_dir: Optional directory that should contain ``src``. Defaults to
            the directory holding this file.

    Returns:
        True if ``src`` exists when the function returns, False otherwise.
        Failures are reported on stdout and never raised.
    """
    root = Path(base_dir) if base_dir is not None else Path(__file__).resolve().parent
    src_dir = root / "src"
    if src_dir.exists():
        return True

    print("🔄 Phát hiện thiếu thư mục 'src'. Đang tải mã nguồn cốt lõi (Core Modules)...")
    repo_url = "https://huggingface.co/spaces/valtecAI-team/valtec-vietnamese-tts"
    scratch = None
    try:
        # A fresh, uniquely named scratch dir avoids "destination path already
        # exists" errors left behind by an earlier interrupted run.
        scratch = tempfile.mkdtemp(prefix="cvnss_src_")
        clone_dir = os.path.join(scratch, "temp_repo")

        # Only the latest snapshot is needed, so skip the history.
        subprocess.run(
            ["git", "clone", "--depth", "1", repo_url, clone_dir],
            check=True,
        )

        cloned_src = os.path.join(clone_dir, "src")
        if os.path.exists(cloned_src):
            shutil.move(cloned_src, str(src_dir))
            print("✅ Đã cài đặt xong 'src'.")
        else:
            print("❌ Không tìm thấy 'src' trong repo đã tải.")
    except (subprocess.SubprocessError, OSError) as e:
        # Covers a failing clone, a missing git binary and filesystem errors.
        print(f"❌ Lỗi khi tải mã nguồn: {e}")
        print("⚠️ Vui lòng kiểm tra kết nối mạng hoặc cài đặt git.")
    finally:
        if scratch is not None:
            shutil.rmtree(scratch, ignore_errors=True)

    return src_dir.exists()
53
+
54
+ # Chạy setup trước khi import
55
+ setup_environment()
56
+
57
+ # Add src to path
58
  sys.path.insert(0, str(Path(__file__).parent))
59
 
60
+ # Import core modules (Bây giờ sẽ không bị lỗi nữa)
61
  try:
62
  from src.vietnamese.text_processor import process_vietnamese_text
63
  from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
64
  from src.models.synthesizer import SynthesizerTrn
65
  from src.text.symbols import symbols
66
+ print("✅ Core modules imported successfully.")
 
67
  except ImportError as e:
68
+ print(f"🔥 Critical Import Error: {e}")
69
+ print("⚠️ Cấu trúc file vẫn chưa đúng. Hãy đảm bảo thư mục 'src' nằm cùng cấp với app.py")
70
+ # Define dummy classes to prevent immediate crash, allow UI to show error
71
  VIPHONEME_AVAILABLE = False
72
  symbols = []
73
+ SynthesizerTrn = None
74
+
75
 
76
  # =========================================================
77
+ # 1) SYSTEM CONFIGURATION & CSS
78
  # =========================================================
 
 
79
  NEON_CSS = r"""
80
  :root {
81
  --bg-dark: #0f172a;
 
84
  --text-primary: #e2e8f0;
85
  --neon-cyan: #06b6d4;
86
  --neon-accent: #38bdf8;
 
 
87
  --radius-lg: 16px;
88
  --radius-sm: 8px;
 
 
89
  --input-bg: #f1f5f9;
90
  --input-text: #0f4c81;
91
  --input-placeholder: #64748b;
 
 
 
 
 
92
  }
 
93
  body, .gradio-container, .app {
94
  background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important;
95
  color: var(--text-primary) !important;
96
  font-family: 'Inter', 'Segoe UI', sans-serif;
97
  }
 
 
98
  .panelNeon {
99
  border: 1px solid rgba(255,255,255,0.08);
100
  border-radius: var(--radius-lg);
 
102
  backdrop-filter: blur(12px);
103
  box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
104
  padding: 20px;
 
 
 
105
  margin-bottom: 20px;
106
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  .panelNeon textarea, .panelNeon input[type="text"] {
108
  background: var(--input-bg) !important;
109
  color: var(--input-text) !important;
 
111
  border-radius: var(--radius-sm) !important;
112
  font-weight: 500 !important;
113
  font-size: 1rem !important;
 
114
  padding: 12px !important;
 
 
 
 
 
 
 
115
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  button.primary, .gr-button-primary {
117
  background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important;
118
  border: none !important;
119
  color: white !important;
120
  font-weight: 700 !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  }
 
 
122
  .statusCard {
123
  background: rgba(15, 23, 42, 0.6);
124
  border-radius: var(--radius-sm);
125
  padding: 16px;
126
  border: 1px solid rgba(255,255,255,0.05);
127
  }
 
128
  .pill {
129
  display: inline-flex;
130
  align-items: center;
 
136
  font-size: 0.8rem;
137
  font-weight: 600;
138
  margin-right: 6px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  }
140
+ .alert { padding: 12px; border-radius: 8px; margin-top: 12px; font-size: 0.9rem; }
141
+ .alertOk { background: rgba(34, 197, 94, 0.1); color: #4ade80; border: 1px solid rgba(34, 197, 94, 0.2); }
142
+ .alertWarn { background: rgba(234, 179, 8, 0.1); color: #facc15; border: 1px solid rgba(234, 179, 8, 0.2); }
143
  """
144
 
145
  # =========================================================
146
  # 2) UTILITIES & HELPERS
147
  # =========================================================
 
148
def check_viphoneme():
    """Report whether the viphoneme backend is usable.

    Returns:
        True when the module-level ``VIPHONEME_AVAILABLE`` flag is truthy;
        otherwise prints a warning and returns False.
    """
    if VIPHONEME_AVAILABLE:
        return True
    print("⚠️ Viphoneme not available.")
    return False
 
 
 
 
 
 
153
 
154
def md5_key(*parts: str) -> str:
    """Build a hex MD5 digest from the given string parts.

    The parts are joined with ``"|"`` and UTF-8 encoded before hashing, so
    the digest depends on both the values and their order.
    """
    joined = "|".join(parts)
    hasher = hashlib.md5()
    hasher.update(joined.encode("utf-8"))
    return hasher.hexdigest()
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  # =========================================================
158
+ # 3) CORE ENGINE WRAPPER
159
  # =========================================================
 
160
  class TTSManager:
 
 
161
  def __init__(self):
162
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
163
  print(f"🔧 Initializing TTS on {self.device}...")
164
 
165
+ # Check dependency again
166
+ if SynthesizerTrn is None:
167
+ raise ImportError("Class SynthesizerTrn chưa được định nghĩa. Kiểm tra lại thư mục src.")
168
+
169
  self.model_dir = self._get_model_dir()
170
  self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
171
  self.cfg_path = os.path.join(self.model_dir, "config.json")
 
174
  raise FileNotFoundError(f"No checkpoint found in {self.model_dir}")
175
 
176
  self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
 
 
 
 
177
  self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
178
  self.temp_dir.mkdir(parents=True, exist_ok=True)
 
 
 
179
 
180
    def _get_model_dir(self):
        """Return the model directory as reported by ``download_model()``.

        NOTE(review): ``download_model`` is defined outside this view;
        presumably it fetches or locates the checkpoint folder and returns
        its path - confirm against its definition.
        """
        return download_model()
 
 
 
 
 
 
 
 
 
 
 
182
 
183
  def synthesize(self, text, speaker, speed, noise_scale, noise_scale_w, sdp_ratio):
184
  try:
185
  if not text or not text.strip():
186
  return None, "⚠️ Empty input"
187
+
 
 
 
 
188
  key = md5_key(speaker, f"{speed:.2f}", text[:20], str(len(text)))
189
  out_path = self.temp_dir / f"{key}.wav"
190
 
191
  if out_path.exists():
192
  return str(out_path), "✅ Cached (From history)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ audio, sr = self.tts.synthesize(
195
+ text=text, speaker=speaker, length_scale=speed,
196
+ noise_scale=noise_scale, noise_scale_w=noise_scale_w, sdp_ratio=sdp_ratio
197
+ )
198
+ sf.write(str(out_path), audio, sr)
199
+ return str(out_path), "✅ Generated successfully"
200
  except Exception as e:
201
  return None, f"❌ Error: {str(e)}"
202
 
203
  # =========================================================
204
+ # 4) MODEL LOGIC
205
  # =========================================================
 
206
  def find_latest_checkpoint(model_dir, prefix="G"):
207
  pattern = os.path.join(model_dir, f"{prefix}*.pth")
208
  checkpoints = glob.glob(pattern)
 
245
  self.model.eval()
246
 
247
  def synthesize(self, text, speaker, **kwargs):
248
+ from src.text import cleaned_text_to_sequence
249
+ from src.nn import commons
250
+
251
  norm_text = process_vietnamese_text(text)
252
  phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
253
  phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
 
256
  tone_ids = commons.intersperse(tone_ids, 0)
257
  lang_ids = commons.intersperse(lang_ids, 0)
258
 
 
259
  x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
260
  x_len = torch.LongTensor([len(phone_ids)]).to(self.device)
261
  tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
262
  lang = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
263
  sid = torch.LongTensor([self.spk2id.get(speaker, 0)]).to(self.device)
264
 
 
265
  with torch.no_grad():
266
  bert = torch.zeros(1024, len(phone_ids)).unsqueeze(0).to(self.device)
267
  ja_bert = torch.zeros(768, len(phone_ids)).unsqueeze(0).to(self.device)
268
+ outputs = self.model.infer(x, x_len, sid, tone, lang, bert, ja_bert, **kwargs)
 
 
 
 
 
 
269
  audio = outputs[0][0,0].detach().cpu().numpy()
270
 
271
  return audio, self.config["data"]["sampling_rate"]
272
 
273
  # =========================================================
274
+ # 5) UI CONSTRUCTION
275
  # =========================================================
 
276
def create_ui(manager: TTSManager):
    """Build the Gradio Blocks application for quick text-to-speech.

    Args:
        manager: Initialized TTS manager. Its ``tts.speakers`` list fills the
            speaker dropdown and its ``synthesize`` method produces audio.

    Returns:
        The constructed ``gr.Blocks`` app, not yet launched.
    """
    import html  # local import: only needed for escaping status-card text

    def ui_header():
        """Return the static page header."""
        return gr.HTML("""
        <div style="border-bottom: 1px solid rgba(255,255,255,0.08); padding-bottom: 20px; margin-bottom: 25px;">
            <h1 style="color: #38bdf8; margin:0; font-weight:800; font-size: 2rem;">🎛️ CVNSS4.0 TTS Studio</h1>
            <div style="color: #94a3b8; font-size: 1rem;">Fix: Auto-Healing Source • Expert Mode</div>
        </div>
        """)

    def ui_status_render(text, speaker, speed, dur, msg):
        """Render the status card shown under the audio player.

        ``text`` is accepted for call-site symmetry but is not displayed.
        ``speaker`` and ``msg`` are HTML-escaped because ``msg`` may carry an
        arbitrary exception message.
        """
        safe_speaker = html.escape(str(speaker))
        safe_msg = html.escape(str(msg))
        alert_class = 'alertOk' if '✅' in msg else 'alertWarn'
        return f"""
        <div class="statusCard">
            <div style="display:flex; flex-wrap:wrap; gap:8px;">
                <span class="pill">🎤 {safe_speaker}</span>
                <span class="pill">⚡ {speed}x</span>
                <span class="pill">⏱️ {dur:.2f}s</span>
            </div>
            <div class="alert {alert_class}">{safe_msg}</div>
        </div>
        """

    def run_inference(text, speaker, speed):
        """Synthesize with fixed noise/SDP settings and time the call."""
        start_t = time.time()
        audio_path, msg = manager.synthesize(text, speaker, speed, 0.667, 0.8, 0.2)
        duration = time.time() - start_t
        html_status = ui_status_render(text, speaker, speed, duration, msg)
        return audio_path, html_status

    speakers = manager.tts.speakers

    with gr.Blocks(css=NEON_CSS, title="Neon TTS Expert") as app:
        ui_header()

        with gr.Tabs():
            with gr.Tab("⚡ Chế độ Nhanh"):
                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Group(elem_classes=["panelNeon"]):
                            gr.HTML('<div class="panelTitle">📝 Văn bản đầu vào</div>')
                            txt_basic = gr.Textbox(
                                show_label=False,
                                lines=5,
                                placeholder="Nhập văn bản tiếng Việt...",
                                value="Xin chào, hệ thống đã tự động sửa lỗi thiếu file nguồn.",
                            )

                            with gr.Row():
                                spk_basic = gr.Dropdown(
                                    choices=speakers,
                                    value=speakers[0] if speakers else None,
                                    label="Người đọc",
                                    interactive=True,
                                    scale=2,
                                )
                                # ``step`` is keyword-only in gr.Slider, so
                                # all range arguments are passed by name.
                                speed_basic = gr.Slider(
                                    minimum=0.1,
                                    maximum=2.0,
                                    value=1.0,
                                    step=0.1,
                                    label="Tốc độ",
                                    scale=2,
                                )

                            btn_basic = gr.Button("🔊 Đọc Ngay", variant="primary")

                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["panelNeon"]):
                            gr.HTML('<div class="panelTitle">🎧 Kết quả</div>')
                            out_audio = gr.Audio(label="Audio Output", type="filepath")
                            out_status = gr.HTML()

                btn_basic.click(
                    run_inference,
                    [txt_basic, spk_basic, speed_basic],
                    [out_audio, out_status],
                )

    return app
335
 
336
  # =========================================================
337
+ # 6) MAIN EXECUTION
338
  # =========================================================
339
+ if __name__ == "__main__":
 
 
 
340
  try:
 
341
  manager = TTSManager()
342
+ check_viphoneme()
 
343
  app = create_ui(manager)
344
+ print("🚀 Launching App...")
345
+ app.launch()
 
 
 
 
 
 
 
346
  except Exception as e:
347
+ print(f"🔥 Critical Start Error: {e}")