|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
CVNSS4.0 Vietnamese TTS Studio |
|
|
- Fix: Python 3.12 Compatibility (Missing 'imp' module) |
|
|
- Fix: Auto-download 'src' |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import types |
|
|
import importlib |
|
|
import importlib.util |
|
|
import hashlib |
|
|
import tempfile |
|
|
import json |
|
|
import time |
|
|
import glob |
|
|
import re |
|
|
import shutil |
|
|
import subprocess |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Python 3.12 removed the deprecated ``imp`` module. When it cannot be
# imported, install a minimal stand-in in ``sys.modules`` so that any code
# executing ``import imp`` afterwards receives this shim.
try:
    import imp
except ImportError:
    import importlib.machinery  # only the shim needs PathFinder

    print("🔧 Đang vá lỗi tương thích Python 3.12+ (Mocking 'imp' module)...")
    imp = types.ModuleType('imp')
    imp.new_module = types.ModuleType
    imp.reload = importlib.reload

    def _find_module(name, path=None):
        """Locate module *name* and return an ``imp.find_module``-style tuple.

        Args:
            name: Module name to look up.
            path: Optional directory (or list of directories) to search.
                When omitted, the regular import system is consulted.

        Returns:
            ``(None, origin, ("", "", 0))`` where ``origin`` is the location
            reported by the module spec. No file object is opened.

        Raises:
            ImportError: If no spec can be found for *name*.
        """
        if path is None:
            spec = importlib.util.find_spec(name)
        else:
            # The second argument of importlib.util.find_spec is a *package*
            # anchor for relative names, not a search path, so an explicit
            # path has to go through PathFinder instead.
            if isinstance(path, (str, os.PathLike)):
                path = [os.fspath(path)]
            spec = importlib.machinery.PathFinder.find_spec(name, list(path))
        if spec is None:
            raise ImportError(f"No module named {name}")
        return (None, spec.origin, ("", "", 0))

    imp.find_module = _find_module

    def _load_module(name, file, pathname, description):
        """Load the source file *pathname* as module *name*.

        ``file`` and ``description`` are accepted only for signature
        compatibility with ``imp.load_module`` and are ignored.

        Returns:
            The executed module, also registered in ``sys.modules[name]``.

        Raises:
            ImportError: If no loader can be determined for *pathname*.
        """
        spec = importlib.util.spec_from_file_location(name, pathname)
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot load module {name!r} from {pathname!r}")
        module = importlib.util.module_from_spec(spec)

        # Register before executing so imports of ``name`` made while the
        # module body runs resolve to this object; roll back on failure.
        missing = object()
        previous = sys.modules.get(name, missing)
        sys.modules[name] = module
        try:
            spec.loader.exec_module(module)
        except BaseException:
            if previous is missing:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = previous
            raise
        return module

    imp.load_module = _load_module
    sys.modules['imp'] = imp
    print("✅ Đã vá xong module 'imp'.")
|
|
|
|
|
|
|
|
import torch |
|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setup_environment():
    """Download the ``src`` directory when it is missing.

    Does nothing if ``src`` already exists in the current working directory.
    Otherwise the upstream Hugging Face Space is cloned into the scratch
    directory ``temp_repo`` and its ``src`` folder is moved next to it.

    Failures are reported on stdout rather than raised, and ``temp_repo`` is
    removed on every exit path so that a failed attempt cannot block the
    next one.
    """
    if os.path.exists("src"):
        return

    print("🔄 Phát hiện thiếu thư mục 'src'. Đang tải mã nguồn cốt lõi...")

    # ``git clone`` refuses to write into a non-empty directory, so drop any
    # checkout left behind by an interrupted earlier run.
    shutil.rmtree("temp_repo", ignore_errors=True)
    try:
        subprocess.run(
            [
                "git", "clone", "--depth", "1",  # history is not needed
                "https://huggingface.co/spaces/valtecAI-team/valtec-vietnamese-tts",
                "temp_repo",
            ],
            check=True,
        )
        if os.path.exists("temp_repo/src"):
            shutil.move("temp_repo/src", "./src")
            print("✅ Đã cài đặt xong 'src'.")
        else:
            print("❌ Không tìm thấy 'src' trong repo đã tải.")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: git exited non-zero. OSError: git is not
        # installed, or moving the directory failed.
        print(f"❌ Lỗi khi tải mã nguồn: {e}")
    finally:
        shutil.rmtree("temp_repo", ignore_errors=True)
|
|
|
|
|
|
|
|
# Make sure the ``src`` directory is present before importing from it.
setup_environment()

# Put this script's directory first on the import path.
sys.path.insert(0, str(Path(__file__).parent))

try:
    from src.text.symbols import symbols
    from src.vietnamese.text_processor import process_vietnamese_text
    from src.vietnamese.phonemizer import text_to_phonemes, VIPHONEME_AVAILABLE
    from src.models.synthesizer import SynthesizerTrn
    print("✅ Core modules imported successfully.")
except ImportError as e:
    print(f"🔥 Vẫn còn lỗi Import: {e}")

    # Bind every name the try-block would have provided, so later code can
    # test for the degraded state instead of hitting a NameError.
    VIPHONEME_AVAILABLE = False
    symbols = []
    SynthesizerTrn = None
    process_vietnamese_text = None
    text_to_phonemes = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Custom stylesheet for the Gradio app: dark background, the ``panelNeon``
# card class, and the primary-button gradient.
NEON_CSS = r"""
:root { --bg-dark: #0f172a; --text-primary: #e2e8f0; --neon-cyan: #06b6d4; }
body, .gradio-container, .app { background: radial-gradient(circle at 50% 0%, #1e293b 0%, #0f172a 100%) !important; color: white !important; }
.panelNeon { background: rgba(30, 41, 59, 0.7); border: 1px solid rgba(255,255,255,0.08); border-radius: 16px; padding: 20px; margin-bottom: 20px; }
.panelNeon textarea, .panelNeon input, .panelNeon select { background: #f1f5f9 !important; color: #0f4c81 !important; font-weight: 500; }
button.primary { background: linear-gradient(135deg, #06b6d4 0%, #3b82f6 100%) !important; color: white; font-weight: bold; border: none; }
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def md5_key(*parts):
    """Return the hex MD5 digest of *parts* joined with ``|``.

    Each part is converted with ``str()`` first, so numbers and other
    non-string values are accepted. The digest is used as a cache file
    name, not for any security purpose.
    """
    joined = "|".join(str(part) for part in parts)
    return hashlib.md5(joined.encode("utf-8")).hexdigest()
|
|
|
|
|
def find_latest_checkpoint(model_dir, prefix="G"):
    """Return the ``<prefix>*.pth`` file in *model_dir* with the highest step.

    The step is the integer between the prefix (optionally followed by an
    underscore) and the ``.pth`` suffix, so both ``G_12000.pth`` and
    ``G12000.pth`` are understood. Files without a numeric step rank below
    every numbered checkpoint.

    Args:
        model_dir: Directory to search.
        prefix: File-name prefix selecting the checkpoint family.

    Returns:
        Path of the newest checkpoint as returned by ``glob``, or ``None``
        when no file matches.
    """
    # Escape both parts so glob metacharacters in the directory name or the
    # prefix are taken literally.
    pattern = os.path.join(
        glob.escape(os.fspath(model_dir)), f"{glob.escape(prefix)}*.pth"
    )
    checkpoints = glob.glob(pattern)
    if not checkpoints:
        return None

    step_re = re.compile(rf"^{re.escape(prefix)}_?(\d+)\.pth$")

    def step_of(path):
        # Match against the file name only, never the directory part.
        match = step_re.match(os.path.basename(path))
        return int(match.group(1)) if match else -1

    return max(checkpoints, key=step_of)
|
|
|
|
|
def download_model():
    """Return the local model directory, downloading the model if needed.

    The directory is ``<cache>/valtec_tts/models/vits-vietnamese`` where
    ``<cache>`` is ``%LOCALAPPDATA%`` on Windows and ``$XDG_CACHE_HOME``
    (default ``~/.cache``) elsewhere. If it already holds ``config.json``
    and at least one ``G_*.pth`` file, it is returned without any download.

    Returns:
        The model directory as a string. It is returned even when the
        download fails; the error is printed and the caller is expected to
        check for the files.

    Raises:
        ImportError: If a download is required and ``huggingface_hub`` is
            not installed.
    """
    hf_repo = "valtecAI-team/valtec-tts-pretrained"

    # ``or`` rather than a ``get`` default: a variable set to the empty
    # string would otherwise resolve to the current working directory.
    if os.name == "nt":
        cache_base = Path(
            os.environ.get("LOCALAPPDATA") or Path.home() / "AppData" / "Local"
        )
    else:
        cache_base = Path(
            os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache"
        )
    model_dir = cache_base / "valtec_tts" / "models" / "vits-vietnamese"

    if (model_dir / "config.json").exists() and any(model_dir.glob("G_*.pth")):
        return str(model_dir)

    # Imported only when a download is actually needed, so an already
    # cached model works without huggingface_hub being importable.
    from huggingface_hub import snapshot_download

    print(f"⬇️ Downloading Model {hf_repo}...")
    try:
        snapshot_download(repo_id=hf_repo, local_dir=str(model_dir))
    except Exception as e:
        # Best effort: report and let the caller detect the missing files.
        print(f"Lỗi tải model: {e}")
    return str(model_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TTSManager:
    """Owns the TTS engine and an on-disk cache of synthesised WAV files.

    Attributes set by ``__init__``:
        device: ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
        model_dir: Directory returned by ``download_model()``.
        ckpt_path: Newest ``G`` checkpoint inside ``model_dir``.
        cfg_path: ``config.json`` inside ``model_dir``.
        tts: The ``VietnameseTTS`` instance doing the synthesis.
        temp_dir: ``<system tmp>/neon_tts_cache`` holding cached WAV files.
    """

    def __init__(self):
        """Locate the model files and build the synthesiser.

        Raises:
            RuntimeError: If ``SynthesizerTrn`` is ``None`` because the core
                imports failed.
            FileNotFoundError: If no checkpoint or no ``config.json`` exists.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🔧 Initializing TTS on {self.device}...")

        if SynthesizerTrn is None:
            raise RuntimeError("Lỗi nghiêm trọng: Không thể tải class SynthesizerTrn do lỗi import.")

        self.model_dir = download_model()
        self.ckpt_path = find_latest_checkpoint(self.model_dir, "G")
        self.cfg_path = os.path.join(self.model_dir, "config.json")

        if not self.ckpt_path or not os.path.exists(self.cfg_path):
            raise FileNotFoundError("Không tìm thấy Model Checkpoint hoặc Config.")

        self.tts = VietnameseTTS(self.ckpt_path, self.cfg_path, self.device)
        self.temp_dir = Path(tempfile.gettempdir()) / "neon_tts_cache"
        self.temp_dir.mkdir(parents=True, exist_ok=True)

    def synthesize(self, text, speaker, speed):
        """Synthesise *text* to a WAV file, reusing a cached file if present.

        Args:
            text: Text to read aloud.
            speaker: Speaker name forwarded to the engine.
            speed: Speed value forwarded to the engine.

        Returns:
            ``(path, message)``. ``path`` is the WAV file path, or ``None``
            when the text is empty/blank or synthesis failed. ``message`` is
            a status string for the UI. Exceptions are never raised.
        """
        try:
            if not text or not text.strip():
                return None, "⚠️ Chưa nhập nội dung"

            # Key on the complete text: a truncated key would hand back the
            # audio of a different text that shares the same beginning.
            key = md5_key(str(speaker), str(speed), text)
            out_path = self.temp_dir / f"{key}.wav"
            if out_path.exists():
                return str(out_path), "✅ Lấy từ Cache"

            audio, sr = self.tts.synthesize(text, speaker, speed)

            # Write under a scratch name and rename afterwards, so an
            # interrupted write never becomes a cache hit. The scratch name
            # keeps the ``.wav`` extension for the writer.
            partial_path = out_path.with_name(f"{key}.partial.wav")
            sf.write(str(partial_path), audio, sr)
            os.replace(partial_path, out_path)
            return str(out_path), "✅ Tạo thành công"
        except Exception as e:
            # UI boundary: report the failure as a status message.
            return None, f"❌ Lỗi: {str(e)}"
|
|
|
|
|
class VietnameseTTS:
    """Inference wrapper around a ``SynthesizerTrn`` checkpoint.

    Attributes:
        device: Torch device string the model and inputs are placed on.
        config: Parsed JSON configuration.
        spk2id: Speaker-name to speaker-id mapping, ``config["data"]["spk2id"]``.
        speakers: Speaker names in the order they appear in ``spk2id``.
        model: The loaded network, in eval mode.
    """

    def __init__(self, ckpt, cfg, device="cpu"):
        """Build the network from *cfg* and load the weights stored in *ckpt*.

        Args:
            ckpt: Path to a checkpoint whose ``"model"`` entry is a state dict.
            cfg: Path to the JSON configuration file.
            device: Torch device string.
        """
        self.device = device
        with open(cfg, "r", encoding="utf-8") as f:
            self.config = json.load(f)
        data_cfg = self.config["data"]
        self.spk2id = data_cfg["spk2id"]
        self.speakers = list(self.spk2id.keys())

        self.model = SynthesizerTrn(
            len(symbols),
            data_cfg["filter_length"] // 2 + 1,
            self.config["train"]["segment_size"] // data_cfg["hop_length"],
            n_speakers=data_cfg["n_speakers"],
            **self.config["model"]
        ).to(self.device)

        # SECURITY NOTE: torch.load unpickles the file; only load
        # checkpoints from a trusted source.
        state = torch.load(ckpt, map_location=self.device)["model"]

        # Drop a leading "module." from each key, and only a leading one, so
        # names that contain "module." further in are left intact.
        wrapper_prefix = "module."
        cleaned = {
            (k[len(wrapper_prefix):] if k.startswith(wrapper_prefix) else k): v
            for k, v in state.items()
        }
        self.model.load_state_dict(cleaned, strict=False)
        self.model.eval()

    def synthesize(self, text, speaker, speed):
        """Synthesise *text* and return the waveform with its sampling rate.

        Args:
            text: Text to read aloud; must contain non-whitespace characters.
            speaker: Speaker name. A name missing from ``spk2id`` falls back
                to speaker id 0.
            speed: Positive number passed to the model as ``length_scale``.

        Returns:
            ``(audio, sampling_rate)`` where ``audio`` is a NumPy array and
            ``sampling_rate`` is ``config["data"]["sampling_rate"]``.

        Raises:
            ValueError: If *text* is empty or blank, *speed* is not
                positive, or the text yields no phonemes.
        """
        # Validate first so bad input fails with a clear message instead of
        # an error from deep inside the model.
        if not text or not text.strip():
            raise ValueError("Văn bản trống.")
        if speed is None or speed <= 0:
            raise ValueError("Tốc độ phải lớn hơn 0.")

        from src.text import cleaned_text_to_sequence
        from src.nn import commons

        norm_text = process_vietnamese_text(text)
        phones, tones, _ = text_to_phonemes(norm_text, use_viphoneme=VIPHONEME_AVAILABLE)
        phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "VI")
        if len(phone_ids) == 0:
            raise ValueError("Không có âm vị nào để tổng hợp.")

        phone_ids = commons.intersperse(phone_ids, 0)
        tone_ids = commons.intersperse(tone_ids, 0)
        lang_ids = commons.intersperse(lang_ids, 0)
        n_tokens = len(phone_ids)

        x = torch.LongTensor(phone_ids).unsqueeze(0).to(self.device)
        x_len = torch.LongTensor([n_tokens]).to(self.device)
        tone = torch.LongTensor(tone_ids).unsqueeze(0).to(self.device)
        lang = torch.LongTensor(lang_ids).unsqueeze(0).to(self.device)
        sid = torch.LongTensor([self.spk2id.get(speaker, 0)]).to(self.device)

        with torch.no_grad():
            # Zero-filled inputs of shape (1, 1024, n) and (1, 768, n),
            # allocated directly on the target device.
            bert = torch.zeros(1, 1024, n_tokens, device=self.device)
            ja_bert = torch.zeros(1, 768, n_tokens, device=self.device)
            # NOTE(review): ``speed`` is forwarded unchanged as
            # ``length_scale``. If infer() stretches durations by that
            # factor, larger values give slower speech and ``1.0 / speed``
            # may be intended. Confirm against SynthesizerTrn.infer.
            outputs = self.model.infer(
                x, x_len, sid, tone, lang, bert, ja_bert,
                noise_scale=0.667, noise_scale_w=0.8, length_scale=speed,
            )
            audio = outputs[0][0, 0].detach().cpu().numpy()

        return audio, self.config["data"]["sampling_rate"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_ui(manager: TTSManager):
    """Build the Gradio interface wired to *manager*.

    Args:
        manager: Object exposing ``synthesize(text, speaker, speed)`` and
            ``tts.speakers``.

    Returns:
        The ``gr.Blocks`` application; the caller launches it.
    """
    import html  # function-scope import: only this UI code needs it

    def run(text, spk, spd):
        """Synthesise once and return ``(audio_path, status_html)``."""
        start = time.time()
        path, msg = manager.synthesize(text, spk, spd)
        dur = time.time() - start
        # The message may embed raw exception text, so escape it before it
        # is placed into markup.
        safe_msg = html.escape(str(msg))
        return path, f"<div style='padding:10px; color:#38bdf8'>⏱️ {dur:.2f}s | {safe_msg}</div>"

    speakers = manager.tts.speakers
    # An empty speaker list must not crash UI construction.
    default_speaker = speakers[0] if speakers else None

    with gr.Blocks(css=NEON_CSS, title="Fixed TTS") as app:
        gr.Markdown("## 🎛️ CVNSS4.0 TTS (Python 3.12 Patched)")

        with gr.Row():
            with gr.Column():
                with gr.Group(elem_classes=["panelNeon"]):
                    txt = gr.Textbox(label="Văn bản", value="Xin chào, hệ thống đã sửa lỗi thư viện imp.", lines=4)
                    spk = gr.Dropdown(choices=speakers, value=default_speaker, label="Người đọc")
                    spd = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Tốc độ")
                    btn = gr.Button("Đọc ngay", variant="primary")
            with gr.Column():
                out_audio = gr.Audio(label="Audio", type="filepath")
                out_msg = gr.HTML()

        btn.click(run, [txt, spk, spd], [out_audio, out_msg])
    return app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import traceback  # needed only on the failure path below

    try:
        manager = TTSManager()
        app = create_ui(manager)
        app.launch()
    except Exception as e:
        print(f"🔥 Lỗi khởi động cuối cùng: {e}")
        # Keep the stack trace for diagnosis and report the failure through
        # the exit status instead of ending as if startup had succeeded.
        traceback.print_exc()
        sys.exit(1)