pnnbao-ump commited on
Commit
d949d26
·
verified ·
1 Parent(s): 233afa6

Upload 16 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim-bullseye

ENV PIP_NO_CACHE_DIR=1
ENV DEBIAN_FRONTEND=noninteractive

# System packages must be installed while still root: the original file
# switched to the non-root user first, which makes `apt-get` fail with a
# permission error and breaks the build.
COPY packages.txt ./
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        $(xargs < packages.txt) \
        git \
        libgl1 \
        libsm6 \
        libxext6 \
    && apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Non-root user (kept from the default HF Spaces template); drop privileges
# only after the root-only system setup above is done.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies first so this layer is cached while the app
# source changes.
COPY --chown=user requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the remaining sources (app.py, config.yaml, utils/, etc.).
COPY --chown=user . /app

# Launch the Gradio app; Gradio serves on port 7860 by default.
CMD ["python", "app.py"]
config.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Text chunking limits consumed by the Gradio app.
text_settings:
  max_chars_per_chunk: 256          # max characters per synthesis chunk
  max_total_chars_streaming: 3000   # hard cap on input length in streaming mode

# Selectable TTS backbones; keys double as the UI dropdown labels.
backbone_configs:
  "VieNeu-TTS (GPU)":
    repo: pnnbao-ump/VieNeu-TTS
    supports_streaming: false
    description: Chất lượng cao nhất, yêu cầu GPU
  "VieNeu-TTS-q8-gguf":
    repo: pnnbao-ump/VieNeu-TTS-q8-gguf
    supports_streaming: true
    description: Cân bằng giữa chất lượng và tốc độ
  "VieNeu-TTS-q4-gguf":
    repo: pnnbao-ump/VieNeu-TTS-q4-gguf
    supports_streaming: true
    description: Nhẹ nhất, phù hợp CPU

# Audio codecs; use_preencoded=true means the codec consumes pre-encoded
# .pt code files instead of encoding the reference wav at runtime.
codec_configs:
  "NeuCodec (Standard)":
    repo: neuphonic/neucodec
    description: Codec chuẩn, tốc độ trung bình
    use_preencoded: false
  "NeuCodec ONNX (Fast CPU)":
    repo: neuphonic/neucodec-onnx-decoder
    description: Tối ưu cho CPU, cần pre-encoded codes
    use_preencoded: true

# Preset voices: each entry bundles a reference wav, its transcript, and
# pre-encoded codec codes for the ONNX decoder path.
voice_samples:
  "Tuyên (nam miền Bắc)":
    audio: ./sample/Tuyên (nam miền Bắc).wav
    text: ./sample/Tuyên (nam miền Bắc).txt
    codes: ./sample/Tuyên (nam miền Bắc).pt
  "Vĩnh (nam miền Nam)":
    audio: ./sample/Vĩnh (nam miền Nam).wav
    text: ./sample/Vĩnh (nam miền Nam).txt
    codes: ./sample/Vĩnh (nam miền Nam).pt
  "Bình (nam miền Bắc)":
    audio: ./sample/Bình (nam miền Bắc).wav
    text: ./sample/Bình (nam miền Bắc).txt
    codes: ./sample/Bình (nam miền Bắc).pt
  "Nguyên (nam miền Nam)":
    audio: ./sample/Nguyên (nam miền Nam).wav
    text: ./sample/Nguyên (nam miền Nam).txt
    codes: ./sample/Nguyên (nam miền Nam).pt
  "Sơn (nam miền Nam)":
    audio: ./sample/Sơn (nam miền Nam).wav
    text: ./sample/Sơn (nam miền Nam).txt
    codes: ./sample/Sơn (nam miền Nam).pt
  "Đoan (nữ miền Nam)":
    audio: ./sample/Đoan (nữ miền Nam).wav
    text: ./sample/Đoan (nữ miền Nam).txt
    codes: ./sample/Đoan (nữ miền Nam).pt
  "Ngọc (nữ miền Bắc)":
    audio: ./sample/Ngọc (nữ miền Bắc).wav
    text: ./sample/Ngọc (nữ miền Bắc).txt
    codes: ./sample/Ngọc (nữ miền Bắc).pt
  "Ly (nữ miền Bắc)":
    audio: ./sample/Ly (nữ miền Bắc).wav
    text: ./sample/Ly (nữ miền Bắc).txt
    codes: ./sample/Ly (nữ miền Bắc).pt
  "Dung (nữ miền Nam)":
    audio: ./sample/Dung (nữ miền Nam).wav
    text: ./sample/Dung (nữ miền Nam).txt
    codes: ./sample/Dung (nữ miền Nam).pt
gradio_app.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import soundfile as sf
import tempfile
import torch
from vieneu_tts import VieNeuTTS
import os
import time
import numpy as np
import re
from typing import Generator
import queue
import threading
import yaml
from utils.core_utils import split_text_into_chunks

print("⏳ Đang khởi động VieNeu-TTS...")

# --- CONSTANTS & CONFIG ---
# All model/codec/voice settings live in config.yaml next to this file.
CONFIG_PATH = os.path.join(os.path.dirname(__file__), "config.yaml")
try:
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        _config = yaml.safe_load(f) or {}
except Exception as e:
    raise RuntimeError(f"Không thể đọc config.yaml: {e}")

BACKBONE_CONFIGS = _config.get("backbone_configs", {})
CODEC_CONFIGS = _config.get("codec_configs", {})
VOICE_SAMPLES = _config.get("voice_samples", {})

_text_settings = _config.get("text_settings", {})
MAX_CHARS_PER_CHUNK = _text_settings.get("max_chars_per_chunk", 256)
MAX_TOTAL_CHARS_STREAMING = _text_settings.get("max_total_chars_streaming", 3000)

# Fail fast at import time on an incomplete config instead of erroring later
# in the middle of a synthesis request.
if not BACKBONE_CONFIGS or not CODEC_CONFIGS:
    raise ValueError("config.yaml thiếu backbone_configs hoặc codec_configs")
if not VOICE_SAMPLES:
    raise ValueError("config.yaml thiếu voice_samples")

# --- 1. MODEL CONFIGURATION ---
# Module-level model state shared by the Gradio callbacks below.
tts = None                # VieNeuTTS instance once load_model() succeeds
current_backbone = None   # key into BACKBONE_CONFIGS of the loaded backbone
current_codec = None      # key into CODEC_CONFIGS of the loaded codec
model_loaded = False      # guards synthesize_speech() until a model is ready
45
+
46
def load_model(backbone_choice, codec_choice, device_choice):
    """Load the selected backbone/codec pair and stream UI status updates.

    Generator used as a Gradio event handler. Each yield updates three
    components: (status markdown, "Bắt đầu" button state, "Tải Model"
    button state).
    """
    global tts, current_backbone, current_codec, model_loaded

    # BUGFIX: config keys spell "gguf" in lowercase ("VieNeu-TTS-q8-gguf"),
    # so the original case-sensitive check `"GGUF" in backbone_choice` never
    # matched and GGUF backbones were handed the PyTorch-style "cuda" device
    # name. Compare case-insensitively instead.
    is_gguf = "gguf" in backbone_choice.lower()

    # Disable both buttons immediately so the user cannot double-trigger.
    yield (
        "⏳ Đang tải model, vui lòng đợi...",
        gr.update(interactive=False),  # disable "Bắt đầu"
        gr.update(interactive=False)   # disable "Tải Model"
    )

    try:
        backbone_config = BACKBONE_CONFIGS[backbone_choice]
        codec_config = CODEC_CONFIGS[codec_choice]

        # Resolve device strings: llama.cpp (GGUF) expects "gpu"/"cpu",
        # PyTorch expects "cuda"/"cpu"; the ONNX codec only runs on CPU.
        if device_choice == "Auto":
            if is_gguf:
                backbone_device = "gpu" if torch.cuda.is_available() else "cpu"
            else:
                backbone_device = "cuda" if torch.cuda.is_available() else "cpu"

            if "ONNX" in codec_choice:
                codec_device = "cpu"
            else:
                codec_device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            backbone_device = device_choice.lower()
            codec_device = device_choice.lower()

            if "ONNX" in codec_choice:
                codec_device = "cpu"

            if is_gguf and backbone_device == "cuda":
                backbone_device = "gpu"

        print(f"📦 Đang tải model...")
        print(f"   Backbone: {backbone_config['repo']} on {backbone_device}")
        print(f"   Codec: {codec_config['repo']} on {codec_device}")

        tts = VieNeuTTS(
            backbone_repo=backbone_config["repo"],
            backbone_device=backbone_device,
            codec_repo=codec_config["repo"],
            codec_device=codec_device
        )

        current_backbone = backbone_choice
        current_codec = codec_choice
        model_loaded = True  # synthesis is now allowed

        note_for_llama_cpp = "\n⚠️ Lưu ý: Nếu bạn chọn gpu (cuda) cho bản gguf cần phải cài đặt đúng theo hướng dẫn ở link này để tận dụng được GPU: https://pypi.org/project/llama-cpp-python/"
        preencoded_note = "\n⚠️ Codec ONNX cần sử dụng pre-encoded codes (.pt files)" if codec_config['use_preencoded'] else ""

        success_msg = (
            f"✅ Model đã tải thành công!\n\n"
            f"🦜 Model Device: {backbone_device.upper()}{note_for_llama_cpp}\n\n"
            f"🎵 Codec Device: {codec_device.upper()}{preencoded_note}"
        )

        yield (
            success_msg,
            gr.update(interactive=True),  # enable "Bắt đầu"
            gr.update(interactive=True)   # enable "Tải Model"
        )

    except Exception as e:
        import traceback
        traceback.print_exc()
        model_loaded = False

        yield (
            f"❌ Lỗi khi tải model: {str(e)}",
            gr.update(interactive=False),  # keep "Bắt đầu" disabled
            gr.update(interactive=True)    # re-enable "Tải Model" for a retry
        )
122
+
123
# --- 2. DATA & HELPERS ---
# Voices available on the quantized GGUF backbones; the full backbone
# supports every entry in VOICE_SAMPLES.
GGUF_ALLOWED_VOICES = [
    "Vĩnh (nam miền Nam)",
    "Bình (nam miền Bắc)",
    "Ngọc (nữ miền Bắc)",
    "Dung (nữ miền Nam)",
]

def get_voice_options(backbone_choice: str):
    """Filter voice options: GGUF only shows the 4 allowed voices."""
    # BUGFIX: compare case-insensitively — config keys use lowercase "gguf"
    # while another call site passes "GGUF Q4", which the original lowercase
    # substring test silently failed to match.
    if "gguf" in backbone_choice.lower():
        return [v for v in GGUF_ALLOWED_VOICES if v in VOICE_SAMPLES]
    return list(VOICE_SAMPLES.keys())

def update_voice_dropdown(backbone_choice: str, current_voice: str):
    """Refresh the voice dropdown, keeping the current voice when still valid."""
    options = get_voice_options(backbone_choice)
    new_value = current_voice if current_voice in options else (options[0] if options else None)
    return gr.update(choices=options, value=new_value)
141
+
142
# --- 3. CORE LOGIC FUNCTIONS ---
def load_reference_info(voice_choice):
    """Return (audio_path, reference_text) for a preset voice.

    Falls back to (None, "") for unknown voices and reports file problems
    in the second element of the tuple.
    """
    if voice_choice not in VOICE_SAMPLES:
        return None, ""

    sample = VOICE_SAMPLES[voice_choice]
    audio_path = sample["audio"]
    text_path = sample["text"]
    try:
        if not os.path.exists(text_path):
            return audio_path, "⚠️ Không tìm thấy file text mẫu."
        with open(text_path, "r", encoding="utf-8") as f:
            return audio_path, f.read()
    except Exception as e:
        return None, f"❌ Lỗi: {str(e)}"
157
+
158
def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, generation_mode):
    """Synthesis with model check.

    Generator Gradio handler yielding (audio, status_text) pairs. In
    Standard mode the audio yields are file paths; in Streaming mode they
    are (sample_rate, ndarray) tuples followed by a final file path.
    """
    global tts, current_backbone, current_codec, model_loaded

    # Refuse to run until load_model() has completed successfully.
    if not model_loaded or tts is None:
        yield None, "⚠️ Vui lòng tải model trước!"
        return

    if not text or text.strip() == "":
        yield None, "⚠️ Vui lòng nhập văn bản!"
        return

    raw_text = text.strip()

    codec_config = CODEC_CONFIGS[current_codec]
    use_preencoded = codec_config['use_preencoded']

    # --- Set up the voice reference (custom upload vs. preset sample) ---
    if mode_tab == "custom_mode":
        if custom_audio is None or not custom_text:
            yield None, "⚠️ Thiếu Audio hoặc Text mẫu custom."
            return
        ref_audio_path = custom_audio
        ref_text_raw = custom_text
        ref_codes_path = None  # custom audio never has pre-encoded codes
    else:
        if voice_choice not in VOICE_SAMPLES:
            yield None, "⚠️ Vui lòng chọn giọng mẫu."
            return
        ref_audio_path = VOICE_SAMPLES[voice_choice]["audio"]
        ref_text_path = VOICE_SAMPLES[voice_choice]["text"]
        ref_codes_path = VOICE_SAMPLES[voice_choice]["codes"]

        if not os.path.exists(ref_audio_path):
            yield None, "❌ Không tìm thấy file audio mẫu."
            return

        with open(ref_text_path, "r", encoding="utf-8") as f:
            ref_text_raw = f.read()

    yield None, "📄 Đang xử lý Reference..."

    # --- Encode the reference: prefer cached .pt codes when the codec
    # requires them, otherwise encode the wav on the fly ---
    try:
        if use_preencoded and ref_codes_path and os.path.exists(ref_codes_path):
            ref_codes = torch.load(ref_codes_path, map_location="cpu")
        else:
            ref_codes = tts.encode_reference(ref_audio_path)

        if isinstance(ref_codes, torch.Tensor):
            ref_codes = ref_codes.cpu().numpy()
    except Exception as e:
        yield None, f"❌ Lỗi xử lý reference: {e}"
        return

    text_chunks = split_text_into_chunks(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
    total_chunks = len(text_chunks)

    # === STANDARD MODE ===
    if generation_mode == "Standard (Một lần)":
        yield None, f"🚀 Bắt đầu tổng hợp chế độ Standard ({total_chunks} đoạn)..."

        all_audio_segments = []
        sr = 24000  # NOTE(review): output sample rate assumed 24 kHz — confirm against the codec
        silence_pad = np.zeros(int(sr * 0.15), dtype=np.float32)  # 150 ms gap between chunks

        start_time = time.time()

        try:
            for i, chunk in enumerate(text_chunks):
                yield None, f"⏳ Đang xử lý đoạn {i+1}/{total_chunks}..."

                chunk_wav = tts.infer(chunk, ref_codes, ref_text_raw)

                if chunk_wav is not None and len(chunk_wav) > 0:
                    all_audio_segments.append(chunk_wav)
                    # Insert silence between (not after) chunks.
                    if i < total_chunks - 1:
                        all_audio_segments.append(silence_pad)

            if not all_audio_segments:
                yield None, "❌ Không sinh được audio nào."
                return

            yield None, "💾 Đang ghép file và lưu..."

            final_wav = np.concatenate(all_audio_segments)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                sf.write(tmp.name, final_wav, sr)
                output_path = tmp.name

            process_time = time.time() - start_time
            yield output_path, f"✅ Hoàn tất! (Tổng thời gian: {process_time:.2f}s)"

        except Exception as e:
            import traceback
            traceback.print_exc()
            yield None, f"❌ Lỗi Standard Mode: {str(e)}"
            return

    # === STREAMING MODE ===
    else:
        sr = 24000
        crossfade_samples = int(sr * 0.03)  # 30 ms crossfade between streamed parts
        audio_queue = queue.Queue(maxsize=100)
        PRE_BUFFER_SIZE = 3  # parts to collect before playback starts

        end_event = threading.Event()
        error_event = threading.Event()
        error_msg = ""

        def producer_thread():
            # Background inference: pushes (sr, samples) onto audio_queue and
            # finishes with a None sentinel. The last crossfade tail is held
            # back between parts and flushed at the very end.
            nonlocal error_msg
            try:
                previous_tail = None
                chunk_count = 0

                for i, chunk_text in enumerate(text_chunks):
                    stream_gen = tts.infer_stream(chunk_text, ref_codes, ref_text_raw)

                    for part_idx, audio_part in enumerate(stream_gen):
                        if audio_part is None or len(audio_part) == 0:
                            continue

                        # Crossfade the held-back tail of the previous part
                        # into the head of this one to hide seams.
                        if previous_tail is not None and len(previous_tail) > 0:
                            overlap = min(len(previous_tail), len(audio_part), crossfade_samples)
                            if overlap > 0:
                                fade_out = np.linspace(1.0, 0.0, overlap, dtype=np.float32)
                                fade_in = np.linspace(0.0, 1.0, overlap, dtype=np.float32)

                                blended = (audio_part[:overlap] * fade_in +
                                           previous_tail[-overlap:] * fade_out)

                                processed = np.concatenate([
                                    previous_tail[:-overlap] if len(previous_tail) > overlap else np.array([]),
                                    blended,
                                    audio_part[overlap:]
                                ])
                            else:
                                processed = np.concatenate([previous_tail, audio_part])

                            tail_size = min(crossfade_samples, len(processed))
                            previous_tail = processed[-tail_size:].copy()
                            output_chunk = processed[:-tail_size] if len(processed) > tail_size else processed
                        else:
                            tail_size = min(crossfade_samples, len(audio_part))
                            previous_tail = audio_part[-tail_size:].copy()
                            output_chunk = audio_part[:-tail_size] if len(audio_part) > tail_size else audio_part

                        if len(output_chunk) > 0:
                            audio_queue.put((sr, output_chunk))
                            chunk_count += 1

                # Flush the final held-back tail once all chunks are done.
                if previous_tail is not None and len(previous_tail) > 0:
                    audio_queue.put((sr, previous_tail))

            except Exception as e:
                import traceback
                traceback.print_exc()
                error_msg = str(e)
                error_event.set()
            finally:
                end_event.set()
                audio_queue.put(None)  # sentinel: no more audio

        threading.Thread(target=producer_thread, daemon=True).start()

        # A tiny silent chunk kicks the client audio element into gear.
        yield (sr, np.zeros(int(sr * 0.05))), "🔄 Đang buffering..."

        # Pre-buffer a few parts to avoid stuttering right at the start.
        pre_buffer = []
        while len(pre_buffer) < PRE_BUFFER_SIZE:
            try:
                item = audio_queue.get(timeout=5.0)
                if item is None:
                    break
                pre_buffer.append(item)
            except queue.Empty:
                if error_event.is_set():
                    yield None, f"❌ Lỗi: {error_msg}"
                    return
                break

        full_audio_buffer = []
        for sr, audio_data in pre_buffer:
            full_audio_buffer.append(audio_data)
            yield (sr, audio_data), "🔊 Đang phát..."

        # Drain the queue until the producer signals the end (None sentinel
        # or end_event with an empty queue).
        while True:
            try:
                item = audio_queue.get(timeout=0.05)
                if item is None:
                    break
                sr, audio_data = item
                full_audio_buffer.append(audio_data)
                yield (sr, audio_data), "🔊 Đang phát..."
            except queue.Empty:
                if error_event.is_set():
                    yield None, f"❌ Lỗi: {error_msg}"
                    break
                if end_event.is_set() and audio_queue.empty():
                    break
                continue

        # Final yield: write the whole stream to a .wav for replay/download.
        if full_audio_buffer:
            final_wav = np.concatenate(full_audio_buffer)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                sf.write(tmp.name, final_wav, sr)
            yield tmp.name, "✅ Hoàn tất Streaming!"
366
+
367
# --- 4. UI SETUP ---
# Gradio theme: Ocean base with an indigo/cyan gradient on primary buttons.
theme = gr.themes.Ocean(
    primary_hue="indigo",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'system-ui'],
).set(
    button_primary_background_fill="linear-gradient(90deg, #6366f1 0%, #0ea5e9 100%)",
    button_primary_background_fill_hover="linear-gradient(90deg, #4f46e5 0%, #0284c7 100%)",
)

# Custom CSS for the header card, status box, and model-card links.
css = """
.container { max-width: 1400px; margin: auto; }
.header-box {
    text-align: center;
    margin-bottom: 25px;
    padding: 25px;
    background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
    border-radius: 12px;
    color: white;
}
.header-title {
    font-size: 2.5rem;
    font-weight: 800;
    /* Bỏ hiệu ứng tô màu gradient ở đây và chuyển nó sang thẻ con */
}
.gradient-text {
    background: -webkit-linear-gradient(45deg, #60A5FA, #22D3EE);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
.header-icon {
    color: white; /* Ép màu trắng */
}
.status-box {
    font-weight: bold;
    text-align: center;
    border: none;
    background: transparent;
}
.model-card {
    background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
    border-radius: 12px;
    padding: 20px;
    margin-bottom: 25px;
    border: 1px solid #cbd5e1;
}
.model-card-title {
    font-size: 1.1rem;
    font-weight: 700;
    color: #1e293b;
    margin-bottom: 12px;
    display: flex;
    align-items: center;
    gap: 8px;
}
.model-card-content {
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
    align-items: center;
    gap: 15px;
    font-size: 0.9rem;
    text-align: center;
}
.model-card-item {
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 6px;
    color: #475569;
}
.model-card-link {
    color: #3b82f6;
    text-decoration: none;
    font-weight: 500;
    transition: color 0.2s;
}
.model-card-link:hover {
    color: #2563eb;
    text-decoration: underline;
}
"""
450
+
451
# Example (text, preset voice) pairs.
# NOTE(review): defined but not wired to any gr.Examples component in this
# file — confirm whether this is dead code or consumed elsewhere.
EXAMPLES_LIST = [
    ["Về miền Tây không chỉ để ngắm nhìn sông nước hữu tình, mà còn để cảm nhận tấm chân tình của người dân nơi đây.", "Vĩnh (nam miền Nam)"],
    ["Hà Nội những ngày vào thu mang một vẻ đẹp trầm mặc và cổ kính đến lạ thường.", "Bình (nam miền Bắc)"],
]
455
+
456
# Build the Gradio UI: header, model configuration, input/output columns,
# and the event wiring between them.
with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS") as demo:
    with gr.Column(elem_classes="container"):
        gr.HTML("""
        <div class="header-box">
            <h1 class="header-title">
                <span class="header-icon">🦜</span>
                <span class="gradient-text">VieNeu-TTS Studio</span>
            </h1>
            <div class="model-card-content">
                <div class="model-card-item">
                    <strong>Models:</strong>
                    <a href="https://huggingface.co/pnnbao-ump/VieNeu-TTS" target="_blank" class="model-card-link">VieNeu-TTS</a>
                    <span>•</span>
                    <a href="https://huggingface.co/pnnbao-ump/VieNeu-TTS-q4-gguf" target="_blank" class="model-card-link">Q4-GGUF</a>
                    <span>•</span>
                    <a href="https://huggingface.co/pnnbao-ump/VieNeu-TTS-q8-gguf" target="_blank" class="model-card-link">Q8-GGUF</a>
                </div>
                <div class="model-card-item">
                    <strong>Repository:</strong>
                    <a href="https://github.com/pnnbao97/VieNeu-TTS" target="_blank" class="model-card-link">GitHub</a>
                </div>
                <div class="model-card-item">
                    <strong>Tác giả:</strong>
                    <span>Phạm Nguyễn Ngọc Bảo</span>
                </div>
            </div>
        </div>
        """)

        # --- CONFIGURATION ---
        with gr.Group():
            with gr.Row():
                backbone_select = gr.Dropdown(list(BACKBONE_CONFIGS.keys()), value="VieNeu-TTS (GPU)", label="🦜 Backbone")
                codec_select = gr.Dropdown(list(CODEC_CONFIGS.keys()), value="NeuCodec (Standard)", label="🎵 Codec")
                device_choice = gr.Radio(["Auto", "CPU", "CUDA"], value="Auto", label="🖥️ Device")

            btn_load = gr.Button("🔄 Tải Model", variant="primary")
            model_status = gr.Markdown("⏳ Chưa tải model.")

    with gr.Row(elem_classes="container"):
        # --- INPUT ---
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label=f"Văn bản (Streaming hỗ trợ tới {MAX_TOTAL_CHARS_STREAMING} ký tự, chia chunk {MAX_CHARS_PER_CHUNK} ký tự)",
                lines=4,
                value="Hà Nội, trái tim của Việt Nam, là một thành phố ngàn năm văn hiến với bề dày lịch sử và văn hóa độc đáo. Bước chân trên những con phố cổ kính quanh Hồ Hoàn Kiếm, du khách như được du hành ngược thời gian, chiêm ngưỡng kiến trúc Pháp cổ điển hòa quyện với nét kiến trúc truyền thống Việt Nam. Mỗi con phố trong khu phố cổ mang một tên gọi đặc trưng, phản ánh nghề thủ công truyền thống từng thịnh hành nơi đây như phố Hàng Bạc, Hàng Đào, Hàng Mã. Ẩm thực Hà Nội cũng là một điểm nhấn đặc biệt, từ tô phở nóng hổi buổi sáng, bún chả thơm lừng trưa hè, đến chè Thái ngọt ngào chiều thu. Những món ăn dân dã này đã trở thành biểu tượng của văn hóa ẩm thực Việt, được cả thế giới yêu mến. Người Hà Nội nổi tiếng với tính cách hiền hòa, lịch thiệp nhưng cũng rất cầu toàn trong từng chi tiết nhỏ, từ cách pha trà sen cho đến cách chọn hoa sen tây để thưởng trà.",
            )

            with gr.Tabs() as tabs:
                with gr.TabItem("👤 Preset", id="preset_mode"):
                    # NOTE(review): "GGUF Q4" is not a BACKBONE_CONFIGS key and the
                    # filter in get_voice_options is case-sensitive, so this
                    # currently returns every voice — confirm intended default.
                    initial_voices = get_voice_options("GGUF Q4")
                    default_voice = initial_voices[0] if initial_voices else None
                    voice_select = gr.Dropdown(initial_voices, value=default_voice, label="Giọng mẫu")

                with gr.TabItem("🎙️ Custom", id="custom_mode"):
                    custom_audio = gr.Audio(label="File mẫu (.wav)", type="filepath")
                    custom_text = gr.Textbox(label="Lời thoại mẫu")

            generation_mode = gr.Radio(
                ["Standard (Một lần)"],
                value="Standard (Một lần)",
                label="Chế độ sinh"
            )

            # Hidden state tracking which tab (preset/custom) is active.
            current_mode = gr.Textbox(visible=False, value="preset_mode")

            # "Start" button stays disabled until load_model() succeeds.
            btn_generate = gr.Button("🎵 Bắt đầu", variant="primary", size="lg", interactive=False)

        # --- OUTPUT ---
        with gr.Column(scale=2):
            audio_output = gr.Audio(
                label="Kết quả",
                type="filepath",
                autoplay=True,
                show_download_button=True
            )
            status_output = gr.Textbox(label="Trạng thái", elem_classes="status-box")

    # --- EVENT HANDLERS ---
    def update_info(backbone):
        # Show whether the selected backbone supports streaming.
        return f"Streaming: {'✅' if BACKBONE_CONFIGS[backbone]['supports_streaming'] else '❌'}"

    backbone_select.change(update_info, backbone_select, model_status)
    backbone_select.change(update_voice_dropdown, [backbone_select, voice_select], voice_select)

    # NOTE(review): indexing tabs.children relies on Gradio internals and may
    # break across Gradio versions — confirm against the installed version.
    tabs.children[0].select(lambda: "preset_mode", outputs=current_mode)
    tabs.children[1].select(lambda: "custom_mode", outputs=current_mode)

    # Loading updates the status text and toggles both buttons (see load_model).
    btn_load.click(
        fn=load_model,
        inputs=[backbone_select, codec_select, device_choice],
        outputs=[model_status, btn_generate, btn_load]
    )

    btn_generate.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, generation_mode],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    demo.queue().launch()
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (128 Bytes). View file
 
utils/__pycache__/core_utils.cpython-312.pyc ADDED
Binary file (1.9 kB). View file
 
utils/__pycache__/normalize_text.cpython-312.pyc ADDED
Binary file (23.9 kB). View file
 
utils/__pycache__/phonemize_text.cpython-312.pyc ADDED
Binary file (5.74 kB). View file
 
utils/core_utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
def split_text_into_chunks(text: str, max_chars: int = 256) -> List[str]:
    """Greedily pack sentences into chunks of at most ``max_chars`` characters.

    Sentence boundaries (., !, ?, …) are the preferred split points; a single
    sentence longer than ``max_chars`` is broken on whitespace instead.
    """
    pieces: List[str] = []
    pending = ""

    for sentence in re.split(r"(?<=[\.\!\?\…])\s+", text.strip()):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > max_chars:
            # Flush whatever was accumulated, then split the oversized
            # sentence word by word.
            if pending:
                pieces.append(pending.strip())
                pending = ""
            run = ""
            for token in sentence.split():
                grown = f"{run} {token}".strip() if run else token
                if len(grown) > max_chars and run:
                    pieces.append(run.strip())
                    run = token
                else:
                    run = grown
            if run:
                pieces.append(run.strip())
            continue

        merged = f"{pending} {sentence}".strip() if pending else sentence
        if len(merged) <= max_chars:
            pending = merged
        else:
            if pending:
                pieces.append(pending.strip())
            pending = sentence

    if pending:
        pieces.append(pending.strip())
    return [piece for piece in pieces if piece]
utils/normalize_text.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ class VietnameseTTSNormalizer:
4
+ """
5
+ A text normalizer for Vietnamese Text-to-Speech systems.
6
+ Converts numbers, dates, units, and special characters into readable Vietnamese text.
7
+ """
8
+
9
    def __init__(self):
        # Unit abbreviation -> Vietnamese reading. _normalize_units matches
        # these case-insensitively, longest abbreviation first, so order here
        # only matters among equal-length keys.
        self.units = {
            # Length
            'km': 'ki lô mét', 'dm': 'đê xi mét', 'cm': 'xen ti mét',
            'mm': 'mi li mét', 'nm': 'na nô mét', 'µm': 'mic rô mét',
            'μm': 'mic rô mét', 'm': 'mét',

            # Mass
            'kg': 'ki lô gam', 'g': 'gam', 'mg': 'mi li gam',

            # Area
            'km²': 'ki lô mét vuông', 'km2': 'ki lô mét vuông',
            'm²': 'mét vuông', 'm2': 'mét vuông',
            'cm²': 'xen ti mét vuông', 'cm2': 'xen ti mét vuông',
            'mm²': 'mi li mét vuông', 'mm2': 'mi li mét vuông',
            'ha': 'héc ta',

            # Volume
            'km³': 'ki lô mét khối', 'km3': 'ki lô mét khối',
            'm³': 'mét khối', 'm3': 'mét khối',
            'cm³': 'xen ti mét khối', 'cm3': 'xen ti mét khối',
            'mm³': 'mi li mét khối', 'mm3': 'mi li mét khối',
            'l': 'lít', 'dl': 'đê xi lít', 'ml': 'mi li lít', 'hl': 'héc tô lít',

            # Electricity
            'v': 'vôn', 'kv': 'ki lô vôn', 'mv': 'mi li vôn',
            'a': 'am pe', 'ma': 'mi li am pe', 'ka': 'ki lô am pe',
            'w': 'oát', 'kw': 'ki lô oát', 'mw': 'mê ga oát', 'gw': 'gi ga oát',
            'kwh': 'ki lô oát giờ', 'mwh': 'mê ga oát giờ', 'wh': 'oát giờ',
            'ω': 'ôm', 'ohm': 'ôm', 'kω': 'ki lô ôm', 'mω': 'mê ga ôm',

            # Frequency
            'hz': 'héc', 'khz': 'ki lô héc', 'mhz': 'mê ga héc', 'ghz': 'gi ga héc',

            # Pressure
            'pa': 'pát cal', 'kpa': 'ki lô pát cal', 'mpa': 'mê ga pát cal',
            'bar': 'ba', 'mbar': 'mi li ba', 'atm': 'át mốt phia', 'psi': 'pi ét xai',

            # Energy
            'j': 'giun', 'kj': 'ki lô giun',
            'cal': 'ca lo', 'kcal': 'ki lô ca lo',
        }

        # Vietnamese digit names 0-9, used to read decimal digits one by one.
        self.digits = ['không', 'một', 'hai', 'ba', 'bốn',
                       'năm', 'sáu', 'bảy', 'tám', 'chín']
+ 'năm', 'sáu', 'bảy', 'tám', 'chín']
46
+
47
+ def normalize(self, text):
48
+ """Main normalization pipeline."""
49
+ text = text.lower()
50
+ text = self._normalize_temperature(text)
51
+ text = self._normalize_currency(text)
52
+ text = self._normalize_percentage(text)
53
+ text = self._normalize_units(text)
54
+ text = self._normalize_time(text)
55
+ text = self._normalize_date(text)
56
+ text = self._normalize_phone(text)
57
+ text = self._normalize_numbers(text)
58
+ text = self._number_to_words(text)
59
+ text = self._normalize_special_chars(text)
60
+ text = self._normalize_whitespace(text)
61
+ return text
62
+
63
+ def _normalize_temperature(self, text):
64
+ """Convert temperature notation to words."""
65
+ text = re.sub(r'-(\d+(?:[.,]\d+)?)\s*°\s*c\b', r'âm \1 độ xê', text, flags=re.IGNORECASE)
66
+ text = re.sub(r'-(\d+(?:[.,]\d+)?)\s*°\s*f\b', r'âm \1 độ ép', text, flags=re.IGNORECASE)
67
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*°\s*c\b', r'\1 độ xê', text, flags=re.IGNORECASE)
68
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*°\s*f\b', r'\1 độ ép', text, flags=re.IGNORECASE)
69
+ text = re.sub(r'°', ' độ ', text)
70
+ return text
71
+
72
+ def _normalize_currency(self, text):
73
+ """Convert currency notation to words."""
74
+ def decimal_currency(match):
75
+ whole = match.group(1)
76
+ decimal = match.group(2)
77
+ unit = match.group(3)
78
+ decimal_words = ' '.join([self.digits[int(d)] for d in decimal])
79
+ unit_map = {'k': 'nghìn', 'm': 'triệu', 'b': 'tỷ'}
80
+ unit_word = unit_map.get(unit.lower(), unit)
81
+ return f"{whole} phẩy {decimal_words} {unit_word}"
82
+
83
+ text = re.sub(r'(\d+)[.,](\d+)\s*([kmb])\b', decimal_currency, text, flags=re.IGNORECASE)
84
+ text = re.sub(r'(\d+)\s*k\b', r'\1 nghìn', text, flags=re.IGNORECASE)
85
+ text = re.sub(r'(\d+)\s*m\b', r'\1 triệu', text, flags=re.IGNORECASE)
86
+ text = re.sub(r'(\d+)\s*b\b', r'\1 tỷ', text, flags=re.IGNORECASE)
87
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*đ\b', r'\1 đồng', text)
88
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*vnd\b', r'\1 đồng', text, flags=re.IGNORECASE)
89
+ text = re.sub(r'\$\s*(\d+(?:[.,]\d+)?)', r'\1 đô la', text)
90
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*\$', r'\1 đô la', text)
91
+ return text
92
+
93
+ def _normalize_percentage(self, text):
94
+ """Convert percentage to words."""
95
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*%', r'\1 phần trăm', text)
96
+ return text
97
+
98
+ def _normalize_units(self, text):
99
+ """Convert measurement units to words."""
100
+ def expand_compound_with_number(match):
101
+ number = match.group(1)
102
+ unit1 = match.group(2).lower()
103
+ unit2 = match.group(3).lower()
104
+ full_unit1 = self.units.get(unit1, unit1)
105
+ full_unit2 = self.units.get(unit2, unit2)
106
+ return f"{number} {full_unit1} trên {full_unit2}"
107
+
108
+ def expand_compound_without_number(match):
109
+ unit1 = match.group(1).lower()
110
+ unit2 = match.group(2).lower()
111
+ full_unit1 = self.units.get(unit1, unit1)
112
+ full_unit2 = self.units.get(unit2, unit2)
113
+ return f"{full_unit1} trên {full_unit2}"
114
+
115
+ text = re.sub(r'(\d+(?:[.,]\d+)?)\s*([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\b',
116
+ expand_compound_with_number, text)
117
+ text = re.sub(r'\b([a-zA-Zμµ²³°]+)/([a-zA-Zμµ²³°0-9]+)\b',
118
+ expand_compound_without_number, text)
119
+
120
+ sorted_units = sorted(self.units.items(), key=lambda x: len(x[0]), reverse=True)
121
+ for unit, full_name in sorted_units:
122
+ pattern = r'(\d+(?:[.,]\d+)?)\s*' + re.escape(unit) + r'\b'
123
+ text = re.sub(pattern, rf'\1 {full_name}', text, flags=re.IGNORECASE)
124
+
125
+ for unit, full_name in sorted_units:
126
+ if any(c in unit for c in '²³°'):
127
+ pattern = r'\b' + re.escape(unit) + r'\b'
128
+ text = re.sub(pattern, full_name, text, flags=re.IGNORECASE)
129
+
130
+ return text
131
+
132
+ def _normalize_time(self, text):
133
+ """Convert time notation to words with validation."""
134
+
135
+ def validate_and_convert_time(match):
136
+ """Validate time components before converting."""
137
+ groups = match.groups()
138
+
139
+ # HH:MM:SS format
140
+ if len(groups) == 3:
141
+ hour, minute, second = groups
142
+ hour_int, minute_int, second_int = int(hour), int(minute), int(second)
143
+
144
+ # Validate ranges
145
+ if not (0 <= hour_int <= 23):
146
+ return match.group(0) # Return original if invalid
147
+ if not (0 <= minute_int <= 59):
148
+ return match.group(0)
149
+ if not (0 <= second_int <= 59):
150
+ return match.group(0)
151
+
152
+ return f"{hour} giờ {minute} phút {second} giây"
153
+
154
+ # HH:MM or HHhMM format
155
+ elif len(groups) == 2:
156
+ hour, minute = groups
157
+ hour_int, minute_int = int(hour), int(minute)
158
+
159
+ # Validate ranges
160
+ if not (0 <= hour_int <= 23):
161
+ return match.group(0)
162
+ if not (0 <= minute_int <= 59):
163
+ return match.group(0)
164
+
165
+ return f"{hour} giờ {minute} phút"
166
+
167
+ # HHh format
168
+ else:
169
+ hour = groups[0]
170
+ hour_int = int(hour)
171
+
172
+ if not (0 <= hour_int <= 23):
173
+ return match.group(0)
174
+
175
+ return f"{hour} giờ"
176
+
177
+ # Apply patterns with validation
178
+ text = re.sub(r'(\d{1,2}):(\d{2}):(\d{2})', validate_and_convert_time, text)
179
+ text = re.sub(r'(\d{1,2}):(\d{2})', validate_and_convert_time, text)
180
+ text = re.sub(r'(\d{1,2})h(\d{2})', validate_and_convert_time, text)
181
+ text = re.sub(r'(\d{1,2})h\b', validate_and_convert_time, text)
182
+
183
+ return text
184
+
185
+ def _normalize_date(self, text):
186
+ """Convert date notation to words with validation."""
187
+
188
+ def is_valid_date(day, month, year):
189
+ """Check if date components are valid."""
190
+ day, month, year = int(day), int(month), int(year)
191
+
192
+ # Basic range checks
193
+ if not (1 <= day <= 31):
194
+ return False
195
+ if not (1 <= month <= 12):
196
+ return False
197
+
198
+ return True
199
+
200
+ def date_to_text(match):
201
+ day, month, year = match.groups()
202
+ if is_valid_date(day, month, year):
203
+ return f"ngày {day} tháng {month} năm {year}"
204
+ return match.group(0) # Return original if invalid
205
+
206
+ def date_iso_to_text(match):
207
+ year, month, day = match.groups()
208
+ if is_valid_date(day, month, year):
209
+ return f"ngày {day} tháng {month} năm {year}"
210
+ return match.group(0)
211
+
212
+ def date_short_year(match):
213
+ day, month, year = match.groups()
214
+ full_year = f"20{year}" if int(year) < 50 else f"19{year}"
215
+ if is_valid_date(day, month, full_year):
216
+ return f"ngày {day} tháng {month} năm {full_year}"
217
+ return match.group(0)
218
+
219
+ # Apply patterns with validation
220
+ text = re.sub(r'\bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b',
221
+ lambda m: date_to_text(m).replace('ngày ngày', 'ngày'), text)
222
+ text = re.sub(r'\bngày\s+(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b',
223
+ lambda m: date_short_year(m).replace('ngày ngày', 'ngày'), text)
224
+ text = re.sub(r'\b(\d{4})-(\d{1,2})-(\d{1,2})\b', date_iso_to_text, text)
225
+ text = re.sub(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{4})\b', date_to_text, text)
226
+ text = re.sub(r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2})\b', date_short_year, text)
227
+
228
+ return text
229
+
230
+ def _normalize_phone(self, text):
231
+ """Convert phone numbers to digit-by-digit reading."""
232
+ def phone_to_text(match):
233
+ phone = match.group(0)
234
+ phone = re.sub(r'[^\d]', '', phone)
235
+
236
+ if phone.startswith('84') and len(phone) >= 10:
237
+ phone = '0' + phone[2:]
238
+
239
+ if 10 <= len(phone) <= 11:
240
+ words = [self.digits[int(d)] for d in phone]
241
+ return ' '.join(words) + ' '
242
+
243
+ return match.group(0)
244
+
245
+ text = re.sub(r'(\+84|84)[\s\-\.]?\d[\d\s\-\.]{7,}', phone_to_text, text)
246
+ text = re.sub(r'\b0\d[\d\s\-\.]{8,}', phone_to_text, text)
247
+ return text
248
+
249
+ def _normalize_numbers(self, text):
250
+ text = re.sub(r'(\d+(?:[,.]\d+)?)%', lambda m: f'{m.group(1)} phần trăm', text)
251
+ # 1. Xóa dấu thousand separator trước
252
+ text = re.sub(r'(\d{1,3})(?:\.(\d{3}))+', lambda m: m.group(0).replace('.', ''), text)
253
+
254
+ # 2. Chuyển số thập phân thành chữ
255
+ def decimal_to_words(match):
256
+ whole = match.group(1)
257
+ decimal = match.group(2)
258
+ decimal_words = ' '.join([self.digits[int(d)] for d in decimal])
259
+ separator = 'phẩy' if ',' in match.group(0) else 'chấm'
260
+ return f"{whole} {separator} {decimal_words}"
261
+
262
+ # 2a. Dấu phẩy
263
+ text = re.sub(r'(\d+),(\d+)', decimal_to_words, text)
264
+ # 2b. Dấu chấm (1-2 chữ số thập phân)
265
+ text = re.sub(r'(\d+)\.(\d{1,2})\b', decimal_to_words, text)
266
+
267
+ return text
268
+
269
+ def _read_two_digits(self, n):
270
+ """Read two-digit numbers in Vietnamese."""
271
+ if n < 10:
272
+ return self.digits[n]
273
+ elif n == 10:
274
+ return "mười"
275
+ elif n < 20:
276
+ if n == 15:
277
+ return "mười lăm"
278
+ return f"mười {self.digits[n % 10]}"
279
+ else:
280
+ tens = n // 10
281
+ ones = n % 10
282
+ if ones == 0:
283
+ return f"{self.digits[tens]} mươi"
284
+ elif ones == 1:
285
+ return f"{self.digits[tens]} mươi mốt"
286
+ elif ones == 5:
287
+ return f"{self.digits[tens]} mươi lăm"
288
+ else:
289
+ return f"{self.digits[tens]} mươi {self.digits[ones]}"
290
+
291
+ def _read_three_digits(self, n):
292
+ """Read three-digit numbers in Vietnamese."""
293
+ if n < 100:
294
+ return self._read_two_digits(n)
295
+
296
+ hundreds = n // 100
297
+ remainder = n % 100
298
+ result = f"{self.digits[hundreds]} trăm"
299
+
300
+ if remainder == 0:
301
+ return result
302
+ elif remainder < 10:
303
+ result += f" lẻ {self.digits[remainder]}"
304
+ else:
305
+ result += f" {self._read_two_digits(remainder)}"
306
+
307
+ return result
308
+
309
+ def _convert_number_to_words(self, num):
310
+ """Convert a number to Vietnamese words."""
311
+ if num == 0:
312
+ return "không"
313
+
314
+ if num < 0:
315
+ return f"âm {self._convert_number_to_words(-num)}"
316
+
317
+ if num >= 1000000000:
318
+ billion = num // 1000000000
319
+ remainder = num % 1000000000
320
+ result = f"{self._read_three_digits(billion)} tỷ"
321
+ if remainder > 0:
322
+ result += f" {self._convert_number_to_words(remainder)}"
323
+ return result
324
+
325
+ elif num >= 1000000:
326
+ million = num // 1000000
327
+ remainder = num % 1000000
328
+ result = f"{self._read_three_digits(million)} triệu"
329
+ if remainder > 0:
330
+ result += f" {self._convert_number_to_words(remainder)}"
331
+ return result
332
+
333
+ elif num >= 1000:
334
+ thousand = num // 1000
335
+ remainder = num % 1000
336
+ result = f"{self._read_three_digits(thousand)} nghìn"
337
+ if remainder > 0:
338
+ if remainder < 100:
339
+ result += f" không trăm {self._read_two_digits(remainder)}"
340
+ else:
341
+ result += f" {self._read_three_digits(remainder)}"
342
+ return result
343
+
344
+ else:
345
+ return self._read_three_digits(num)
346
+
347
+ def _number_to_words(self, text):
348
+ """Convert all remaining numbers to words."""
349
+ def convert_number(match):
350
+ num = int(match.group(0))
351
+ return self._convert_number_to_words(num)
352
+
353
+ text = re.sub(r'\b\d+\b', convert_number, text)
354
+ return text
355
+
356
+ def _normalize_special_chars(self, text):
357
+ """Handle special characters."""
358
+ text = text.replace('&', ' và ')
359
+ text = text.replace('+', ' cộng ')
360
+ text = text.replace('=', ' bằng ')
361
+ text = text.replace('#', ' thăng ')
362
+ text = re.sub(r'[\[\]\(\)\{\}]', ' ', text)
363
+ text = re.sub(r'\s+[-–—]+\s+', ' ', text)
364
+ text = re.sub(r'\.{2,}', ' ', text)
365
+ text = re.sub(r'\s+\.\s+', ' ', text)
366
+ text = re.sub(r'[^\w\sàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđ.,!?;:@%]', ' ', text)
367
+ return text
368
+
369
+ def _normalize_whitespace(self, text):
370
+ """Normalize whitespace."""
371
+ text = re.sub(r'\s+', ' ', text)
372
+ text = text.strip()
373
+ return text
374
+
375
+
376
if __name__ == "__main__":
    # Smoke-test the normalizer against a spread of tricky inputs:
    # prices, phones, units, dates, times, decimals and negatives.
    normalizer = VietnameseTTSNormalizer()

    samples = (
        "Giá 2.500.000đ (giảm 50%), mua trước 14h30 ngày 15/12/2025",
        "Liên hệ: 0912-345-678 hoặc email@example.com",
        "Tốc độ 120km/h, trọng lượng 75kg",
        "Nhiệt độ 36,5°C, độ ẩm 80%",
        "Số pi = 3,14159",
        "Giá trị tăng 2.5M, đạt 10B",
        "Nhiệt độ -15°C vào mùa đông",
        "Điện áp 220V, công suất 2.5kW, tần số 50Hz",
        "Tôi đi lấy l nước về nhà",
        "Cần 5l nước cho công thức này",
        "Vận tốc ánh sáng 299792km/s",
        "Mật độ dân số 450 người/km2",
        "Công suất 100 W/m2",
        "Hôm nay 2025-01-15",
        "Gọi +84 912 345 678",
        "Nhiệt độ 25°C lúc 14:30:45",
        "Ngày 15/12/25",
        "Giá 3.140.159",
    )

    banner = "=" * 80
    print(banner)
    print("VIETNAMESE TTS NORMALIZATION TEST")
    print(banner)

    for sample in samples:
        print(f"\n📝 Input: {sample}")
        print(f"🎵 Output: {normalizer.normalize(sample)}")
        print("-" * 80)
utils/phoneme_dict.json ADDED
The diff for this file is too large to render. See raw diff
 
utils/phonemize_text.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import platform
4
+ import glob
5
+ from phonemizer import phonemize
6
+ from phonemizer.backend.espeak.espeak import EspeakWrapper
7
+ from utils.normalize_text import VietnameseTTSNormalizer
8
+
9
+ # Configuration
10
+ PHONEME_DICT_PATH = os.getenv(
11
+ 'PHONEME_DICT_PATH',
12
+ os.path.join(os.path.dirname(__file__), "phoneme_dict.json")
13
+ )
14
+
15
def load_phoneme_dict(path=PHONEME_DICT_PATH):
    """Read the word -> phoneme mapping from *path* (a JSON file)."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except FileNotFoundError:
        # Re-raise with an actionable message instead of the bare OS error.
        raise FileNotFoundError(
            f"Phoneme dictionary not found at {path}. "
            "Please create it or set PHONEME_DICT_PATH environment variable."
        )
25
+
26
def setup_espeak_library():
    """Configure the eSpeak shared-library path for the current OS."""
    handlers = {
        "Windows": _setup_windows_espeak,
        "Linux": _setup_linux_espeak,
        "Darwin": _setup_macos_espeak,
    }
    system = platform.system()
    handler = handlers.get(system)
    if handler is None:
        raise OSError(
            f"Unsupported OS: {system}. "
            "Only Windows, Linux, and macOS are supported."
        )
    handler()
41
+
42
def _setup_windows_espeak():
    """Point phonemizer at the default eSpeak NG DLL install on Windows."""
    dll_path = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
    if not os.path.exists(dll_path):
        raise FileNotFoundError(
            f"eSpeak library not found at {dll_path}. "
            "Please install eSpeak NG from: https://github.com/espeak-ng/espeak-ng/releases"
        )
    EspeakWrapper.set_library(dll_path)
52
+
53
def _setup_linux_espeak():
    """Locate libespeak-ng on common Linux library paths."""
    candidate_globs = (
        "/usr/lib/x86_64-linux-gnu/libespeak-ng.so*",
        "/usr/lib/x86_64-linux-gnu/libespeak.so*",
        "/usr/lib/libespeak-ng.so*",
        "/usr/lib64/libespeak-ng.so*",
        "/usr/local/lib/libespeak-ng.so*",
    )

    for pattern in candidate_globs:
        hits = glob.glob(pattern)
        if hits:
            # Shortest path wins: prefers the unversioned .so symlink.
            EspeakWrapper.set_library(min(hits, key=len))
            return

    raise RuntimeError(
        "eSpeak NG library not found. Install with:\n"
        " Ubuntu/Debian: sudo apt-get install espeak-ng\n"
        " Fedora: sudo dnf install espeak-ng\n"
        " Arch: sudo pacman -S espeak-ng\n"
        "See: https://github.com/pnnbao97/VieNeu-TTS/issues/5"
    )
76
+
77
def _setup_macos_espeak():
    """Locate libespeak-ng on macOS (env override, Homebrew, MacPorts)."""
    candidates = (
        os.environ.get('PHONEMIZER_ESPEAK_LIBRARY'),  # explicit override wins
        "/opt/homebrew/lib/libespeak-ng.dylib",       # Apple Silicon Homebrew
        "/usr/local/lib/libespeak-ng.dylib",          # Intel Homebrew
        "/opt/local/lib/libespeak-ng.dylib",          # MacPorts
    )

    for candidate in candidates:
        if candidate and os.path.exists(candidate):
            EspeakWrapper.set_library(candidate)
            return

    raise FileNotFoundError(
        "eSpeak library not found. Install with:\n"
        " brew install espeak-ng\n"
        "Or set: export PHONEMIZER_ESPEAK_LIBRARY=/path/to/libespeak-ng.dylib"
    )
98
+
99
# Initialize module-level state at import time: eSpeak must be configured
# before any phonemize() call, and the dictionary/normalizer instances are
# shared by both public functions below.
try:
    setup_espeak_library()
    phoneme_dict = load_phoneme_dict()
    normalizer = VietnameseTTSNormalizer()
except Exception as e:
    # Surface the failure loudly but keep the original traceback intact.
    print(f"Initialization error: {e}")
    raise
107
+
108
def phonemize_text(text: str) -> str:
    """Normalize *text* and run it through the eSpeak phonemizer."""
    espeak_options = dict(
        language="vi",
        backend="espeak",
        preserve_punctuation=True,
        with_stress=True,
        language_switch="remove-flags",
    )
    return phonemize(normalizer.normalize(text), **espeak_options)
119
+
120
def phonemize_with_dict(text: str, phoneme_dict=phoneme_dict) -> str:
    """Phonemize *text*, preferring dictionary lookups over eSpeak calls.

    Unknown words are phonemized on the fly and memoized into
    *phoneme_dict*, so repeated words cost a single eSpeak call.
    """

    def lookup(word):
        if word in phoneme_dict:
            return phoneme_dict[word]
        try:
            phones = phonemize(
                word,
                language='vi',
                backend='espeak',
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )

            # Force 'ɹ' as the first phoneme of r-initial words —
            # presumably to match the voice's training data; confirm
            # against the model card before changing.
            if word.lower().startswith('r'):
                phones = 'ɹ' + phones[1:]

            phoneme_dict[word] = phones
            return phones
        except Exception as e:
            # Best-effort fallback: keep the raw word rather than failing.
            print(f"Warning: Could not phonemize '{word}': {e}")
            return word

    normalized = normalizer.normalize(text)
    return ' '.join(lookup(word) for word in normalized.split())
vieneu_tts/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .vieneu_tts import VieNeuTTS
2
+
3
+ __all__ = ["VieNeuTTS"]
4
+
vieneu_tts/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (207 Bytes). View file
 
vieneu_tts/__pycache__/vieneu_tts.cpython-312.pyc ADDED
Binary file (17 kB). View file
 
vieneu_tts/vieneu_tts.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Generator
3
+ import librosa
4
+ import numpy as np
5
+ import torch
6
+ from neucodec import NeuCodec, DistillNeuCodec
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
+ from utils.phonemize_text import phonemize_with_dict
9
+ import re
10
+
11
+ def _linear_overlap_add(frames: list[np.ndarray], stride: int) -> np.ndarray:
12
+ # original impl --> https://github.com/facebookresearch/encodec/blob/main/encodec/utils.py
13
+ assert len(frames)
14
+ dtype = frames[0].dtype
15
+ shape = frames[0].shape[:-1]
16
+
17
+ total_size = 0
18
+ for i, frame in enumerate(frames):
19
+ frame_end = stride * i + frame.shape[-1]
20
+ total_size = max(total_size, frame_end)
21
+
22
+ sum_weight = np.zeros(total_size, dtype=dtype)
23
+ out = np.zeros(*shape, total_size, dtype=dtype)
24
+
25
+ offset: int = 0
26
+ for frame in frames:
27
+ frame_length = frame.shape[-1]
28
+ t = np.linspace(0, 1, frame_length + 2, dtype=dtype)[1:-1]
29
+ weight = np.abs(0.5 - (t - 0.5))
30
+
31
+ out[..., offset : offset + frame_length] += weight * frame
32
+ sum_weight[offset : offset + frame_length] += weight
33
+ offset += stride
34
+ assert sum_weight.min() > 0
35
+ return out / sum_weight
36
+
37
class VieNeuTTS:
    """Vietnamese text-to-speech pipeline.

    A token-generating backbone LM (an HF transformers checkpoint, or a GGUF
    checkpoint run through llama.cpp) emits ``<|speech_N|>`` codec tokens,
    which a NeuCodec decoder turns back into a 24 kHz waveform. Voice cloning
    is driven by reference codes produced with :meth:`encode_reference`.
    Streaming synthesis is only available with the GGUF (llama.cpp) backbone.
    """

    def __init__(
        self,
        backbone_repo="pnnbao-ump/VieNeu-TTS",
        backbone_device="cpu",
        codec_repo="neuphonic/neucodec",
        codec_device="cpu",
    ):

        # Constants
        self.sample_rate = 24_000  # output audio rate (Hz)
        self.max_context = 2048  # LM context window, in tokens
        self.hop_length = 480  # audio samples produced per speech token
        self.streaming_overlap_frames = 1  # frames kept either side for cross-fading
        self.streaming_frames_per_chunk = 25  # new tokens decoded per streamed chunk
        self.streaming_lookforward = 5  # future frames given to the codec for context
        self.streaming_lookback = 50  # past frames given to the codec for context
        self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length

        # ggml & onnx flags
        self._is_quantized_model = False
        self._is_onnx_codec = False

        # HF tokenizer (only populated for the torch / non-GGUF backbone)
        self.tokenizer = None

        # Load models
        self._load_backbone(backbone_repo, backbone_device)
        self._load_codec(codec_repo, codec_device)

    def _load_backbone(self, backbone_repo, backbone_device):
        """Load the LM: llama.cpp for repos named *gguf*, HF transformers otherwise."""
        print(f"Loading backbone from: {backbone_repo} on {backbone_device} ...")

        if backbone_repo.lower().endswith("gguf") or "gguf" in backbone_repo.lower():
            try:
                from llama_cpp import Llama
            except ImportError as e:
                raise ImportError(
                    "Failed to import `llama_cpp`. "
                    "Please install it with:\n"
                    " pip install llama-cpp-python"
                ) from e
            self.backbone = Llama.from_pretrained(
                repo_id=backbone_repo,
                filename="*.gguf",
                verbose=False,
                n_gpu_layers=-1 if backbone_device == "gpu" else 0,  # -1 = offload all layers
                n_ctx=self.max_context,
                mlock=True,
                flash_attn=True if backbone_device == "gpu" else False,
            )
            self._is_quantized_model = True

        else:
            # Torch path: standard HF causal LM plus its tokenizer.
            self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
            self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
                torch.device(backbone_device)
            )

    def _load_codec(self, codec_repo, codec_device):
        """Load the NeuCodec decoder variant selected by repo name."""
        print(f"Loading codec from: {codec_repo} on {codec_device} ...")
        match codec_repo:
            case "neuphonic/neucodec":
                self.codec = NeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/distill-neucodec":
                self.codec = DistillNeuCodec.from_pretrained(codec_repo)
                self.codec.eval().to(codec_device)
            case "neuphonic/neucodec-onnx-decoder":
                if codec_device != "cpu":
                    raise ValueError("Onnx decoder only currently runs on CPU.")
                try:
                    from neucodec import NeuCodecOnnxDecoder
                except ImportError as e:
                    raise ImportError(
                        "Failed to import the onnx decoder."
                        "Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
                    ) from e
                self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
                self._is_onnx_codec = True
            case _:
                raise ValueError(f"Unsupported codec repository: {codec_repo}")

    def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
        """
        Perform inference to generate speech from text using the TTS model and reference audio.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.tensor): Encoded reference.
            ref_text (str): Reference text for reference audio. Defaults to None.
        Returns:
            np.ndarray: Generated speech waveform.
        """

        # Generate tokens (backend chosen at construction time)
        if self._is_quantized_model:
            output_str = self._infer_ggml(ref_codes, ref_text, text)
        else:
            prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
            output_str = self._infer_torch(prompt_ids)

        # Decode the generated <|speech_N|> tokens into audio
        wav = self._decode(output_str)

        return wav

    def infer_stream(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> Generator[np.ndarray, None, None]:
        """
        Perform streaming inference to generate speech from text using the TTS model and reference audio.

        Args:
            text (str): Input text to be converted to speech.
            ref_codes (np.ndarray | torch.tensor): Encoded reference.
            ref_text (str): Reference text for reference audio. Defaults to None.
        Yields:
            np.ndarray: Generated speech waveform.
        """

        if self._is_quantized_model:
            return self._infer_stream_ggml(ref_codes, ref_text, text)
        else:
            raise NotImplementedError("Streaming is not implemented for the torch backend!")

    def encode_reference(self, ref_audio_path: str | Path):
        """Encode a reference audio file into codec tokens for voice cloning."""
        # The codec encoder consumes 16 kHz mono audio.
        wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
        wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)  # [1, 1, T]
        with torch.no_grad():
            ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
        return ref_codes

    def _decode(self, codes: str):
        """Decode speech tokens to audio waveform."""
        # Extract speech token IDs using regex
        speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]

        if len(speech_ids) == 0:
            raise ValueError(
                "No valid speech tokens found in the output. "
                "The model may not have generated proper speech tokens."
            )

        # Onnx decode
        if self._is_onnx_codec:
            codes = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
            recon = self.codec.decode_code(codes)
        # Torch decode
        else:
            with torch.no_grad():
                codes = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
                    self.codec.device
                )
                recon = self.codec.decode_code(codes).cpu().numpy()

        # Drop the batch and channel axes -> 1-D waveform.
        return recon[0, 0, :]

    def _apply_chat_template(self, ref_codes: list[int], ref_text: str, input_text: str) -> list[int]:
        """Build the torch-backbone prompt: chat scaffold + phonemized text + reference codes."""
        # Reference transcript is prepended so the model continues in the same voice.
        input_text = phonemize_with_dict(ref_text) + " " + phonemize_with_dict(input_text)

        speech_replace = self.tokenizer.convert_tokens_to_ids("<|SPEECH_REPLACE|>")
        speech_gen_start = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
        text_replace = self.tokenizer.convert_tokens_to_ids("<|TEXT_REPLACE|>")
        text_prompt_start = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_START|>")
        text_prompt_end = self.tokenizer.convert_tokens_to_ids("<|TEXT_PROMPT_END|>")

        input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
        chat = """user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"""
        ids = self.tokenizer.encode(chat)

        # Splice the phonemized text (wrapped in prompt markers) in place of
        # the <|TEXT_REPLACE|> placeholder.
        text_replace_idx = ids.index(text_replace)
        ids = (
            ids[:text_replace_idx]
            + [text_prompt_start]
            + input_ids
            + [text_prompt_end]
            + ids[text_replace_idx + 1 :]  # noqa
        )

        # Replace <|SPEECH_REPLACE|> with the generation-start marker followed
        # by the reference speech codes; the model continues from there.
        speech_replace_idx = ids.index(speech_replace)
        codes_str = "".join([f"<|speech_{i}|>" for i in ref_codes])
        codes = self.tokenizer.encode(codes_str, add_special_tokens=False)
        ids = ids[:speech_replace_idx] + [speech_gen_start] + list(codes)

        return ids

    def _infer_torch(self, prompt_ids: list[int]) -> str:
        """Sample speech tokens from the HF backbone and return the decoded string."""
        prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(self.backbone.device)
        speech_end_id = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
        with torch.no_grad():
            output_tokens = self.backbone.generate(
                prompt_tensor,
                max_length=self.max_context,
                eos_token_id=speech_end_id,
                do_sample=True,
                temperature=1.0,
                top_k=50,
                use_cache=True,
                min_new_tokens=50,
            )
        # Keep only the newly generated tokens (strip the prompt prefix).
        input_length = prompt_tensor.shape[-1]
        output_str = self.tokenizer.decode(
            output_tokens[0, input_length:].cpu().numpy().tolist(), add_special_tokens=False
        )
        return output_str

    def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
        """Sample speech tokens from the llama.cpp backbone (non-streaming)."""
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        # llama.cpp takes the chat scaffold as a literal string prompt.
        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )
        output = self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=1.0,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
        )
        output_str = output["choices"][0]["text"]
        return output_str

    def _infer_stream_ggml(self, ref_codes: torch.Tensor, ref_text: str, input_text: str) -> Generator[np.ndarray, None, None]:
        """Stream audio chunks from the llama.cpp backbone.

        Tokens are accumulated and decoded in fixed-size chunks with lookback/
        lookforward context; overlapping decodes are cross-faded with
        :func:`_linear_overlap_add` to avoid clicks at chunk boundaries.
        """
        ref_text = phonemize_with_dict(ref_text)
        input_text = phonemize_with_dict(input_text)

        codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
        prompt = (
            f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
            f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
        )

        audio_cache: list[np.ndarray] = []  # decoded (overlapping) audio chunks
        token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]  # all tokens so far, incl. reference
        n_decoded_samples: int = 0  # samples already yielded
        n_decoded_tokens: int = len(ref_codes)  # tokens already covered by yielded audio

        for item in self.backbone(
            prompt,
            max_tokens=self.max_context,
            temperature=1.0,
            top_k=50,
            stop=["<|SPEECH_GENERATION_END|>"],
            stream=True
        ):
            output_str = item["choices"][0]["text"]
            token_cache.append(output_str)

            # Decode once enough new tokens (chunk + lookforward) have arrived.
            if len(token_cache[n_decoded_tokens:]) >= self.streaming_frames_per_chunk + self.streaming_lookforward:

                # decode chunk: include lookback/lookforward context so the
                # codec sees surrounding frames, then trim to the new region
                # (plus overlap frames for the cross-fade).
                tokens_start = max(
                    n_decoded_tokens
                    - self.streaming_lookback
                    - self.streaming_overlap_frames,
                    0
                )
                tokens_end = (
                    n_decoded_tokens
                    + self.streaming_frames_per_chunk
                    + self.streaming_lookforward
                    + self.streaming_overlap_frames
                )
                sample_start = (
                    n_decoded_tokens - tokens_start
                ) * self.hop_length
                sample_end = (
                    sample_start
                    + (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames) * self.hop_length
                )
                curr_codes = token_cache[tokens_start:tokens_end]
                recon = self._decode("".join(curr_codes))
                recon = recon[sample_start:sample_end]
                audio_cache.append(recon)

                # postprocess: cross-fade all chunks, then emit only the
                # samples not yielded yet.
                processed_recon = _linear_overlap_add(
                    audio_cache, stride=self.streaming_stride_samples
                )
                new_samples_end = len(audio_cache) * self.streaming_stride_samples
                processed_recon = processed_recon[
                    n_decoded_samples:new_samples_end
                ]
                n_decoded_samples = new_samples_end
                n_decoded_tokens += self.streaming_frames_per_chunk
                yield processed_recon

        # final decoding handled separately as non-constant chunk size
        remaining_tokens = len(token_cache) - n_decoded_tokens
        if len(token_cache) > n_decoded_tokens:
            tokens_start = max(
                len(token_cache)
                - (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
                0
            )
            sample_start = (
                len(token_cache)
                - tokens_start
                - remaining_tokens
                - self.streaming_overlap_frames
            ) * self.hop_length
            curr_codes = token_cache[tokens_start:]
            recon = self._decode("".join(curr_codes))
            recon = recon[sample_start:]
            audio_cache.append(recon)

        # Flush everything not yet emitted (also covers the no-remainder case).
        processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
        processed_recon = processed_recon[n_decoded_samples:]
        yield processed_recon