Spaces:

LTTEAM
/

TTS-82M

Sleeping

File size: 8,966 Bytes

6e0ee41
 
f38f765
 
8bb312c
a2462e9
 
 
a16a0e6
8bb312c
 
a2462e9
 
3960baa
8bb312c
b76f6b2
a2462e9
8bb312c
b76f6b2
a2462e9
8bb312c
 
 
3960baa
 
6550a56
8bb312c
 
df31591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f38f765
8bb312c
 
 
a2462e9
 
f38f765
8bb312c
 
 
a2462e9
8bb312c
 
f38f765
8bb312c
 
 
 
 
f38f765
 
6550a56
 
 
 
8bb312c
 
f38f765
3960baa
 
8bb312c
3960baa
8bb312c
 
b76f6b2
8bb312c
b76f6b2
8bb312c
 
b76f6b2
8bb312c
 
b76f6b2
a2462e9
8bb312c
 
 
 
a2462e9
8bb312c
 
6550a56
 
 
8bb312c
6550a56
8bb312c
6550a56
8bb312c
6550a56
8bb312c
 
6550a56
8bb312c
 
6550a56
a2462e9
8bb312c
6550a56
 
8bb312c
 
 
 
 
 
a2462e9
8bb312c
 
 
 
a2462e9
8bb312c
 
 
 
 
 
 
 
 
a2462e9
 
8bb312c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f38f765

import os
import random
import re
import numpy as np
import torch
import spaces
from kokoro import KModel, KPipeline
import gradio as gr

# --- General configuration ---
# No character limit is applied to the input text.
CHAR_LIMIT = None
# True when PyTorch reports a usable CUDA device.
CUDA_AVAILABLE = torch.cuda.is_available()
# True when the SPACE_ID environment variable names a Space under "LTTEAM/".
IS_LTTEAM = os.environ.get('SPACE_ID', '').startswith('LTTEAM/')

# Instantiate the model once per device, keyed by a "use GPU" flag:
# models[False] lives on the CPU and always exists; models[True] lives on
# CUDA and is only created when CUDA is available.
models = {False: KModel().to('cpu').eval()}
if CUDA_AVAILABLE:
    models[True] = KModel().to('cuda').eval()

# Build one pipeline per language code, 'a' and 'b'. They are created with
# model=False (presumably text/phoneme processing only, with inference done by
# `models` -- confirm against the kokoro documentation).
pipelines = {
    'a': KPipeline(lang_code='a', model=False),
    'b': KPipeline(lang_code='b', model=False),
}
# Example of a custom lexicon entry: set the phonemes used for "kokoro".
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'

# Voice list: display label (flag + icon + name) -> internal voice code.
# The first character of each code ('a' or 'b') is the key of the pipeline
# that handles the voice.
LUA_CHON_GIONG = {
'🇺🇸 👩 Heart ❤️ (Mỹ)':    'af_heart',
'🇺🇸 👩 Bella 🔥 (Mỹ)':    'af_bella',
'🇺🇸 👩 Nicole 🎧 (Mỹ)':   'af_nicole',
'🇺🇸 👩 Aoede (Mỹ)':       'af_aoede',
'🇺🇸 👩 Kore (Mỹ)':        'af_kore',
'🇺🇸 👩 Sarah (Mỹ)':       'af_sarah',
'🇺🇸 👩 Nova (Mỹ)':        'af_nova',
'🇺🇸 👩 Sky (Mỹ)':         'af_sky',
'🇺🇸 👩 Alloy (Mỹ)':       'af_alloy',
'🇺🇸 👩 Jessica (Mỹ)':     'af_jessica',
'🇺🇸 👩 River (Mỹ)':       'af_river',

'🇺🇸 👨 Michael (Mỹ)':     'am_michael',
'🇺🇸 👨 Fenrir (Mỹ)':      'am_fenrir',
'🇺🇸 👨 Puck (Mỹ)':        'am_puck',
'🇺🇸 👨 Echo (Mỹ)':        'am_echo',
'🇺🇸 👨 Eric (Mỹ)':        'am_eric',
'🇺🇸 👨 Liam (Mỹ)':        'am_liam',
'🇺🇸 👨 Onyx (Mỹ)':        'am_onyx',
'🇺🇸 👨 Santa (Mỹ)':       'am_santa',
'🇺🇸 👨 Adam (Mỹ)':        'am_adam',

'🇬🇧 👩 Emma (Anh)':        'bf_emma',
'🇬🇧 👩 Isabella (Anh)':    'bf_isabella',
'🇬🇧 👩 Alice (Anh)':       'bf_alice',
'🇬🇧 👩 Lily (Anh)':        'bf_lily',

'🇬🇧 👨 George (Anh)':      'bm_george',
'🇬🇧 👨 Fable (Anh)':       'bm_fable',
'🇬🇧 👨 Lewis (Anh)':       'bm_lewis',
'🇬🇧 👨 Daniel (Anh)':      'bm_daniel',
}
# Preload every voice at import time through the pipeline selected by the
# first character of its code.
for voice_code in LUA_CHON_GIONG.values():
    pipelines[voice_code[0]].load_voice(voice_code)

# --- Utility functions ---
def _split_oversized(sentence, max_chars):
    """Yield pieces of *sentence* that are each at most *max_chars* long.

    A sentence that already fits is yielded unchanged. Otherwise it is broken
    at whitespace, and any single word longer than *max_chars* is sliced.
    """
    if len(sentence) <= max_chars:
        yield sentence
        return
    piece = ""
    for word in sentence.split():
        # A single word that cannot fit anywhere is cut into fixed-size slices.
        while len(word) > max_chars:
            if piece:
                yield piece
                piece = ""
            yield word[:max_chars]
            word = word[max_chars:]
        if not piece:
            piece = word
        elif len(piece) + 1 + len(word) <= max_chars:
            piece = f"{piece} {word}"
        else:
            yield piece
            piece = word
    if piece:
        yield piece


def split_into_chunks(text, max_chars=2000):
    """Split text into chunks of at most ``max_chars`` characters.

    The text is first split after sentence-ending punctuation (``.``, ``!``,
    ``?``) followed by whitespace, and consecutive sentences are packed
    greedily into a chunk, joined by single spaces. A sentence that is itself
    longer than ``max_chars`` is broken at word boundaries (or sliced, for a
    single over-long word) so that no chunk ever exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters; must be at least 1.

    Returns:
        A list of non-empty chunk strings; empty for blank input.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    sentences = re.split(r'(?<=[\.!\?])\s+', text.strip())
    chunks, current = [], ""
    for sentence in sentences:
        for piece in _split_oversized(sentence, max_chars):
            # +1 accounts for the space that joins `current` and `piece`.
            if len(current) + len(piece) + 1 <= max_chars:
                current = f"{current} {piece}".strip()
            else:
                if current:
                    chunks.append(current)
                current = piece
    if current:
        chunks.append(current)
    return chunks

@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Run one inference call on the CUDA copy of the model (``models[True]``).

    The arguments are forwarded unchanged and the model's result is returned.
    NOTE(review): ``spaces.GPU(duration=30)`` presumably requests a GPU slot
    for up to 30 seconds per call -- confirm against the ``spaces`` docs.
    """
    gpu_model = models[True]
    return gpu_model(ps, ref_s, speed)

def generate_unlimited(text, voice, speed, use_gpu, max_chars=2000):
    """Unlimited mode: split the text into chunks, then join the audio.

    The text is split with ``split_into_chunks``; every segment yielded by the
    voice's pipeline is synthesized, and 0.2 s of silence is appended after
    each chunk. If GPU inference raises ``gr.Error``, a warning is shown and
    that segment is retried on the CPU model.

    Args:
        text: Input text; ``None`` is treated as empty.
        voice: Internal voice code (e.g. ``'af_heart'``); its first character
            selects the pipeline.
        speed: Speed value forwarded to the pipeline and to the model.
        use_gpu: Request GPU inference; ignored when CUDA is unavailable.
        max_chars: Maximum chunk length passed to ``split_into_chunks``.

    Returns:
        A ``(24000, samples)`` tuple, where ``samples`` is the concatenation
        of all synthesized arrays and silence gaps along axis 0.

    Raises:
        gr.Error: If no audio was produced (for example, empty input), or if
            CPU inference itself raises ``gr.Error``.
    """
    sample_rate = 24000
    pause_samples = int(0.2 * sample_rate)

    text = (text or "").strip()
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE

    all_audio = []
    for chunk in split_into_chunks(text, max_chars):
        for _, ps, _ in pipeline(chunk, voice, speed):
            ref_s = pack[len(ps) - 1]
            try:
                audio = forward_gpu(ps, ref_s, speed) if use_gpu else models[False](ps, ref_s, speed)
            except gr.Error as e:
                if use_gpu:
                    gr.Warning(f"Lỗi GPU: {e}\nChuyển sang CPU cho khúc này.")
                    audio = models[False](ps, ref_s, speed)
                else:
                    raise
            all_audio.append(audio.numpy())
        # Add 0.2 s of silence after the chunk. It takes the dtype of the
        # preceding audio so the final concatenation is not upcast to float64.
        if all_audio:
            all_audio.append(np.zeros(pause_samples, dtype=all_audio[-1].dtype))

    # np.concatenate raises a bare ValueError on an empty list; surface a
    # user-facing error instead.
    if not all_audio:
        raise gr.Error("Không có âm thanh nào được tạo. Vui lòng nhập văn bản.")
    return (sample_rate, np.concatenate(all_audio, axis=0))

def generate_stream(text, voice, speed, use_gpu, max_chars=2000):
    """Streaming mode: yield the audio in small pieces as it is synthesized.

    The stripped text is split with ``split_into_chunks``. For every segment
    the voice's pipeline yields, one ``(24000, samples)`` tuple is yielded;
    after each chunk a further tuple holding 0.2 s of zeros is yielded. If GPU
    inference raises ``gr.Error``, a warning is shown and that segment is
    synthesized on the CPU model instead.

    Args:
        text: Input text.
        voice: Internal voice code; its first character selects the pipeline.
        speed: Speed value forwarded to the pipeline and to the model.
        use_gpu: Request GPU inference; ignored when CUDA is unavailable.
        max_chars: Maximum chunk length passed to ``split_into_chunks``.

    Yields:
        ``(24000, numpy_array)`` tuples.
    """
    cleaned = text.strip()
    g2p = pipelines[voice[0]]
    voice_pack = g2p.load_voice(voice)
    on_gpu = use_gpu and CUDA_AVAILABLE

    for piece in split_into_chunks(cleaned, max_chars):
        for _, phonemes, _ in g2p(piece, voice, speed):
            ref = voice_pack[len(phonemes) - 1]
            if on_gpu:
                try:
                    wave = forward_gpu(phonemes, ref, speed)
                except gr.Error as e:
                    # The GPU call failed: warn the user and retry on the CPU.
                    gr.Warning(f"Lỗi GPU: {e}\nChuyển sang CPU cho khúc này.")
                    wave = models[False](phonemes, ref, speed)
            else:
                # CPU path: any error propagates to the caller unchanged.
                wave = models[False](phonemes, ref, speed)
            yield 24000, wave.numpy()
        # 0.2 s of silence between chunks.
        yield 24000, np.zeros(int(0.2 * 24000))

def tokenize_first(text, voice):
    """Return the ``ps`` field of the first segment the pipeline yields.

    The pipeline is chosen by the first character of ``voice``. Returns an
    empty string when the pipeline yields nothing.
    NOTE(review): ``ps`` is presumably the phoneme/token string of the
    segment -- confirm against the kokoro pipeline documentation.
    """
    segments = pipelines[voice[0]](text, voice)
    return next((ps for _, ps, _ in segments), "")

# Sample texts
# Load the random-quote pool once at import time: one quote per line of
# en.txt, stripped of surrounding whitespace. Blank lines are skipped so an
# empty quote can never be picked, and the encoding is explicit so decoding
# does not depend on the host locale.
with open('en.txt', 'r', encoding='utf-8') as f:
    TRICH_DAN_NGAU_NHIEN = [quote for quote in map(str.strip, f) if quote]
def random_quote():
    """Return one entry of ``TRICH_DAN_NGAU_NHIEN`` chosen at random."""
    return random.choice(TRICH_DAN_NGAU_NHIEN)
def load_gatsby():
    """Return the full contents of ``gatsby5k.md`` as a string.

    The file is opened in a ``with`` block so the handle is always closed, and
    it is decoded as UTF-8 regardless of the host locale.
    """
    with open('gatsby5k.md', 'r', encoding='utf-8') as fh:
        return fh.read()
def load_frank():
    """Return the full contents of ``frankenstein5k.md`` as a string.

    The file is opened in a ``with`` block so the handle is always closed, and
    it is decoded as UTF-8 regardless of the host locale.
    """
    with open('frankenstein5k.md', 'r', encoding='utf-8') as fh:
        return fh.read()

# --- Gradio interface ---
# Markdown banner shown via gr.Markdown at the top of the app. The two
# trailing spaces after the first two text lines are part of the string
# (Markdown hard line breaks) and must be kept.
BANNER = """
# 📣 **TTS-82M**  
Mô hình TTS 82M tham số do LTTEAM mở.  
[Tham gia nhóm FB](https://www.facebook.com/groups/622526090937760)
"""

# Build the two-tab UI, wire the event handlers, and launch the app.
with gr.Blocks() as app:
    gr.Markdown(BANNER)

    with gr.Tabs():
        # Tab 1: unlimited-length synthesis (whole result returned at once)
        with gr.TabItem("📝 TTS Không Giới Hạn"):
            with gr.Row():
                with gr.Column(scale=6):
                    txt_in = gr.Textbox(label="Văn bản đầu vào", placeholder="Nhập hoặc dán văn bản...", lines=5)
                    with gr.Row():
                        # Choices are (display label, voice code) pairs taken from LUA_CHON_GIONG.
                        dd_voice = gr.Dropdown(list(LUA_CHON_GIONG.items()), value='af_heart', label="Chọn Giọng")
                        # Device selector; defaults to GPU and is only interactive when CUDA is available.
                        dd_hw = gr.Dropdown([('GPU (Nhanh)', True), ('CPU (Chậm)', False)],
                                            value=CUDA_AVAILABLE, label="Thiết bị xử lý", interactive=CUDA_AVAILABLE)
                    slider_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ phát âm")
                    with gr.Row():
                        btn_random = gr.Button("🎲 Trích ngẫu nhiên", variant='secondary')
                        btn_gatsby = gr.Button("📖 Gatsby dài", variant='secondary')
                        btn_frank = gr.Button("📖 Frankenstein dài", variant='secondary')

                with gr.Column(scale=6):
                    out_audio = gr.Audio(label="Kết quả âm thanh", interactive=False, autoplay=True)
                    # NOTE(review): no event handler in this block writes to out_tokens,
                    # so this box stays empty unless the handler sketched below is enabled.
                    out_tokens = gr.Textbox(label="Tokens đầu ra", interactive=False)
                    btn_generate = gr.Button("▶️ Chuyển đổi", variant='primary')

        # Tab 2: streaming synthesis (audio delivered piece by piece)
        with gr.TabItem("🔴 TTS Streaming"):
            with gr.Row():
                with gr.Column(scale=6):
                    txt_in2 = gr.Textbox(label="Văn bản đầu vào", placeholder="Nhập văn bản để phát trực tiếp...", lines=5)
                    with gr.Row():
                        dd_voice2 = gr.Dropdown(list(LUA_CHON_GIONG.items()), value='af_heart', label="Chọn Giọng")
                        dd_hw2 = gr.Dropdown([('GPU (Nhanh)', True), ('CPU (Chậm)', False)],
                                             value=CUDA_AVAILABLE, label="Thiết bị xử lý", interactive=CUDA_AVAILABLE)
                    slider_speed2 = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ phát âm")
                    btn_stream = gr.Button("🎙️ Bắt đầu Streaming", variant='primary')
                    btn_stop = gr.Button("⏹️ Dừng lại", variant='stop')
                with gr.Column(scale=6):
                    out_stream = gr.Audio(label="Phát trực tiếp", streaming=True, autoplay=True)

    # Tab 1 button events: the three sample buttons fill the input textbox.
    btn_random.click(fn=random_quote, inputs=[], outputs=[txt_in])
    btn_gatsby.click(fn=load_gatsby, inputs=[], outputs=[txt_in])
    btn_frank.click(fn=load_frank, inputs=[], outputs=[txt_in])

    btn_generate.click(fn=generate_unlimited,
                       inputs=[txt_in, dd_voice, slider_speed, dd_hw],
                       outputs=[out_audio])
    # To also display tokens, add: .click(fn=tokenize_first, inputs=[txt_in, dd_voice], outputs=[out_tokens])

    # Tab 2 events: start streaming, and let the stop button cancel the running stream.
    stream_event = btn_stream.click(fn=generate_stream,
                                    inputs=[txt_in2, dd_voice2, slider_speed2, dd_hw2],
                                    outputs=[out_stream])
    btn_stop.click(fn=None, cancels=[stream_event])

    # Launch. The API is opened and shown only when not running under an LTTEAM/ Space.
    # NOTE(review): launch() is called inside the `with gr.Blocks()` context and
    # without an `if __name__ == "__main__":` guard -- confirm this is intended.
    app.queue(api_open=not IS_LTTEAM).launch(
        show_api=not IS_LTTEAM,
        ssr_mode=True
    )