Spaces:

LTTEAM
/

TTS-82M

Paused

App Files Files Community

LTTEAM committed on Jun 13, 2025

Commit

8bb312c

verified ·

1 Parent(s): a2462e9

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -109

app.py CHANGED Viewed

@@ -1,30 +1,31 @@
 import os
 import random
 import re
-import torch
 import numpy as np
 import spaces
 from kokoro import KModel, KPipeline
 import gradio as gr
-# --- Cấu hình cơ bản ---
 IS_LTTEAM = os.getenv('SPACE_ID', '').startswith('LTTEAM/')
-CUDA = torch.cuda.is_available()
 CHAR_LIMIT = None  # Không giới hạn ký tự
-# Khởi tạo model CPU/GPU
 models = {
     use_gpu: KModel().to('cuda' if use_gpu else 'cpu').eval()
-    for use_gpu in ([False] + ([True] if CUDA else []))
 }
-# Chuẩn bị pipeline cho phoneme groups 'a' và 'b'
-pipelines = {g: KPipeline(lang_code=g, model=False) for g in ['a', 'b']}
 pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
 pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
-# Đăng ký toàn bộ giọng đọc
-VOICE_CHOICES = {
 '🇺🇸 👩 Heart ❤️ (Mỹ)':    'af_heart',
 '🇺🇸 👩 Bella 🔥 (Mỹ)':    'af_bella',
 '🇺🇸 👩 Nicole 🎧 (Mỹ)':   'af_nicole',
@@ -57,140 +58,151 @@ VOICE_CHOICES = {
 '🇬🇧 👨 Lewis (Anh)':       'bm_lewis',
 '🇬🇧 👨 Daniel (Anh)':      'bm_daniel',
 }
-for group in VOICE_CHOICES.values():
-    for _, voice_id in group:
-        pipelines[voice_id[0]].load_voice(voice_id)
 # --- Hàm tiện ích ---
 def split_into_chunks(text, max_chars=2000):
-    """Chia văn bản thành các đoạn không vượt quá max_chars."""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks, cur = [], ""
     for s in sentences:
-        if len(cur) + len(s) + 1 <= max_chars:
-            cur = f"{cur} {s}".strip()
         else:
-            chunks.append(cur)
-            cur = s
-    if cur:
-        chunks.append(cur)
     return chunks
 @spaces.GPU(duration=30)
 def forward_gpu(ps, ref_s, speed):
     return models[True](ps, ref_s, speed)
-def generate_audio(text, voice, speed, use_gpu):
-    """Tạo file audio ghép từ nhiều chunk."""
     text = text.strip()
     pipeline = pipelines[voice[0]]
     pack = pipeline.load_voice(voice)
-    use_gpu = use_gpu and CUDA
-    chunks = split_into_chunks(text)
-    audios = []
-    for chunk in chunks:
         for _, ps, _ in pipeline(chunk, voice, speed):
-            ref_s = pack[len(ps)-1]
             try:
-                out = forward_gpu(ps, ref_s, speed) if use_gpu else models[False](ps, ref_s, speed)
-            except Exception as e:
                 if use_gpu:
-                    gr.Warning(f"Lỗi GPU, chuyển sang CPU: {e}")
-                    out = models[False](ps, ref_s, speed)
                 else:
                     raise
-            audios.append(out.numpy())
-        # Thêm 0.2s im lặng giữa đoạn
-        audios.append(np.zeros(int(0.2 * 24000)))
-    return (24000, np.concatenate(audios, axis=0))
-def generate_stream(text, voice, speed, use_gpu):
-    """Phát trực tiếp từng đoạn audio."""
     text = text.strip()
     pipeline = pipelines[voice[0]]
     pack = pipeline.load_voice(voice)
-    use_gpu = use_gpu and CUDA
-    for chunk in split_into_chunks(text):
         for _, ps, _ in pipeline(chunk, voice, speed):
-            ref_s = pack[len(ps)-1]
             try:
-                out = forward_gpu(ps, ref_s, speed) if use_gpu else models[False](ps, ref_s, speed)
-            except Exception as e:
                 if use_gpu:
-                    gr.Warning(f"Lỗi GPU, chuyển sang CPU: {e}")
-                    out = models[False](ps, ref_s, speed)
                 else:
                     raise
-            yield 24000, out.numpy()
         yield 24000, np.zeros(int(0.2 * 24000))
-# --- Tải ví dụ văn bản ---
 with open('en.txt', 'r') as f:
-    QUOTES = [l.strip() for l in f]
-def random_quote(): return random.choice(QUOTES)
-def load_gatsby(): return open('gatsby5k.md','r').read().strip()
-def load_frank(): return open('frankenstein5k.md','r').read().strip()
-# --- Xây dựng giao diện Gradio ---
-with gr.Blocks(css="""
-    .block { max-width: 900px; margin: auto; }
-    .voice-group { margin-bottom: 10px; }
-""") as app:
-    gr.Markdown("""
-**TTS-82M** — Mô hình chuyển văn bản thành giọng nói 82M tham số, mở bởi LTTEAM.
-""")
-    with gr.Row():
-        with gr.Column(scale=2):
-            inp_txt = gr.Textbox(label="Văn bản đầu vào", placeholder="Nhập hoặc dán văn bản...", lines=4)
-            with gr.Accordion("Ví dụ nhanh", open=False):
-                gr.Button("📜 Trích dẫn ngẫu nhiên").click(random_quote, [], inp_txt)
-                gr.Button("📗 Gatsby dài").click(load_gatsby, [], inp_txt)
-                gr.Button("📕 Frankenstein").click(load_frank, [], inp_txt)
-            with gr.Row():
-                voice_dd = gr.Dropdown(
-                    label="Chọn giọng đọc",
-                    choices=[(f"{grp} — {name}", vid)
-                             for grp, lst in VOICE_CHOICES.items()
-                             for name, vid in lst],
-                    value='af_heart'
-                )
-                hw_dd = gr.Radio(
-                    ["GPU (Nhanh)", "CPU (Chậm)"],
-                    label="Phần cứng",
-                    value="GPU (Nhanh)" if CUDA else "CPU (Chậm)"
-                )
-            speed_sl = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Tốc độ phát")
-        with gr.Column(scale=1):
-            out_audio = gr.Audio(label="Kết quả âm thanh", interactive=False, streaming=False, autoplay=True)
-            btn_gen = gr.Button("▶️ Tạo âm thanh", variant="primary")
     with gr.Tabs():
-        with gr.TabItem("Phát trực tiếp"):
-            out_stream = gr.Audio(label="Phát trực tiếp", interactive=False, streaming=True, autoplay=True)
-            btn_stream = gr.Button("🔴 Bắt đầu", variant="primary")
-            btn_stop = gr.Button("⏹ Dừng", variant="stop")
-    # Liên kết sự kiện
-    btn_gen.click(
-        fn=generate_audio,
-        inputs=[inp_txt, voice_dd, speed_sl, hw_dd.map({"GPU (Nhanh)": True, "CPU (Chậm)": False})],
-        outputs=[out_audio]
-    )
-    btn_stream.click(
-        fn=generate_stream,
-        inputs=[inp_txt, voice_dd, speed_sl, hw_dd.map({"GPU (Nhanh)": True, "CPU (Chậm)": False})],
-        outputs=[out_stream]
-    ).then(
-        None, [], [], cancels=btn_stream
     )
-if __name__ == "__main__":
-    app.queue(api_open=not IS_LTTEAM).launch(show_api=not IS_LTTEAM, ssr_mode=True)

 import os
 import random
 import re
 import numpy as np
+import torch
 import spaces
 from kokoro import KModel, KPipeline
 import gradio as gr
# --- General configuration ---
CUDA_AVAILABLE = torch.cuda.is_available()
IS_LTTEAM = os.getenv('SPACE_ID', '').startswith('LTTEAM/')
CHAR_LIMIT = None  # None = no character limit

# One KModel per device, keyed by a "use GPU?" boolean. The CPU model (False)
# is always created; the CUDA model (True) only when CUDA is available.
models = {False: KModel().to('cpu').eval()}
if CUDA_AVAILABLE:
    models[True] = KModel().to('cuda').eval()

# Text pipelines keyed by lang_code ('a' and 'b'), built with model=False.
# Elsewhere in this file the first letter of a voice code selects the pipeline.
pipelines = {
    lang: KPipeline(lang_code=lang, model=False)
    for lang in ('a', 'b')
}

# Example of custom lexicon entries: the phonemes each pipeline uses for "kokoro".
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
+# Danh sách giọng nói (cờ + biểu tượng + tên) -> mã nội bộ
+LUA_CHON_GIONG = {
 '🇺🇸 👩 Heart ❤️ (Mỹ)':    'af_heart',
 '🇺🇸 👩 Bella 🔥 (Mỹ)':    'af_bella',
 '🇺🇸 👩 Nicole 🎧 (Mỹ)':   'af_nicole',
 '🇬🇧 👨 Lewis (Anh)':       'bm_lewis',
 '🇬🇧 👨 Daniel (Anh)':      'bm_daniel',
 }
# Preload every registered voice at import time. The first character of the
# voice code ('a' or 'b') picks the pipeline that loads it.
for voice_code in LUA_CHON_GIONG.values():
    pipelines[voice_code[0]].load_voice(voice_code)
 # --- Hàm tiện ích ---
 def split_into_chunks(text, max_chars=2000):
     """Split ``text`` into chunks of at most ``max_chars`` characters.

     The text is cut into sentences after ``.``, ``!`` or ``?`` followed by
     whitespace, and consecutive sentences are packed greedily (joined by a
     single space) while the chunk stays within ``max_chars``.

     A single sentence longer than ``max_chars`` is wrapped on word
     boundaries with ``textwrap.wrap`` so that no chunk exceeds the limit.
     Wrapping normalises whitespace inside that sentence (tabs and newlines
     become spaces). Sentences that already fit are left untouched.

     Args:
         text: Input text; leading and trailing whitespace is ignored.
         max_chars: Maximum chunk length. Non-positive values disable the
             oversized-sentence wrapping, so each sentence becomes a chunk.

     Returns:
         A list of non-empty chunk strings; ``[]`` for empty input.
     """
     import textwrap  # local import: only the oversized-sentence path needs it

     chunks, current = [], ""
     for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
         if 0 < max_chars < len(sentence):
             # Too long to fit in any chunk: break it up before packing.
             pieces = textwrap.wrap(sentence, max_chars)
         else:
             pieces = [sentence]
         for piece in pieces:
             if len(current) + len(piece) + 1 <= max_chars:
                 current = f"{current} {piece}".strip()
             else:
                 # Never emit an empty chunk when the very first piece
                 # does not fit next to the (still empty) accumulator.
                 if current:
                     chunks.append(current)
                 current = piece
     if current:
         chunks.append(current)
     return chunks
 @spaces.GPU(duration=30)
 def forward_gpu(ps, ref_s, speed):
     """Run the CUDA model (``models[True]``) on one segment and return its output.

     NOTE(review): the ``spaces.GPU(duration=30)`` decorator presumably
     requests a ZeroGPU slot for up to 30 seconds per call; confirm against
     the ``spaces`` package documentation.
     """
     gpu_model = models[True]
     return gpu_model(ps, ref_s, speed)
def _infer_segment(ps, ref_s, speed, use_gpu):
    """Synthesize one segment, falling back to the CPU model if the GPU call raises ``gr.Error``."""
    if not use_gpu:
        # CPU path: errors propagate unchanged, there is nothing to fall back to.
        return models[False](ps, ref_s, speed)
    try:
        return forward_gpu(ps, ref_s, speed)
    except gr.Error as e:
        gr.Warning(f"Lỗi GPU: {e}\nChuyển sang CPU cho khúc này.")
        return models[False](ps, ref_s, speed)


def generate_unlimited(text, voice, speed, use_gpu, max_chars=2000):
    """Unlimited mode: synthesize text of any length into one audio clip.

    The text is split with ``split_into_chunks``. Every segment the voice's
    pipeline yields for a chunk is synthesized, and 0.2 s of silence is
    appended after each chunk.

    Args:
        text: Text to speak; ``None`` is treated like an empty string.
        voice: Voice code; its first character selects the pipeline.
        speed: Speed value passed to both the pipeline and the model.
        use_gpu: Request the GPU model; ignored when CUDA is unavailable.
        max_chars: Chunk size limit forwarded to ``split_into_chunks``.

    Returns:
        ``(24000, samples)``, where ``samples`` is the concatenation of all
        segment arrays and the silence gaps.

    Raises:
        gr.Error: If ``text`` is empty or whitespace only. Previously this
            surfaced as an opaque ``ValueError`` from ``np.concatenate``
            (or ``AttributeError`` for ``None``).
    """
    sample_rate = 24000
    text = (text or "").strip()
    if not text:
        raise gr.Error("Vui lòng nhập văn bản trước khi chuyển đổi.")
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    pause = np.zeros(int(0.2 * sample_rate))  # 0.2 s of silence between chunks
    all_audio = []
    for chunk in split_into_chunks(text, max_chars):
        for _, ps, _ in pipeline(chunk, voice, speed):
            # The voice pack is indexed by segment length (len(ps) - 1).
            ref_s = pack[len(ps) - 1]
            all_audio.append(_infer_segment(ps, ref_s, speed, use_gpu).numpy())
        all_audio.append(pause)
    return (sample_rate, np.concatenate(all_audio, axis=0))
def generate_stream(text, voice, speed, use_gpu, max_chars=2000):
    """Streaming mode: yield ``(24000, samples)`` for each synthesized segment.

    The text is split with ``split_into_chunks``. After all segments of a
    chunk have been yielded, a 0.2 s block of zeros is yielded as a pause.
    If the GPU call raises ``gr.Error``, a warning is shown and that segment
    is synthesized on the CPU model; on the CPU path the error propagates.
    """
    sample_rate = 24000
    text = text.strip()
    voice_pipeline = pipelines[voice[0]]
    voice_pack = voice_pipeline.load_voice(voice)
    on_gpu = use_gpu and CUDA_AVAILABLE
    for piece in split_into_chunks(text, max_chars):
        for _, phonemes, _ in voice_pipeline(piece, voice, speed):
            ref = voice_pack[len(phonemes) - 1]
            try:
                if on_gpu:
                    wave = forward_gpu(phonemes, ref, speed)
                else:
                    wave = models[False](phonemes, ref, speed)
            except gr.Error as err:
                if not on_gpu:
                    raise
                gr.Warning(f"Lỗi GPU: {err}\nChuyển sang CPU cho khúc này.")
                wave = models[False](phonemes, ref, speed)
            yield sample_rate, wave.numpy()
        # Pause between chunks
        yield sample_rate, np.zeros(int(0.2 * sample_rate))
def tokenize_first(text, voice):
    """Return the middle (``ps``) element of the first item the voice's pipeline yields.

    Returns ``""`` when the pipeline yields nothing for ``text``.
    """
    segments = pipelines[voice[0]](text, voice)
    return next((ps for _, ps, _ in segments), "")
# Sample texts: one quote per line of en.txt, read once at import time
with open('en.txt', 'r') as f:
    TRICH_DAN_NGAU_NHIEN = [l.strip() for l in f]


def random_quote():
    """Return one randomly chosen line from the quotes loaded from en.txt."""
    return random.choice(TRICH_DAN_NGAU_NHIEN)


def _read_text_file(path):
    """Return the full contents of ``path``, closing the file afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def load_gatsby():
    """Return the contents of gatsby5k.md."""
    return _read_text_file('gatsby5k.md')


def load_frank():
    """Return the contents of frankenstein5k.md."""
    return _read_text_file('frankenstein5k.md')
# --- Gradio interface ---
BANNER = """
# 📣 **TTS-82M**
Mô hình TTS 82M tham số do LTTEAM mở.
[Tham gia nhóm FB](https://www.facebook.com/groups/622526090937760)
"""

with gr.Blocks() as app:
    gr.Markdown(BANNER)

    with gr.Tabs():
        # Tab 1: unlimited-length synthesis returned as a single clip
        with gr.TabItem("📝 TTS Không Giới Hạn"):
            with gr.Row():
                with gr.Column(scale=6):
                    txt_in = gr.Textbox(
                        label="Văn bản đầu vào",
                        placeholder="Nhập hoặc dán văn bản...",
                        lines=5,
                    )
                    with gr.Row():
                        dd_voice = gr.Dropdown(
                            list(LUA_CHON_GIONG.items()),
                            value='af_heart',
                            label="Chọn Giọng",
                        )
                        # The device selector defaults to GPU and is only
                        # editable when CUDA is available.
                        dd_hw = gr.Dropdown(
                            [('GPU (Nhanh)', True), ('CPU (Chậm)', False)],
                            value=CUDA_AVAILABLE,
                            label="Thiết bị xử lý",
                            interactive=CUDA_AVAILABLE,
                        )
                    slider_speed = gr.Slider(
                        0.5, 2.0, value=1.0, step=0.1, label="Tốc độ phát âm"
                    )
                    with gr.Row():
                        btn_random = gr.Button("🎲 Trích ngẫu nhiên", variant='secondary')
                        btn_gatsby = gr.Button("📖 Gatsby dài", variant='secondary')
                        btn_frank = gr.Button("📖 Frankenstein dài", variant='secondary')
                with gr.Column(scale=6):
                    out_audio = gr.Audio(
                        label="Kết quả âm thanh", interactive=False, autoplay=True
                    )
                    # NOTE(review): out_tokens is displayed but no event writes
                    # to it (see the commented-out hook further down).
                    out_tokens = gr.Textbox(label="Tokens đầu ra", interactive=False)
                    btn_generate = gr.Button("▶️ Chuyển đổi", variant='primary')

        # Tab 2: streaming synthesis
        with gr.TabItem("🔴 TTS Streaming"):
            with gr.Row():
                with gr.Column(scale=6):
                    txt_in2 = gr.Textbox(
                        label="Văn bản đầu vào",
                        placeholder="Nhập văn bản để phát trực tiếp...",
                        lines=5,
                    )
                    with gr.Row():
                        dd_voice2 = gr.Dropdown(
                            list(LUA_CHON_GIONG.items()),
                            value='af_heart',
                            label="Chọn Giọng",
                        )
                        dd_hw2 = gr.Dropdown(
                            [('GPU (Nhanh)', True), ('CPU (Chậm)', False)],
                            value=CUDA_AVAILABLE,
                            label="Thiết bị xử lý",
                            interactive=CUDA_AVAILABLE,
                        )
                    slider_speed2 = gr.Slider(
                        0.5, 2.0, value=1.0, step=0.1, label="Tốc độ phát âm"
                    )
                    btn_stream = gr.Button("🎙️ Bắt đầu Streaming", variant='primary')
                    btn_stop = gr.Button("⏹️ Dừng lại", variant='stop')
                with gr.Column(scale=6):
                    out_stream = gr.Audio(
                        label="Phát trực tiếp", streaming=True, autoplay=True
                    )

    # Tab 1 button events
    btn_random.click(fn=random_quote, inputs=[], outputs=[txt_in])
    btn_gatsby.click(fn=load_gatsby, inputs=[], outputs=[txt_in])
    btn_frank.click(fn=load_frank, inputs=[], outputs=[txt_in])
    btn_generate.click(
        fn=generate_unlimited,
        inputs=[txt_in, dd_voice, slider_speed, dd_hw],
        outputs=[out_audio],
    )
    # To show tokens as well: .click(fn=tokenize_first, inputs=[txt_in, dd_voice], outputs=[out_tokens])

    # Tab 2 events: keep a handle on the streaming event so Stop can cancel it
    stream_event = btn_stream.click(
        fn=generate_stream,
        inputs=[txt_in2, dd_voice2, slider_speed2, dd_hw2],
        outputs=[out_stream],
    )
    btn_stop.click(fn=None, cancels=[stream_event])

    # Launch.
    # NOTE(review): this runs at import time, inside the Blocks context and
    # without an `if __name__ == "__main__":` guard; confirm that is intended.
    app.queue(api_open=not IS_LTTEAM).launch(
        show_api=not IS_LTTEAM,
        ssr_mode=True
    )