Spaces:

LTTEAM
/

TTS-82M

Paused

App Files Files Community

LTTEAM committed on Jun 13, 2025

Commit

a2462e9

verified ·

1 Parent(s): df31591

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -149

app.py CHANGED Viewed

@@ -1,34 +1,30 @@
-import spaces
-from kokoro import KModel, KPipeline
-import gradio as gr
 import os
 import random
-import torch
 import re
 import numpy as np
-# Nếu chạy dưới namespace của LTTEAM thì cho phép không giới hạn ký tự
-IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('LTTEAM/')
-CUDA_AVAILABLE = torch.cuda.is_available()
-CHAR_LIMIT = None  # bỏ hoàn toàn giới hạn
-if not IS_DUPLICATE:
-    import kokoro
-    import misaki
-    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)
-# Khởi tạo model trên CPU/GPU
 models = {
-    gpu: KModel().to('cuda' if gpu else 'cpu').eval()
-    for gpu in [False] + ([True] if CUDA_AVAILABLE else [])
 }
-# Chuẩn bị pipeline cho hai nhóm ngữ âm 'a' và 'b'
-pipelines = {lang: KPipeline(lang_code=lang, model=False) for lang in 'ab'}
 pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
 pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
-# Load trước tất cả voice choices sau này
-CHOICES = {
 '🇺🇸 👩 Heart ❤️ (Mỹ)':    'af_heart',
 '🇺🇸 👩 Bella 🔥 (Mỹ)':    'af_bella',
 '🇺🇸 👩 Nicole 🎧 (Mỹ)':   'af_nicole',
@@ -61,174 +57,140 @@ CHOICES = {
 '🇬🇧 👨 Lewis (Anh)':       'bm_lewis',
 '🇬🇧 👨 Daniel (Anh)':      'bm_daniel',
 }
-for v in CHOICES.values():
-    pipelines[v[0]].load_voice(v)
-# Hàm chia văn bản thành các chunk nhỏ ~max_chars ký tự
 def split_into_chunks(text, max_chars=2000):
-    sentences = re.split(r'(?<=[\.!\?])\s+', text)
-    chunks, current = [], ""
-    for sent in sentences:
-        if len(current) + len(sent) + 1 <= max_chars:
-            current = current + (" " if current else "") + sent
         else:
-            if current:
-                chunks.append(current)
-            current = sent
-    if current:
-        chunks.append(current)
     return chunks
-# Wrapper để chạy trên GPU với decorator của Spaces
 @spaces.GPU(duration=30)
 def forward_gpu(ps, ref_s, speed):
     return models[True](ps, ref_s, speed)
-# Tạo toàn bộ audio nối từ nhiều chunk
-def generate_unlimited(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE, max_chars=2000):
     text = text.strip()
     pipeline = pipelines[voice[0]]
     pack = pipeline.load_voice(voice)
-    use_gpu = use_gpu and CUDA_AVAILABLE
-    chunks = split_into_chunks(text, max_chars=max_chars)
-    all_audio = []
     for chunk in chunks:
         for _, ps, _ in pipeline(chunk, voice, speed):
             ref_s = pack[len(ps)-1]
             try:
-                audio = forward_gpu(ps, ref_s, speed) if use_gpu else models[False](ps, ref_s, speed)
-            except gr.exceptions.Error as e:
                 if use_gpu:
-                    gr.Warning(str(e))
-                    gr.Info('Chuyển sang CPU cho chunk này.')
-                    audio = models[False](ps, ref_s, speed)
                 else:
-                    raise gr.Error(e)
-            all_audio.append(audio.numpy())
-        # Thêm 0.2s im lặng giữa các chunk
-        all_audio.append(np.zeros(int(0.2 * 24000)))
-    # Ghép nối và trả về
-    full_audio = np.concatenate(all_audio, axis=0)
-    return (24000, full_audio)
-# Phiên bản streaming: yield từng segment
-def generate_unlimited_stream(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE, max_chars=2000):
     text = text.strip()
     pipeline = pipelines[voice[0]]
     pack = pipeline.load_voice(voice)
-    use_gpu = use_gpu and CUDA_AVAILABLE
-    chunks = split_into_chunks(text, max_chars=max_chars)
-    first = True
-    for chunk in chunks:
         for _, ps, _ in pipeline(chunk, voice, speed):
             ref_s = pack[len(ps)-1]
             try:
-                audio = forward_gpu(ps, ref_s, speed) if use_gpu else models[False](ps, ref_s, speed)
-            except gr.exceptions.Error as e:
                 if use_gpu:
-                    gr.Warning(str(e))
-                    gr.Info('Chuyển sang CPU cho chunk này.')
-                    audio = models[False](ps, ref_s, speed)
                 else:
-                    raise gr.Error(e)
-            yield 24000, audio.numpy()
-        # Giữa các chunk, gửi 0.2s im lặng để tránh dính âm
         yield 24000, np.zeros(int(0.2 * 24000))
-# Arena API (không giới hạn)
-def predict(text, voice='af_heart', speed=1):
-    return generate_unlimited(text, voice, speed, use_gpu=False)
-def tokenize_first(text, voice='af_heart'):
-    pipeline = pipelines[voice[0]]
-    for _, ps, _ in pipeline(text, voice):
-        return ps
-    return ''
-# Các nút lấy văn bản mẫu
-with open('en.txt', 'r') as r:
-    random_quotes = [line.strip() for line in r]
-def get_random_quote():
-    return random.choice(random_quotes)
-def get_gatsby():
-    return open('gatsby5k.md', 'r').read().strip()
-def get_frankenstein():
-    return open('frankenstein5k.md', 'r').read().strip()
-# Giao diện Gradio
-TOKEN_NOTE = '''
-💡 Tùy chỉnh cách phát âm bằng cú pháp liên kết Markdown và gạch chéo...
-⬇️ Hạ mức độ...
-⬆️ Nâng mức độ...
-'''
-STREAM_NOTE = '''
-⚠️ Có một lỗi Gradio có thể khiến âm thanh không phát khi lần đầu Direct.
-🚀 Nhập không giới hạn, đã chia chunk tự động.
-'''
-BANNER_TEXT = '''
-[***TTS-82M*** **là mô hình TTS 82M tham số mở bởi LTTEAM.**](https://www.facebook.com/groups/622526090937760)
-'''
-API_OPEN = os.getenv('SPACE_ID') != 'LTTEAM/TTS-82M'
-API_NAME = None if API_OPEN else False
-with gr.Blocks() as app:
-    gr.Markdown(BANNER_TEXT)
     with gr.Row():
-        with gr.Column():
-            text = gr.Textbox(label='Văn bản đầu vào', info="Không giới hạn độ dài, sẽ tự chia nhỏ")
             with gr.Row():
-                voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Giọng')
-                use_gpu = gr.Dropdown(
-                    [('GPU (Nhanh)', True), ('CPU (Chậm)', False)],
-                    value=CUDA_AVAILABLE,
-                    label='Phần cứng',
-                    interactive=CUDA_AVAILABLE
                 )
-            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Tốc độ')
-            random_btn = gr.Button('🎲 Trích dẫn ngẫu nhiên', variant='secondary')
-            gatsby_btn = gr.Button('🎲 Văn bản dài (Gatsby)', variant='secondary')
-            frankenstein_btn = gr.Button('🎲 Văn bản dài (Frankenstein)', variant='secondary')
-        with gr.Column():
-            out_audio = gr.Audio(label='Đầu ra âm thanh', interactive=False, streaming=False, autoplay=True)
-            out_ps = gr.Textbox(label='Tokens đầu ra', interactive=False, show_label=False)
-            generate_btn = gr.Button('Chuyển đổi', variant='primary')
-    # Streaming tab
-    with gr.Blocks():
-        out_stream = gr.Audio(label='Đầu ra Streaming', interactive=False, streaming=True, autoplay=True)
-        stream_btn = gr.Button('Trực Tiếp', variant='primary')
-        stop_btn = gr.Button('Dừng', variant='stop')
-        gr.Markdown(STREAM_NOTE)
-    # Sự kiện
-    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text], api_name=API_NAME)
-    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text], api_name=API_NAME)
-    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text], api_name=API_NAME)
-    generate_btn.click(
-        fn=generate_unlimited,
-        inputs=[text, voice, speed, use_gpu],
         outputs=[out_audio]
     )
-    # Nếu muốn thêm hiển thị tokens lần đầu:
-    # generate_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
-    stream_event = stream_btn.click(
-        fn=generate_unlimited_stream,
-        inputs=[text, voice, speed, use_gpu],
         outputs=[out_stream]
     )
-    stop_btn.click(fn=None, cancels=stream_event)
-if __name__ == '__main__':
-    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)

 import os
 import random
 import re
+import torch
 import numpy as np
+import spaces
+from kokoro import KModel, KPipeline
+import gradio as gr
# --- Basic configuration ---
# True when the SPACE_ID environment variable starts with 'LTTEAM/'.
IS_LTTEAM = os.getenv('SPACE_ID', '').startswith('LTTEAM/')
CUDA = torch.cuda.is_available()
CHAR_LIMIT = None  # No character limit

# Models keyed by the "use GPU" flag: the CPU model always exists, the CUDA
# model is only created when a GPU is available.
models = {False: KModel().to('cpu').eval()}
if CUDA:
    models[True] = KModel().to('cuda').eval()

# One pipeline per phoneme group ('a' and 'b'); model=False means the
# pipelines are built without their own model.
pipelines = {
    'a': KPipeline(lang_code='a', model=False),
    'b': KPipeline(lang_code='b', model=False),
}
# Pin the pronunciation of the word "kokoro" in each group's lexicon.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
+# Đăng ký toàn bộ giọng đọc
+VOICE_CHOICES = {
 '🇺🇸 👩 Heart ❤️ (Mỹ)':    'af_heart',
 '🇺🇸 👩 Bella 🔥 (Mỹ)':    'af_bella',
 '🇺🇸 👩 Nicole 🎧 (Mỹ)':   'af_nicole',
 '🇬🇧 👨 Lewis (Anh)':       'bm_lewis',
 '🇬🇧 👨 Daniel (Anh)':      'bm_daniel',
 }
# Preload every registered voice into the pipeline of its phoneme group.
# VOICE_CHOICES maps a display label to a voice id string (e.g. 'af_heart'),
# so iterate the ids directly; the first character of an id ('a' or 'b') is
# the key of the pipeline that owns the voice.
for voice_id in VOICE_CHOICES.values():
    pipelines[voice_id[0]].load_voice(voice_id)
# --- Utility functions ---
def split_into_chunks(text, max_chars=2000):
    """Group the sentences of ``text`` into chunks of about ``max_chars``.

    The text is split after '.', '!' or '?' followed by whitespace, and
    consecutive sentences are joined with a single space while the running
    chunk still fits. A single sentence longer than ``max_chars`` is never
    cut; it becomes a chunk of its own.

    Args:
        text: The text to split.
        max_chars: Size budget for one chunk, in characters.

    Returns:
        A list of non-empty chunk strings; empty input yields ``[]``.
    """
    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        # The +1 accounts for the joining space.
        if len(current) + len(sentence) + 1 <= max_chars:
            current = f"{current} {sentence}".strip()
            continue
        # Only flush a chunk that has content: when the very first sentence
        # is already over budget, ``current`` is still empty here.
        if current:
            chunks.append(current)
        current = sentence
    if current:
        chunks.append(current)
    return chunks
 @spaces.GPU(duration=30)
 def forward_gpu(ps, ref_s, speed):
     return models[True](ps, ref_s, speed)
def generate_audio(text, voice, speed, use_gpu):
    """Synthesize ``text`` and return it as one concatenated audio clip.

    The text is split with ``split_into_chunks``. Every segment the pipeline
    yields for a chunk is rendered by the GPU or CPU model, and 0.2 s of
    silence is appended after each chunk.

    Args:
        text: Input text; surrounding whitespace is stripped.
        voice: Voice id such as ``'af_heart'``; its first character selects
            the pipeline.
        speed: Speed value forwarded to the pipeline and the model.
        use_gpu: Request the GPU model. Ignored when CUDA is unavailable.

    Returns:
        A ``(24000, samples)`` tuple where ``samples`` is a NumPy array.

    Raises:
        gr.Error: If ``text`` is empty after stripping.
        Exception: Whatever the CPU model raises; CPU failures are not
            handled here.
    """
    text = text.strip()
    if not text:
        # Without this guard np.concatenate([]) below would fail with an
        # opaque ValueError. Message: "Please enter some text."
        raise gr.Error("Vui lòng nhập văn bản.")
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA
    audios = []
    for chunk in split_into_chunks(text):
        for _, ps, _ in pipeline(chunk, voice, speed):
            # The voice pack is indexed by the length of the segment.
            ref_s = pack[len(ps) - 1]
            if use_gpu:
                try:
                    out = forward_gpu(ps, ref_s, speed)
                except Exception as e:
                    # Best-effort fallback: any GPU failure is reported to
                    # the user and the segment is retried on the CPU model.
                    gr.Warning(f"Lỗi GPU, chuyển sang CPU: {e}")
                    out = models[False](ps, ref_s, speed)
            else:
                out = models[False](ps, ref_s, speed)
            audios.append(out.numpy())
        # Add 0.2 s of silence after each chunk.
        audios.append(np.zeros(int(0.2 * 24000)))
    return (24000, np.concatenate(audios, axis=0))
def generate_stream(text, voice, speed, use_gpu):
    """Stream the synthesized audio segment by segment.

    Generator counterpart of the one-shot synthesis: each segment produced
    by the pipeline is yielded as soon as it is rendered, and 0.2 s of
    silence is yielded after every chunk.

    Yields:
        ``(24000, samples)`` tuples, one per segment or silence gap.
    """
    cleaned = text.strip()
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA
    cpu_model = models[False]
    for segment in split_into_chunks(cleaned):
        for _, ps, _ in pipeline(segment, voice, speed):
            ref_s = pack[len(ps) - 1]
            if not use_gpu:
                # CPU errors propagate to the caller untouched.
                wave = cpu_model(ps, ref_s, speed)
            else:
                try:
                    wave = forward_gpu(ps, ref_s, speed)
                except Exception as err:
                    # GPU failure: warn and render this segment on the CPU.
                    gr.Warning(f"Lỗi GPU, chuyển sang CPU: {err}")
                    wave = cpu_model(ps, ref_s, speed)
            yield 24000, wave.numpy()
        # Silence gap between chunks.
        gap = np.zeros(int(0.2 * 24000))
        yield 24000, gap
# --- Load sample texts ---
def _read_text(path):
    """Return the stripped contents of the UTF-8 text file at ``path``."""
    # The context manager closes the handle; a bare open().read() leaks it.
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().strip()


with open('en.txt', 'r', encoding='utf-8') as f:
    # One quote per line; blank lines are skipped so an empty quote can
    # never be picked.
    QUOTES = [line.strip() for line in f if line.strip()]


def random_quote():
    """Return a random quote from ``en.txt``."""
    return random.choice(QUOTES)


def load_gatsby():
    """Return the long sample text stored in ``gatsby5k.md``."""
    return _read_text('gatsby5k.md')


def load_frank():
    """Return the long sample text stored in ``frankenstein5k.md``."""
    return _read_text('frankenstein5k.md')
# --- Build the Gradio interface ---
# Labels of the hardware selector. The event handlers below translate the
# selected label into the boolean flag the synthesis functions expect.
_HW_GPU_LABEL = "GPU (Nhanh)"
_HW_CPU_LABEL = "CPU (Chậm)"


def _generate_from_ui(text, voice, speed, hardware):
    """Call ``generate_audio`` with the hardware label converted to a bool."""
    return generate_audio(text, voice, speed, hardware == _HW_GPU_LABEL)


def _stream_from_ui(text, voice, speed, hardware):
    """Call ``generate_stream`` with the hardware label converted to a bool."""
    yield from generate_stream(text, voice, speed, hardware == _HW_GPU_LABEL)


with gr.Blocks(css="""
    .block { max-width: 900px; margin: auto; }
    .voice-group { margin-bottom: 10px; }
""") as app:
    gr.Markdown("""
**TTS-82M** — Mô hình chuyển văn bản thành giọng nói 82M tham số, mở bởi LTTEAM.
""")
    with gr.Row():
        with gr.Column(scale=2):
            inp_txt = gr.Textbox(
                label="Văn bản đầu vào",
                placeholder="Nhập hoặc dán văn bản...",
                lines=4,
            )
            with gr.Accordion("Ví dụ nhanh", open=False):
                gr.Button("📜 Trích dẫn ngẫu nhiên").click(random_quote, [], inp_txt)
                gr.Button("📗 Gatsby dài").click(load_gatsby, [], inp_txt)
                gr.Button("📕 Frankenstein").click(load_frank, [], inp_txt)
            with gr.Row():
                voice_dd = gr.Dropdown(
                    label="Chọn giọng đọc",
                    # VOICE_CHOICES maps display label -> voice id, which is
                    # exactly the (label, value) pair shape Gradio expects.
                    choices=list(VOICE_CHOICES.items()),
                    value='af_heart',
                )
                hw_dd = gr.Radio(
                    [_HW_GPU_LABEL, _HW_CPU_LABEL],
                    label="Phần cứng",
                    value=_HW_GPU_LABEL if CUDA else _HW_CPU_LABEL,
                )
            speed_sl = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Tốc độ phát")
        with gr.Column(scale=1):
            out_audio = gr.Audio(
                label="Kết quả âm thanh",
                interactive=False,
                streaming=False,
                autoplay=True,
            )
            btn_gen = gr.Button("▶️ Tạo âm thanh", variant="primary")
    with gr.Tabs():
        with gr.TabItem("Phát trực tiếp"):
            out_stream = gr.Audio(
                label="Phát trực tiếp",
                interactive=False,
                streaming=True,
                autoplay=True,
            )
            btn_stream = gr.Button("🔴 Bắt đầu", variant="primary")
            btn_stop = gr.Button("⏹ Dừng", variant="stop")

    # Event wiring
    ui_inputs = [inp_txt, voice_dd, speed_sl, hw_dd]
    btn_gen.click(fn=_generate_from_ui, inputs=ui_inputs, outputs=[out_audio])
    stream_event = btn_stream.click(
        fn=_stream_from_ui,
        inputs=ui_inputs,
        outputs=[out_stream],
    )
    # The stop button cancels the running stream; `cancels` takes the event
    # returned by the listener, not the button component.
    btn_stop.click(fn=None, inputs=None, outputs=None, cancels=[stream_event])
if __name__ == "__main__":
    # The queue API and the API docs are exposed only outside LTTEAM Spaces.
    expose_api = not IS_LTTEAM
    queued_app = app.queue(api_open=expose_api)
    queued_app.launch(show_api=expose_api, ssr_mode=True)