0___0

Sleeping

App Files Files Community

Arxords committed on Feb 14

Commit

e4aaf99

verified ·

1 Parent(s): 6070859

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -87

app.py CHANGED Viewed

@@ -49,82 +49,70 @@ LANGUAGE_MAP = {
 # CHUNK TEXT - Chia văn bản thành các đoạn nhỏ theo câu/đoạn
 # ============================================================================
-def split_into_chunks(text: str, max_chars: int = 200) -> list:
     """
-    Chia văn bản thành các chunk theo câu/đoạn.
-    Ưu tiên chia ở dấu câu kết thúc câu, sau đó dấu phẩy, sau đó khoảng trắng.
-    Không giới hạn số lượng chunk.
     """
     text = text.strip()
     if not text:
         return []
-    if len(text) <= max_chars:
-        return [text]
-    chunks = []
-    paragraphs = re.split(r'\n\s*\n', text)
-    for para in paragraphs:
         para = para.strip()
         if not para:
             continue
-        if len(para) <= max_chars:
-            chunks.append(para)
-            continue
-        # Chia theo câu: dấu chấm, chấm hỏi, chấm than, dấu ba chấm
-        sentences = re.split(r'(?<=[.!?…。！？])\s+', para)
-        current = ""
-        for sent in sentences:
-            sent = sent.strip()
-            if not sent:
-                continue
-            # Nếu câu đơn lẻ đã vượt max_chars, chia nhỏ hơn theo dấu phẩy
-            if len(sent) > max_chars:
-                sub_parts = re.split(r'(?<=[,;])\s+', sent)
-                for part in sub_parts:
-                    part = part.strip()
-                    if not part:
-                        continue
-                    if len(current) + len(part) + 1 <= max_chars:
-                        current = (current + " " + part).strip()
-                    else:
-                        if current:
-                            chunks.append(current)
-                        # Nếu part vẫn quá dài, chia theo từ
-                        if len(part) > max_chars:
-                            words = part.split()
-                            current = ""
-                            for w in words:
-                                if len(current) + len(w) + 1 <= max_chars:
-                                    current = (current + " " + w).strip()
-                                else:
-                                    if current:
-                                        chunks.append(current)
-                                    current = w
-                        else:
-                            current = part
             else:
-                if len(current) + len(sent) + 1 <= max_chars:
-                    current = (current + " " + sent).strip()
-                else:
-                    if current:
-                        chunks.append(current)
-                    current = sent
-        if current:
             chunks.append(current)
-    return [c for c in chunks if c.strip()]
-# ============================================================================
-# AUDIO UTILS
-# ============================================================================
 def _normalize_audio(wav, eps=1e-12, clip=True):
     """Chuẩn hóa âm thanh về float32 trong khoảng [-1, 1]."""
@@ -269,11 +257,11 @@ def wrap_chunk_area(inner_html: str) -> str:
 # HELPER: Preview chunks trước khi xử lý
 # ============================================================================
-def preview_chunks(text: str, chunk_size: int) -> str:
     """Hiển thị preview danh sách chunks sẽ được tạo."""
     if not text or not text.strip():
         return "<p style='color:#9ca3af; font-style:italic; padding:8px;'>Nhập văn bản để xem trước các đoạn...</p>"
-    chunks = split_into_chunks(text.strip(), max_chars=int(chunk_size))
     if not chunks:
         return "<p style='color:#9ca3af;'>Không có đoạn nào.</p>"
     rows = ""
@@ -355,7 +343,7 @@ def _run_chunked(chunks, generate_fn, total):
     yield all_audio, html_blocks, f"✅ Hoàn tất {total} đoạn.", total
-def generate_voice_design_chunked(text, language, voice_description, chunk_size):
     """Tạo giọng nói theo từng chunk - Voice Design (1.7B)."""
     if not text or not text.strip():
         yield None, "Lỗi: Văn bản là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản là bắt buộc.</p>")
@@ -364,7 +352,7 @@ def generate_voice_design_chunked(text, language, voice_description, chunk_size)
         yield None, "Lỗi: Mô tả giọng nói là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Mô tả giọng nói là bắt buộc.</p>")
         return
-    chunks = split_into_chunks(text.strip(), max_chars=int(chunk_size))
     total = len(chunks)
     lang_en = LANGUAGE_MAP.get(language, "Auto")
@@ -392,7 +380,7 @@ def generate_voice_design_chunked(text, language, voice_description, chunk_size)
         yield out_audio, status, wrap_chunk_area("".join(html_blocks))
-def generate_voice_clone_chunked(ref_audio, ref_text, target_text, language, use_xvector_only, model_size, chunk_size):
     """Tạo giọng nói theo từng chunk - Voice Clone (Base)."""
     if not target_text or not target_text.strip():
         yield None, "Lỗi: Văn bản cần đọc là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản cần đọc là bắt buộc.</p>")
@@ -407,7 +395,7 @@ def generate_voice_clone_chunked(ref_audio, ref_text, target_text, language, use
         yield None, "Lỗi: Văn bản tham chiếu là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản tham chiếu là bắt buộc.</p>")
         return
-    chunks = split_into_chunks(target_text.strip(), max_chars=int(chunk_size))
     total = len(chunks)
     lang_en = LANGUAGE_MAP.get(language, "Auto")
@@ -437,7 +425,7 @@ def generate_voice_clone_chunked(ref_audio, ref_text, target_text, language, use
         yield out_audio, status, wrap_chunk_area("".join(html_blocks))
-def generate_custom_voice_chunked(text, language, speaker, instruct, model_size, chunk_size):
     """Tạo giọng nói theo từng chunk - CustomVoice."""
     if not text or not text.strip():
         yield None, "Lỗi: Văn bản là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản là bắt buộc.</p>")
@@ -446,7 +434,7 @@ def generate_custom_voice_chunked(text, language, speaker, instruct, model_size,
         yield None, "Lỗi: Giọng đọc là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Giọng đọc là bắt buộc.</p>")
         return
-    chunks = split_into_chunks(text.strip(), max_chars=int(chunk_size))
     total = len(chunks)
     lang_en = LANGUAGE_MAP.get(language, "Auto")
@@ -479,17 +467,6 @@ def generate_custom_voice_chunked(text, language, speaker, instruct, model_size,
 # UI
 # ============================================================================
-def _chunk_controls():
-    """Controls cài đặt chia đoạn tái sử dụng."""
-    with gr.Accordion("⚙️ Cài đặt chia đoạn", open=False):
-        chunk_size = gr.Slider(
-            label="Số ký tự tối đa mỗi đoạn",
-            minimum=50, maximum=500, value=200, step=10,
-            info="Văn bản tự động chia ở dấu câu gần nhất trước giới hạn này. Số đoạn không giới hạn."
-        )
-    return chunk_size
 def _chunk_output_area(tab_id: str):
     """Output area: audio tổng hợp + status + HTML preview từng chunk."""
     audio_out = gr.Audio(
@@ -548,7 +525,6 @@ Hỗ trợ văn bản **không giới hạn độ dài** — tự động chia c
                             placeholder="Ví dụ: Giọng ngạc nhiên, lo lắng, bắt đầu hoảng loạn...",
                             value="Giọng ngạc nhiên, không tin tưởng, bắt đầu có chút hoảng loạn."
                         )
-                        d_chunk_size = _chunk_controls()
                         with gr.Row():
                             d_preview_btn = gr.Button("🔍 Xem trước các đoạn", variant="secondary")
                             d_gen_btn = gr.Button("▶ Tạo giọng nói", variant="primary", scale=2)
@@ -558,12 +534,12 @@ Hỗ trợ văn bản **không giới hạn độ dài** — tự động chia c
                 d_preview_btn.click(
                     fn=lambda t, cs: preview_chunks(t, cs),
-                    inputs=[d_text, d_chunk_size],
                     outputs=[d_chunk_html],
                 )
                 d_gen_btn.click(
                     fn=generate_voice_design_chunked,
-                    inputs=[d_text, d_language, d_instruct, d_chunk_size],
                     outputs=[d_audio_out, d_status, d_chunk_html],
                 )
@@ -599,7 +575,6 @@ Hỗ trợ văn bản **không giới hạn độ dài** — tự động chia c
                             c_model_size = gr.Dropdown(
                                 label="Kích thước mô hình", choices=MODEL_SIZES, value="0.6B", interactive=True
                             )
-                        c_chunk_size = _chunk_controls()
                         with gr.Row():
                             c_preview_btn = gr.Button("🔍 Xem trước các đoạn", variant="secondary")
                             c_gen_btn = gr.Button("▶ Nhân bản & Tạo", variant="primary", scale=2)
@@ -609,12 +584,12 @@ Hỗ trợ văn bản **không giới hạn độ dài** — tự động chia c
                 c_preview_btn.click(
                     fn=lambda t, cs: preview_chunks(t, cs),
-                    inputs=[c_target_text, c_chunk_size],
                     outputs=[c_chunk_html],
                 )
                 c_gen_btn.click(
                     fn=generate_voice_clone_chunked,
-                    inputs=[c_ref_audio, c_ref_text, c_target_text, c_language, c_xvector, c_model_size, c_chunk_size],
                     outputs=[c_audio_out, c_status, c_chunk_html],
                 )
@@ -651,7 +626,6 @@ Hỗ trợ văn bản **không giới hạn độ dài** — tự động chia c
                         t_model_size = gr.Dropdown(
                             label="Kích thước mô hình", choices=MODEL_SIZES, value="0.6B", interactive=True
                         )
-                        t_chunk_size = _chunk_controls()
                         with gr.Row():
                             t_preview_btn = gr.Button("🔍 Xem trước các đoạn", variant="secondary")
                             t_gen_btn = gr.Button("▶ Tạo giọng nói", variant="primary", scale=2)
@@ -661,12 +635,12 @@ Hỗ trợ văn bản **không giới hạn độ dài** — tự động chia c
                 t_preview_btn.click(
                     fn=lambda t, cs: preview_chunks(t, cs),
-                    inputs=[t_text, t_chunk_size],
                     outputs=[t_chunk_html],
                 )
                 t_gen_btn.click(
                     fn=generate_custom_voice_chunked,
-                    inputs=[t_text, t_language, t_speaker, t_instruct, t_model_size, t_chunk_size],
                     outputs=[t_audio_out, t_status, t_chunk_html],
                 )

 # CHUNK TEXT - Chia văn bản thành các đoạn nhỏ theo câu/đoạn
 # ============================================================================
def _pack_units(units, max_chars):
    """Greedily join *units* with single spaces into strings of at most *max_chars*.

    Every unit must already be no longer than *max_chars*.
    """
    packed = []
    current = ""
    for unit in units:
        if not current:
            current = unit
        elif len(current) + 1 + len(unit) <= max_chars:
            current = f"{current} {unit}"
        else:
            packed.append(current)
            current = unit
    if current:
        packed.append(current)
    return packed


def _split_oversized(sentence, max_chars):
    """Break one sentence into pieces that are each at most *max_chars* long.

    A sentence that already fits is returned unchanged. Otherwise it is cut,
    in order of preference, after a comma/semicolon followed by whitespace,
    then at whitespace, and as a last resort at a fixed character offset
    (for a single token longer than *max_chars*, e.g. unspaced CJK text).
    """
    if len(sentence) <= max_chars:
        return [sentence]
    units = []
    for clause in re.split(r'(?<=[,;\uff0c\uff1b\u3001])\s+', sentence):
        clause = clause.strip()
        if not clause:
            continue
        if len(clause) <= max_chars:
            units.append(clause)
            continue
        for word in clause.split():
            if len(word) <= max_chars:
                units.append(word)
            else:
                units.extend(
                    word[i:i + max_chars] for i in range(0, len(word), max_chars)
                )
    return _pack_units(units, max_chars)


def split_into_chunks(text: str, max_chars: int = 200, *, flush_len: int = 100,
                      short_sent: int = 40) -> list:
    """Split *text* into punctuation-aware chunks.

    Rules:
      - Paragraphs are separated by blank lines; a chunk never spans two
        paragraphs.
      - Inside a paragraph the text is cut into sentences at whitespace that
        follows sentence-ending punctuation (. ! ? … and the CJK equivalents).
      - The next sentence is merged into the current chunk while the current
        chunk is shorter than *flush_len* OR the next sentence is shorter
        than *short_sent*, provided the merged chunk does not exceed
        *max_chars*.
      - A single sentence longer than *max_chars* is broken further (at
        commas/semicolons, then whitespace, then a hard character cut) so
        that no chunk exceeds *max_chars*.

    Args:
        text: Input text. ``None`` or whitespace-only text yields ``[]``.
        max_chars: Hard upper bound on the length of every returned chunk.
        flush_len: Length at which a chunk is considered long enough to emit.
        short_sent: Sentences shorter than this are always attached to the
            preceding chunk when the result still fits in *max_chars*.

    Returns:
        List of non-empty chunk strings, in reading order.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    text = (text or "").strip()
    if not text:
        return []

    sentence_boundary = re.compile(r'(?<=[.!?\u2026\u3002\uff01\uff1f])\s+')

    # Flat list of (piece, ends_paragraph); the flag marks the last piece of
    # each paragraph so the merge loop below knows where it must flush.
    pieces = []
    for para in re.split(r'\n\s*\n', text):
        para_pieces = []
        for sentence in sentence_boundary.split(para.strip()):
            sentence = sentence.strip()
            if sentence:
                para_pieces.extend(_split_oversized(sentence, max_chars))
        if para_pieces:
            pieces.extend((piece, False) for piece in para_pieces[:-1])
            pieces.append((para_pieces[-1], True))

    chunks = []
    current = ""
    for piece, ends_paragraph in pieces:
        if not current:
            current = piece
        else:
            combined = f"{current} {piece}"
            fits = len(combined) <= max_chars
            keep_growing = len(current) < flush_len or len(piece) < short_sent
            if fits and keep_growing:
                current = combined
            else:
                chunks.append(current)
                current = piece
        if ends_paragraph:
            # Paragraph breaks always close the running chunk. The final
            # piece of the text ends a paragraph, so nothing is left over.
            chunks.append(current)
            current = ""
    return chunks
 def _normalize_audio(wav, eps=1e-12, clip=True):
     """Chuẩn hóa âm thanh về float32 trong khoảng [-1, 1]."""
 # HELPER: Preview chunks trước khi xử lý
 # ============================================================================
+def preview_chunks(text: str) -> str:
     """Hiển thị preview danh sách chunks sẽ được tạo."""
     if not text or not text.strip():
         return "<p style='color:#9ca3af; font-style:italic; padding:8px;'>Nhập văn bản để xem trước các đoạn...</p>"
+    chunks = split_into_chunks(text.strip())
     if not chunks:
         return "<p style='color:#9ca3af;'>Không có đoạn nào.</p>"
     rows = ""
     yield all_audio, html_blocks, f"✅ Hoàn tất {total} đoạn.", total
+def generate_voice_design_chunked(text, language, voice_description):
     """Tạo giọng nói theo từng chunk - Voice Design (1.7B)."""
     if not text or not text.strip():
         yield None, "Lỗi: Văn bản là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản là bắt buộc.</p>")
         yield None, "Lỗi: Mô tả giọng nói là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Mô tả giọng nói là bắt buộc.</p>")
         return
+    chunks = split_into_chunks(text.strip())
     total = len(chunks)
     lang_en = LANGUAGE_MAP.get(language, "Auto")
         yield out_audio, status, wrap_chunk_area("".join(html_blocks))
+def generate_voice_clone_chunked(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
     """Tạo giọng nói theo từng chunk - Voice Clone (Base)."""
     if not target_text or not target_text.strip():
         yield None, "Lỗi: Văn bản cần đọc là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản cần đọc là bắt buộc.</p>")
         yield None, "Lỗi: Văn bản tham chiếu là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản tham chiếu là bắt buộc.</p>")
         return
+    chunks = split_into_chunks(target_text.strip())
     total = len(chunks)
     lang_en = LANGUAGE_MAP.get(language, "Auto")
         yield out_audio, status, wrap_chunk_area("".join(html_blocks))
+def generate_custom_voice_chunked(text, language, speaker, instruct, model_size):
     """Tạo giọng nói theo từng chunk - CustomVoice."""
     if not text or not text.strip():
         yield None, "Lỗi: Văn bản là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Văn bản là bắt buộc.</p>")
         yield None, "Lỗi: Giọng đọc là bắt buộc.", wrap_chunk_area("<p style='color:red'>Lỗi: Giọng đọc là bắt buộc.</p>")
         return
+    chunks = split_into_chunks(text.strip())
     total = len(chunks)
     lang_en = LANGUAGE_MAP.get(language, "Auto")
 # UI
 # ============================================================================
 def _chunk_output_area(tab_id: str):
     """Output area: audio tổng hợp + status + HTML preview từng chunk."""
     audio_out = gr.Audio(
                             placeholder="Ví dụ: Giọng ngạc nhiên, lo lắng, bắt đầu hoảng loạn...",
                             value="Giọng ngạc nhiên, không tin tưởng, bắt đầu có chút hoảng loạn."
                         )
                         with gr.Row():
                             d_preview_btn = gr.Button("🔍 Xem trước các đoạn", variant="secondary")
                             d_gen_btn = gr.Button("▶ Tạo giọng nói", variant="primary", scale=2)
                 d_preview_btn.click(
                     # preview_chunks takes only the text now; the old two-argument
                     # lambda (text, chunk_size) no longer matches the single input.
                     fn=preview_chunks,
                     inputs=[d_text],
                     outputs=[d_chunk_html],
                 )
                 d_gen_btn.click(
                     fn=generate_voice_design_chunked,
+                    inputs=[d_text, d_language, d_instruct],
                     outputs=[d_audio_out, d_status, d_chunk_html],
                 )
                             c_model_size = gr.Dropdown(
                                 label="Kích thước mô hình", choices=MODEL_SIZES, value="0.6B", interactive=True
                             )
                         with gr.Row():
                             c_preview_btn = gr.Button("🔍 Xem trước các đoạn", variant="secondary")
                             c_gen_btn = gr.Button("▶ Nhân bản & Tạo", variant="primary", scale=2)
                 c_preview_btn.click(
                     # preview_chunks takes only the text now; the old two-argument
                     # lambda (text, chunk_size) no longer matches the single input.
                     fn=preview_chunks,
                     inputs=[c_target_text],
                     outputs=[c_chunk_html],
                 )
                 c_gen_btn.click(
                     fn=generate_voice_clone_chunked,
+                    inputs=[c_ref_audio, c_ref_text, c_target_text, c_language, c_xvector, c_model_size],
                     outputs=[c_audio_out, c_status, c_chunk_html],
                 )
                         t_model_size = gr.Dropdown(
                             label="Kích thước mô hình", choices=MODEL_SIZES, value="0.6B", interactive=True
                         )
                         with gr.Row():
                             t_preview_btn = gr.Button("🔍 Xem trước các đoạn", variant="secondary")
                             t_gen_btn = gr.Button("▶ Tạo giọng nói", variant="primary", scale=2)
                 t_preview_btn.click(
                     # preview_chunks takes only the text now; the old two-argument
                     # lambda (text, chunk_size) no longer matches the single input.
                     fn=preview_chunks,
                     inputs=[t_text],
                     outputs=[t_chunk_html],
                 )
                 t_gen_btn.click(
                     fn=generate_custom_voice_chunked,
+                    inputs=[t_text, t_language, t_speaker, t_instruct, t_model_size],
                     outputs=[t_audio_out, t_status, t_chunk_html],
                 )