MultilanguageCloner

Running

App Files Files Community

oicui committed on Dec 11, 2025

Commit

96c42e1

verified ·

1 Parent(s): e3826a0

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -61

app.py CHANGED Viewed

@@ -70,38 +70,26 @@ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | N
     return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
-# ===========================================================
-#              ⭐ THÊM HÀM NGẮT NGHỈ TỰ NHIÊN ⭐
-# ===========================================================
-def natural_pause_text(text: str) -> str:
     """
-    Thêm ngắt nghỉ tự nhiên để AI phát âm giống người thật.
-    - '...' = nghỉ dài
-    - '…'  = nghỉ ngắn
     """
-    # Nghỉ dài sau dấu kết câu
-    text = re.sub(r"([\.!\?])\s+", r"\1 ... ", text)
-    # Nghỉ ngắn sau dấu phẩy
-    text = re.sub(r"(,)\s+", r"\1 … ", text)
-    return text.strip()
-# ===========================================================
-#                   SMART CHUNKING
-# ===========================================================
-def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
     text = re.sub(r"\s+", " ", text.strip())
     if not text:
         return []
     if len(text) <= max_chars:
         return [text]
-    sentences = re.split(r'(?<=[\.!\?…])\s+', text)
     chunks: list[str] = []
     current = ""
@@ -111,10 +99,26 @@ def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
         if not sent:
             continue
         if len(current) + len(sent) + 1 <= max_chars:
             current += sent + " "
         else:
-            chunks.append(current.strip())
             current = sent + " "
     if current:
@@ -123,32 +127,45 @@ def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
     return [c for c in chunks if c]
-# ===========================================================
-#     ⭐ HÀM GHÉP AUDIO CÓ CHÈN KHOẢNG LẶNG GIỐNG NGƯỜI ⭐
-# ===========================================================
-def concat_audio_with_pause(chunks, pause_ms: int = 220, sr: int = 24000):
     """
-    Chèn khoảng lặng (pause) tự nhiên giữa các câu/chunk.
     """
     if not chunks:
         return torch.empty(0)
-    silence = torch.zeros(int(sr * pause_ms / 1000))
-    output = []
-    for i, ch in enumerate(chunks):
-        output.append(ch)
-        # chèn pause giữa các câu
-        if i < len(chunks) - 1:
-            output.append(silence.clone())
-    return torch.cat(output, dim=-1)
-# ===========================================================
-#                       TTS GENERATE
-# ===========================================================
 @spaces.GPU
 def generate_tts_audio(
@@ -165,46 +182,55 @@ def generate_tts_audio(
     if current_model is None:
         raise RuntimeError("TTS model not loaded.")
     if seed_num_input == 0:
         seed_num_input = random.randint(1, 2**32 - 1)
-    set_seed(int(seed_num_input))
-    # ⭐ Thêm ngắt nghỉ vào TEXT
-    text_input = natural_pause_text(text_input)
     chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
-        "audio_prompt_path": chosen_prompt
     }
     chunks = smart_chunk_text(text_input, max_chars=500)
     all_audio: list[torch.Tensor] = []
-    for chunk in chunks:
         wav = current_model.generate(chunk, language_id=language_id, **generate_kwargs)
         all_audio.append(wav.squeeze(0).cpu())
-    # ⭐ Ghép audio có PAUSE giống người
-    final_audio = concat_audio_with_pause(
         all_audio,
-        pause_ms=230,              # Điều chỉnh độ tự nhiên tại đây
         sr=current_model.sr
     )
     return (current_model.sr, final_audio.numpy()), str(seed_num_input)
-# ===========================================================
-#                       GRADIO UI
-# ===========================================================
 with gr.Blocks() as demo:
     gr.Markdown("""
-    # 🎙️ Multi Language Realistic Voice Cloner
-    Now with **Natural Human Pausing System** (Text + Audio Pause)
     """)
     gr.Markdown(get_supported_languages_display())
@@ -212,16 +238,22 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             initial_lang = "en"
-            text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text to synthesize", lines=8)
             language_id = gr.Dropdown(
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
                 label="Language"
             )
-            ref_wav = gr.Audio(sources=["upload","microphone"], type="filepath",
-                               label="Reference Audio (Optional)",
-                               value=default_audio_for_ui(initial_lang))
             exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration", value=.5)
             cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG Weight", value=0.5)
@@ -231,6 +263,7 @@ with gr.Blocks() as demo:
             run_btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Output Audio")
             seed_output = gr.Textbox(label="Seed Used", interactive=False)
@@ -241,13 +274,15 @@ with gr.Blocks() as demo:
         language_id.change(
             fn=on_lang_change,
             inputs=[language_id, ref_wav, text],
-            outputs=[ref_wav, text]
         )
     run_btn.click(
         fn=generate_tts_audio,
         inputs=[text, language_id, ref_wav, exaggeration, temp, seed_num, cfg_weight],
-        outputs=[audio_output, seed_output]
     )
 demo.launch(mcp_server=True, share=True)

     return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
+# ============================
+#  SMART CHUNKING (TỐI ƯU)
+# ============================
def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
    """Split one over-long sentence on whitespace into pieces of at most
    ``max_chars`` characters (a single word longer than ``max_chars`` is kept
    whole rather than cut mid-word)."""
    pieces: list[str] = []
    buffer = ""
    for word in sentence.split():
        # +1 accounts for the separating space that follows each word.
        if len(buffer) + len(word) + 1 > max_chars:
            if buffer:
                pieces.append(buffer.strip())
            buffer = ""
        buffer += word + " "
    if buffer:
        pieces.append(buffer.strip())
    return pieces


def smart_chunk_text(text: str, max_chars: int = 500) -> list[str]:
    """Split ``text`` into short chunks suitable for one model call each.

    Strategy:
    - Whitespace is normalized to single spaces first.
    - Text is split on sentence-like punctuation followed by whitespace.
    - Consecutive short sentences are merged into one chunk (fewer model
      calls) as long as the chunk stays within ``max_chars``.
    - A sentence that is itself longer than ``max_chars`` is split further
      on word boundaries.

    Chunks are always returned in the original reading order.

    Args:
        text: Input text to split.
        max_chars: Soft upper bound on the length of each chunk.

    Returns:
        The list of non-empty chunks; ``[]`` for empty/whitespace-only input.
    """
    # Normalize whitespace.
    text = re.sub(r"\s+", " ", text.strip())
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    # Multilingual break characters: . ! ? … ؟ ، : ؛ ।
    sentences = re.split(r'(?<=[\.!\?…؟،:؛।])\s+', text)

    chunks: list[str] = []
    current = ""
    for sent in sentences:
        sent = sent.strip()
        if not sent:
            continue

        # The sentence alone exceeds the limit -> soft-split it by words.
        if len(sent) > max_chars:
            # Flush whatever was accumulated BEFORE this sentence first;
            # otherwise the earlier text would be emitted after the long
            # sentence and the spoken output would be out of order.
            if current:
                chunks.append(current.strip())
                current = ""
            chunks.extend(_split_long_sentence(sent, max_chars))
            continue

        # Merge the sentence into the current chunk while it still fits.
        if len(current) + len(sent) + 1 <= max_chars:
            current += sent + " "
        else:
            if current:
                chunks.append(current.strip())
            current = sent + " "

    if current:
        chunks.append(current.strip())
    return [c for c in chunks if c]
def concat_audio_torch(chunks: list[torch.Tensor],
                       crossfade_ms: int = 10,
                       sr: int = 24000) -> torch.Tensor:
    """Join audio chunks along the last axis with a short linear crossfade.

    Each boundary between two consecutive chunks is blended over
    ``crossfade_ms`` milliseconds (fade-out of the accumulated audio plus
    fade-in of the next chunk) to avoid an audible "click" at the seam. The
    overlap is shortened automatically when either side is shorter than the
    requested crossfade.

    Args:
        chunks: Audio tensors to join; samples are on the last dimension.
        crossfade_ms: Crossfade length in milliseconds; ``<= 0`` disables
            blending and falls back to plain concatenation.
        sr: Sample rate in Hz, used to convert ``crossfade_ms`` to samples.

    Returns:
        The joined audio; an empty tensor when ``chunks`` is empty.
    """
    if not chunks:
        return torch.empty(0)

    crossfade = int(crossfade_ms * sr / 1000)
    # Nothing to blend: a single chunk, or a crossfade of zero samples.
    if len(chunks) == 1 or crossfade <= 0:
        return torch.cat(chunks, dim=-1)

    # Only the last `crossfade` samples can still be modified by a later
    # boundary, so everything before them is set aside in `finished` and
    # concatenated once at the end. This avoids re-copying the whole
    # accumulated output on every iteration.
    finished: list[torch.Tensor] = []
    tail = chunks[0]
    for nxt in chunks[1:]:
        # Make sure the overlap is never longer than either segment.
        cf = min(crossfade, tail.shape[-1], nxt.shape[-1])
        if cf > 0:
            fade_out = torch.linspace(1.0, 0.0, steps=cf,
                                      device=tail.device, dtype=tail.dtype)
            fade_in = torch.linspace(0.0, 1.0, steps=cf,
                                     device=nxt.device, dtype=nxt.dtype)
            blended = tail[..., -cf:] * fade_out + nxt[..., :cf] * fade_in
            tail = torch.cat([tail[..., :-cf], blended, nxt[..., cf:]], dim=-1)
        else:
            tail = torch.cat([tail, nxt], dim=-1)

        # Keep only the part a future crossfade may still touch.
        if tail.shape[-1] > crossfade:
            finished.append(tail[..., :-crossfade])
            tail = tail[..., -crossfade:]

    finished.append(tail)
    return torch.cat(finished, dim=-1)
 @spaces.GPU
 def generate_tts_audio(
     if current_model is None:
         raise RuntimeError("TTS model not loaded.")
+    # --- SEED LOGIC ---
     if seed_num_input == 0:
         seed_num_input = random.randint(1, 2**32 - 1)
+        print(f"🌱 Random seed generated: {seed_num_input}")
+    else:
+        print(f"🌱 Using provided seed: {seed_num_input}")
+    set_seed(int(seed_num_input))
     chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
     generate_kwargs = {
         "exaggeration": exaggeration_input,
         "temperature": temperature_input,
         "cfg_weight": cfgw_input,
     }
+    if chosen_prompt:
+        generate_kwargs["audio_prompt_path"] = chosen_prompt
+    # 💡 DÙNG SMART CHUNKING TỐI ƯU
     chunks = smart_chunk_text(text_input, max_chars=500)
+    print(f"📚 Total chunks: {len(chunks)}")
     all_audio: list[torch.Tensor] = []
+    for idx, chunk in enumerate(chunks, start=1):
+        print(f"🎧 Rendering chunk {idx}/{len(chunks)} (len={len(chunk)} chars)")
         wav = current_model.generate(chunk, language_id=language_id, **generate_kwargs)
         all_audio.append(wav.squeeze(0).cpu())
+    # 🔗 NỐI AUDIO VỚI CROSSFADE NHẸ
+    final_audio = concat_audio_torch(
         all_audio,
+        crossfade_ms=12,
         sr=current_model.sr
     )
+    # RETURN AUDIO + SEED
     return (current_model.sr, final_audio.numpy()), str(seed_num_input)
+# ============================
+#  GRADIO UI
+# ============================
 with gr.Blocks() as demo:
     gr.Markdown("""
+    # 🎙️ Multi Language Realistic Voice Cloner
+    Generate long-form multilingual speech with reference audio styling and smart chunking (crossfaded).
     """)
     gr.Markdown(get_supported_languages_display())
     with gr.Row():
         with gr.Column():
             initial_lang = "en"
+            text = gr.Textbox(
+                value=default_text_for_ui(initial_lang),
+                label="Text to synthesize",
+                lines=8
+            )
             language_id = gr.Dropdown(
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
                 label="Language"
             )
+            ref_wav = gr.Audio(
+                sources=["upload", "microphone"],
+                type="filepath",
+                label="Reference Audio (Optional)",
+                value=default_audio_for_ui(initial_lang)
+            )
             exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration", value=.5)
             cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG Weight", value=0.5)
             run_btn = gr.Button("Generate", variant="primary")
+        # OUTPUT COLUMN
         with gr.Column():
             audio_output = gr.Audio(label="Output Audio")
             seed_output = gr.Textbox(label="Seed Used", interactive=False)
         language_id.change(
             fn=on_lang_change,
             inputs=[language_id, ref_wav, text],
+            outputs=[ref_wav, text],
+            show_progress=False
         )
+    # CONNECT BUTTON
     run_btn.click(
         fn=generate_tts_audio,
         inputs=[text, language_id, ref_wav, exaggeration, temp, seed_num, cfg_weight],
+        outputs=[audio_output, seed_output],
     )
 demo.launch(mcp_server=True, share=True)