3v324v23 committed
Commit af11ce4 Β· 0 parent(s)
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +36 -0
  2. LICENSE +201 -0
  3. README.md +9 -0
  4. T_English.wav +3 -0
  5. T_English_output.wav +3 -0
  6. ToαΊ‘i.wav +3 -0
  7. ToαΊ‘i_output.wav +3 -0
  8. Trung.wav +3 -0
  9. Trung_output.wav +3 -0
  10. app.py +346 -0
  11. assets/silence.wav +3 -0
  12. download_models.py +38 -0
  13. egs/zipvoice/README.md +15 -0
  14. egs/zipvoice/conf/zipvoice_base.json +26 -0
  15. egs/zipvoice/local/pinyin.txt +1550 -0
  16. egs/zipvoice/local/prepare_emilia.sh +149 -0
  17. egs/zipvoice/local/prepare_libritts.sh +100 -0
  18. egs/zipvoice/local/prepare_token_file_char.py +67 -0
  19. egs/zipvoice/local/prepare_token_file_emilia.py +91 -0
  20. egs/zipvoice/local/prepare_tokens_emilia.py +88 -0
  21. egs/zipvoice/local/preprocess_emilia.py +210 -0
  22. egs/zipvoice/run_custom.sh +138 -0
  23. egs/zipvoice/run_emilia.sh +178 -0
  24. egs/zipvoice/run_eval.sh +142 -0
  25. egs/zipvoice/run_finetune.sh +175 -0
  26. egs/zipvoice/run_libritts.sh +148 -0
  27. egs/zipvoice/utils/parse_options.sh +97 -0
  28. egs/zipvoice/utils/validate_manifest.py +70 -0
  29. egs/zipvoice_dialog/README.md +12 -0
  30. egs/zipvoice_dialog/local/prepare_opendialog.py +262 -0
  31. egs/zipvoice_dialog/run_custom.sh +145 -0
  32. egs/zipvoice_dialog/run_eval.sh +120 -0
  33. egs/zipvoice_dialog/run_finetune.sh +135 -0
  34. egs/zipvoice_dialog/run_opendialog.sh +122 -0
  35. infer.py +578 -0
  36. proccess_wav.py +364 -0
  37. pyproject.toml +5 -0
  38. requirements.txt +23 -0
  39. requirements_eval.txt +19 -0
  40. setup.py +55 -0
  41. zipvoice/__init__.py +7 -0
  42. zipvoice/bin/compute_fbank.py +272 -0
  43. zipvoice/bin/generate_averaged_model.py +229 -0
  44. zipvoice/bin/infer_zipvoice.py +614 -0
  45. zipvoice/bin/infer_zipvoice_dialog.py +756 -0
  46. zipvoice/bin/infer_zipvoice_onnx.py +712 -0
  47. zipvoice/bin/onnx_export.py +410 -0
  48. zipvoice/bin/prepare_dataset.py +274 -0
  49. zipvoice/bin/prepare_tokens.py +102 -0
  50. zipvoice/bin/train_zipvoice.py +1136 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ license: cc-by-sa-4.0
+ title: Text_to_speech_Vietnamese
+ sdk: gradio
+ emoji: πŸš€
+ colorFrom: red
+ colorTo: yellow
+ pinned: false
+ ---
T_English.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6ffd499fdf637243bdd630cb52660635d5c7cb580b87f52aa7efca90a33311f
+ size 328364
T_English_output.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:756daab105a8f4508e6d4c237f1e72a25ec326ad1f6665dce974f96e9b86db7a
+ size 954742
ToαΊ‘i.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:798c60882fa0e9a758fd72cf506e6b71293c5ccb5ed27b92569d042a23624bdc
+ size 200782
ToαΊ‘i_output.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3916769643ff756700e08ae355a0f7e30fd7a0e5299b06a866facad5ff31afd1
+ size 2154910
Trung.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0bf7087aa7452978b6ec6c25eb8c078eb4ca337660c9d3e3661b8017da9238e9
+ size 199376
Trung_output.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:068c6cfd893846473d177b9636b4689119a0c0ee7e19f4079cd6c98e27bb94a3
+ size 745196
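Because the final `.gitattributes` rule tracks `*.wav` with Git LFS, the audio samples above are stored as pointer stubs; the real media only appears after `git lfs pull`. Below is a minimal sketch (not part of the commit) that checks a pulled file against its pointer's `oid`/`size`; the pointer text and local path are illustrative.

```python
import hashlib
import os

def verify_lfs_pointer(pointer_text: str, media_path: str) -> bool:
    """Compare a Git LFS pointer's sha256 oid and size with an actual file."""
    fields = dict(line.split(" ", 1) for line in pointer_text.strip().splitlines())
    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])
    sha = hashlib.sha256()
    with open(media_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return sha.hexdigest() == expected_oid and os.path.getsize(media_path) == expected_size

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:0bf7087aa7452978b6ec6c25eb8c078eb4ca337660c9d3e3661b8017da9238e9
size 199376"""
print(verify_lfs_pointer(pointer, "Trung.wav"))  # True once the real file is pulled
```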
app.py ADDED
@@ -0,0 +1,346 @@
+ import spaces
+ import os
+ from download_models import download_all_models
+ from huggingface_hub import login
+
+ import gradio as gr
+
+ # ======================= HF LOGIN & DOWNLOAD MODEL =======================
+ hf_token = os.getenv("HF_TOKEN")
+ if hf_token:
+     login(token=hf_token)
+
+ # Download the models while the Space is ACTIVE
+ download_all_models()
+
+ from infer import run_zipvoice
+
+ # NEW: ASR + DENOISE
+ from chunkformer import ChunkFormerModel
+ from clearvoice import ClearVoice
+ from proccess_wav import enhance_ref_audio, transcribe_ref_audio
+
+ # (These two warm-up/test lines can be removed if they are not needed)
+ enhanced = enhance_ref_audio("ToαΊ‘i.wav")
+ text = transcribe_ref_audio(enhanced)
+
+
+ def infer_ref_text_ui(ref_audio_path: str) -> str:
+     """
+     Used by the 'Infer Text' button:
+     - Enhance the WAV (ClearVoice + silence handling + trim to 5-10s)
+     - Run ASR on the silence-delimited segments
+     - Fill the result into the Reference Text box
+     """
+     if not ref_audio_path:
+         raise gr.Error("Vui lΓ²ng upload file giọng mαΊ«u trΖ°α»›c khi infer text.")
+
+     try:
+         enhanced = enhance_ref_audio(ref_audio_path)
+         text = transcribe_ref_audio(enhanced)
+     except Exception as e:
+         raise gr.Error(f"Lα»—i khi nhαΊ­n dαΊ‘ng tα»« audio tham chiαΊΏu: {e}")
+
+     if not text:
+         raise gr.Error("KhΓ΄ng nhαΊ­n dαΊ‘ng được nα»™i dung tα»« audio tham chiαΊΏu.")
+     return text
+
+
+ # ======================= PRESET DEMO SAMPLES =======================
+ SAMPLE_CONFIGS = [
+     {
+         "name": "Sample 1 – Kể chuyện",
+         "ref_audio": "ToαΊ‘i.wav",
+         "ref_text": "Trong bΓ³ng tα»‘i, ToαΊ‘i nΓ³i cΓ‘i gΓ¬ Δ‘Γ³ mΓ  Thoan khΓ΄ng nghe thαΊ₯y.",
+         "gen_text": "ĐΓͺm nay trời nhiều mΓ’y, Γ‘nh trΔƒng bα»‹ che khuαΊ₯t, chỉ cΓ²n lαΊ‘i mα»™t dαΊ£i sΓ‘ng yαΊΏu α»›t rΖ‘i xuα»‘ng con đường Δ‘αΊ₯t trαΊ£i dΓ i giα»―a cΓ‘nh Δ‘α»“ng. CαΊ­u bΓ© tΓͺn TΓ­n Δ‘ang dαΊ―t chiαΊΏc xe Δ‘αΊ‘p cΕ© Δ‘i về nhΓ , bΓ‘nh xe bα»‹ cΓ‘n Δ‘inh nΓͺn lΔƒn nαΊ·ng vΓ  chαΊ­m nhΖ° con trΓ’u mệt nhọc sau vα»₯ mΓΉa. GiΓ³ thα»•i lαΊ‘nh buα»‘t, mΓΉi bΓΉn Δ‘αΊ₯t ngai ngΓ‘i quαΊ₯n lαΊ₯y chΓ’n cαΊ­u. Tα»›i Δ‘oαΊ‘n rαΊ½ dαΊ«n vΓ o xΓ³m, TΓ­n nghe tiαΊΏng nΖ°α»›c chαΊ£y khe khαΊ½ tα»« con mΖ°Ζ‘ng bΓͺn đường. TiαΊΏng αΊ₯y vαΊ«n quen thuα»™c, nhΖ°ng tα»‘i nay lαΊ‘i vang khΓ‘c lαΊ‘, nhΖ° cΓ³ giọng người Δ‘ang hΓ²a vΓ o nhα»‹p nΖ°α»›c, lΓΊc trαΊ§m lΓΊc cao, nghe mΖ‘ hα»“ mΓ  lαΊ‘nh sα»‘ng lΖ°ng. CαΊ­u dα»«ng lαΊ‘i, nghiΓͺng tai lαΊ―ng nghe, tim Δ‘αΊ­p nhanh nhΖ° muα»‘n vượt khỏi lα»“ng ngα»±c.",
+         "out_audio": "ToαΊ‘i_output.wav",
+     },
+     {
+         "name": "Sample 2 – Nα»―",
+         "ref_audio": "Trung.wav",
+         "ref_text": "MΓΉa hΓ¨ khΓ΄ng chỉ lΓ  khoαΊ£ng thời gian nghỉ ngΖ‘i, mΓ  cΓ²n lΓ  khoαΊ£ng thời gian tuyệt vời.",
+         "gen_text": "Tα»« cΓ‘c kαΊΏt quαΊ£ nΓ y, chΓΊng tΓ΄i đề xuαΊ₯t rαΊ±ng sα»± kαΊΏt hợp nhuαΊ§n nhuyα»…n giα»―a adaptive optimization, robust training pipelines vΓ  interpretable model design sαΊ½ lΓ  chΓ¬a khΓ³a để phΓ‘t triển cΓ‘c hệ thα»‘ng Γ’y ai vα»«a mαΊ‘nh mαΊ½ vα»«a Δ‘Γ‘ng tin cαΊ­y trong mΓ΄i trường thα»±c tαΊΏ.",
+         "out_audio": "Trung_output.wav",
+     },
+     {
+         "name": "Sample 3 – English",
+         "ref_audio": "T_English.wav",
+         "ref_text": "And turning to the pole which he had dragged, He drew it close beneath the widowed bough, And what was of it unto it left bound.",
+         "gen_text": "Recent experiments indicate that the current model architecture still exhibits significant overfitting, especially when evaluated on out of distribution samples. Although the training accuracy remains consistently high, the performance drops sharply when the model is exposed to noise perturbed inputs, suggesting limited robustness.",
+         "out_audio": "T_English_output.wav",
+     },
+ ]
+
+ # Handler used when a "Use this sample" button is clicked
+ def make_sample_loader(sample):
+     def _load_sample():
+         return (
+             sample["ref_audio"],  # ref_audio -> input Audio
+             sample["ref_text"],  # ref_text -> Textbox
+             sample["gen_text"],  # gen_text -> Textbox
+             sample["out_audio"],  # output_audio -> Audio
+         )
+     return _load_sample
+
+
+ # ======================= CUSTOM STYLE (BRIGHTER LOOK) =======================
+ custom_css = """
+ #app-container {
+     max-width: 1000px;
+     margin: 0 auto;
+ }
+ .gradio-container {
+     background: radial-gradient(circle at top, #ffffff 0, #f9fafb 55%);
+     color: #111827;
+ }
+
+ /* Large title */
+ #title-block h1 {
+     font-size: 2.4rem !important;
+     font-weight: 800 !important;
+     background: linear-gradient(120deg, #f97316, #eab308, #22c55e);
+     -webkit-background-clip: text;
+     color: transparent;
+     text-align: center;
+ }
+ #title-block p {
+     text-align: center;
+     font-size: 0.95rem;
+     color: #6b7280;
+ }
+
+ /* Brighter cards */
+ .sample-card {
+     border-radius: 16px;
+     padding: 16px;
+     background: rgba(255, 255, 255, 0.96);
+     border: 1px solid rgba(148, 163, 184, 0.6);
+     box-shadow: 0 18px 28px rgba(148, 163, 184, 0.35);
+ }
+
+ /* Buttons */
+ button.primary {
+     border-radius: 999px !important;
+     font-weight: 600 !important;
+ }
+
+ /* Tabs */
+ .svelte-1ipelgc, .tabitem {
+     font-weight: 600;
+ }
+ """
+
+ # ======================= TEXT POST-PROCESSING (IF NEEDED) =======================
+ def post_process(text: str) -> str:
+     text = " " + text + " "
+     text = text.replace(" . . ", " . ")
+     text = " " + text + " "
+     text = text.replace(" .. ", " . ")
+     text = " " + text + " "
+     text = text.replace(" , , ", " , ")
+     text = " " + text + " "
+     text = text.replace(" ,, ", " , ")
+     text = " " + text + " "
+     text = text.replace('"', "")
+     return " ".join(text.split())
+
+
+ @spaces.GPU
+ def infer_tts(ref_audio_path, ref_text, gen_text, steps, request: gr.Request = None):
+     if not ref_audio_path:
+         raise gr.Error("Please upload a sample audio file.")
+
+     if not gen_text.strip():
+         raise gr.Error("Please enter the text content to generate voice.")
+
+     # Limit the input length (4000 words)
+     if len(gen_text.split()) > 4000:
+         raise gr.Error("Please enter text content with less than 4000 words.")
+
+     # 1) Enhance the reference audio: ClearVoice + silence handling + trim to 5-10s
+     try:
+         enhanced_ref_audio = enhance_ref_audio(ref_audio_path)
+     except Exception as e:
+         raise gr.Error(f"Lα»—i khi xα»­ lΓ½ audio tham chiαΊΏu: {e}")
+
+     # 2) If no ref_text is given, run ASR on the silence-delimited segments
+     if not ref_text or not ref_text.strip():
+         try:
+             inferred = transcribe_ref_audio(enhanced_ref_audio)
+             if not inferred:
+                 raise gr.Error(
+                     "KhΓ΄ng nhαΊ­n dαΊ‘ng được nα»™i dung tα»« audio tham chiαΊΏu. "
+                     "Vui lΓ²ng nhαΊ­p Reference Text thα»§ cΓ΄ng."
+                 )
+             ref_text = inferred
+             print(f"[ASR] Inferred ref_text: {ref_text}")
+         except gr.Error:
+             raise
+         except Exception as e:
+             raise gr.Error(f"Lα»—i khi tα»± Δ‘α»™ng nhαΊ­n dαΊ‘ng Reference Text: {e}")
+
+     try:
+         out_path = "result.wav"
+
+         run_zipvoice(
+             model_name="zipvoice",
+             prompt_wav=enhanced_ref_audio,  # use the already-processed file
+             prompt_text=ref_text.strip() if ref_text else "xin chΓ o cΓ‘c bαΊ‘n",
+             text=gen_text,
+             res_wav_path=out_path,
+             lang="vi",
+             tokenizer_name="espeak",
+             num_step=steps,
+             seed=123456,
+             speed=1.0,
+         )
+
+         return out_path
+
+     except Exception as e:
+         raise gr.Error(f"Error generating voice: {e}")
+
+
+ # ======================= UI =======================
+ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
+     with gr.Column(elem_id="app-container"):
+         # --------- TITLE ----------
+         gr.Markdown(
+             """
+             <div id="title-block">
+             <h1>🎀 ZipVoice – Zero-shot Vietnamese TTS</h1>
+             <p>Upload mα»™t mαΊ«u giọng + nhαΊ­p nα»™i dung &rarr; hệ thα»‘ng sαΊ½ bαΊ―t chΖ°α»›c giọng nΓ³i vΓ  đọc Δ‘oαΊ‘n text cα»§a bαΊ‘n.</p>
+             </div>
+             """,
+             elem_id="title-block",
+         )
+
+         with gr.Tabs():
+             # A single main tab; the canned demos live inside it as well
+             with gr.TabItem("🎯 Tự tẑo giọng nói"):
+                 # --------- MAIN INPUT / OUTPUT BLOCK ----------
+                 with gr.Row():
+                     with gr.Column(elem_classes=["sample-card"]):
+                         gr.Markdown("#### 1️⃣ TαΊ£i giọng mαΊ«u & nhαΊ­p text")
+
+                         ref_audio = gr.Audio(
+                             label="πŸ”Š Sample Voice (upload hoαΊ·c kΓ©o thαΊ£)",
+                             type="filepath",
+                         )
+
+                         ref_text = gr.Textbox(
+                             label="πŸ“ Reference Text (optional)",
+                             placeholder="Nα»™i dung Δ‘ang được nΓ³i trong file giọng mαΊ«u (nΓͺn tα»± viαΊΏt cho chΓ­nh xΓ‘c)",
+                             lines=3,
+                         )
+
+                         # Button to infer text from the reference audio (ASR + denoising)
+                         btn_infer_text = gr.Button(
+                             "✨ Infer Text từ audio tham chiếu"
+                         )
+
+                         gen_text = gr.Textbox(
+                             label="πŸ“ Text to Generate",
+                             placeholder="NhαΊ­p nα»™i dung tiαΊΏng Việt bαΊ‘n muα»‘n tα»•ng hợp...",
+                             lines=6,
+                         )
+
+                         steps = gr.Slider(
+                             8,
+                             64,
+                             value=25,
+                             step=1,
+                             label="⚑ Step (cΓ ng lα»›n, cΓ ng tα»‘t, cΓ ng lΓ’u)",
+                         )
+
+                         btn_synthesize = gr.Button(
+                             "πŸ”₯ Generate Voice",
+                             variant="primary",
+                         )
+
+                     with gr.Column(elem_classes=["sample-card"]):
+                         gr.Markdown("#### 2️⃣ KαΊΏt quαΊ£ tα»•ng hợp")
+                         output_audio = gr.Audio(
+                             label="🎧 Generated Audio",
+                             type="filepath",
+                         )
+                         gr.Markdown(
+                             """
+                             - BαΊ‘n cΓ³ thể tαΊ£i file `.wav` về sau khi tαΊ‘o.
+                             - NαΊΏu nghe chΖ°a α»•n, hΓ£y thα»­:
+                               - DΓΉng **ref audio ngαΊ―n 3-8s**, phΓ‘t Γ’m chuαΊ©n hΖ‘n.
+                             """
+                         )
+
+                 # wire the Generate button -> infer_tts
+                 btn_synthesize.click(
+                     infer_tts,
+                     inputs=[ref_audio, ref_text, gen_text, steps],
+                     outputs=[output_audio],
+                 )
+
+                 # wire the Infer Text button -> fill ref_text (denoised first)
+                 btn_infer_text.click(
+                     infer_ref_text_ui,
+                     inputs=[ref_audio],
+                     outputs=[ref_text],
+                 )
+
+                 # --------- DEMO BLOCK, DIRECTLY INSIDE THE MAIN TAB ----------
+                 gr.Markdown(
+                     """
+                     ### 🎧 Demo có sạn
+                     Click vΓ o mα»™t sample bΓͺn dΖ°α»›i để tα»± Δ‘α»™ng nαΊ‘p:
+                     - πŸ”Š Giọng mαΊ«u (ref voice)
+                     - πŸ“ Reference text
+                     - πŸ“ Text to generate
+                     - 🎧 Output audio mẫu
+                     """
+                 )
+
+                 for sample in SAMPLE_CONFIGS:
+                     with gr.Column(elem_classes=["sample-card"]):
+                         gr.Markdown(f"### {sample['name']}")
+                         with gr.Row():
+                             gr.Audio(
+                                 value=sample["ref_audio"],
+                                 label="πŸ”Š Reference Voice",
+                                 interactive=False,
+                             )
+                             gr.Textbox(
+                                 value=sample["ref_text"],
+                                 label="πŸ“ Reference Text",
+                                 interactive=False,
+                                 lines=3,
+                             )
+
+                         gr.Audio(
+                             value=sample["out_audio"],
+                             label="🎧 Generated Sample (TTS)",
+                             interactive=False,
+                         )
+
+                         if sample.get("gen_text"):
+                             gr.Markdown(
+                                 f"**Text dΓΉng để synth:** {sample['gen_text']}"
+                             )
+
+                         # This button fills ref_audio, ref_text, gen_text, and output_audio at once
+                         use_btn = gr.Button(f"➑️ Dùng {sample['name']}")
+
+                         use_btn.click(
+                             make_sample_loader(sample),
+                             inputs=[],
+                             outputs=[ref_audio, ref_text, gen_text, output_audio],
+                         )
+
+                 gr.Markdown(
+                     """
+                     ### ⚠️ Model Limitations
+                     1. CΓ³ thể xα»­ lΓ½ chΖ°a tα»‘t vα»›i sα»‘, ngΓ y thΓ‘ng, kΓ½ tα»± Δ‘αΊ·c biệt.
+                     2. Nhα»‹p Δ‘iệu Δ‘Γ΄i khi chΖ°a tα»± nhiΓͺn.
+                     3. ChαΊ₯t lượng phα»₯ thuα»™c khΓ‘ nhiều vΓ o chαΊ₯t lượng ref audio.
+                     """
+                 )
+
+ demo.queue().launch()
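For scripted use outside the Gradio UI, the same synthesis path can be called directly. A minimal sketch, assuming the repo root as working directory, that `download_all_models()` has run, and that the commit's sample files are LFS-pulled; the Vietnamese `text` string is an illustrative input, not from the commit.

```python
# Headless run of the pipeline app.py wires to the Generate button.
from download_models import download_all_models
from infer import run_zipvoice
from proccess_wav import enhance_ref_audio

download_all_models()
prompt = enhance_ref_audio("Trung.wav")  # denoise + trim, as in infer_tts()
run_zipvoice(
    model_name="zipvoice",
    prompt_wav=prompt,
    prompt_text="MΓΉa hΓ¨ khΓ΄ng chỉ lΓ  khoαΊ£ng thời gian nghỉ ngΖ‘i, mΓ  cΓ²n lΓ  khoαΊ£ng thời gian tuyệt vời.",
    text="Xin chΓ o, Δ‘Γ’y lΓ  giọng nΓ³i được tα»•ng hợp bởi ZipVoice.",  # sample input (assumed)
    res_wav_path="result.wav",
    lang="vi",
    tokenizer_name="espeak",
    num_step=25,
    seed=123456,
    speed=1.0,
)
```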
assets/silence.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca5a251f2d1439929f1c6b44d98299e53d402da45306af79cbfab5005501fed9
+ size 4800044
download_models.py ADDED
@@ -0,0 +1,38 @@
+ import os
+ import requests
+
+ MODEL_DIR = "zipvoice_finetune"
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ files = {
+     "iter-525000-avg-2.pt": "https://huggingface.co/datasets/kjanh/demo_zip/resolve/main/epoch-46-all-speak-600h-en-norm.pt",
+     "model.json": "https://huggingface.co/datasets/kjanh/demo_zip/resolve/main/model.json",
+     "tokens.txt": "https://huggingface.co/datasets/kjanh/demo_zip/resolve/main/tokens.txt",
+ }
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ def download_with_token(url, dest_path):
+     if os.path.exists(dest_path):
+         print(f"βœ” File tα»“n tαΊ‘i: {dest_path}")
+         return
+
+     if HF_TOKEN is None:
+         raise RuntimeError("❌ Missing HF_TOKEN in Secrets!")
+
+     print(f"⬇ Downloading {dest_path} ...")
+
+     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+     r = requests.get(url, headers=headers, stream=True)
+     r.raise_for_status()
+
+     with open(dest_path, "wb") as f:
+         for chunk in r.iter_content(1024 * 1024):
+             f.write(chunk)
+
+     print(f"βœ… Downloaded {dest_path}")
+
+ def download_all_models():
+     for filename, url in files.items():
+         dest = os.path.join(MODEL_DIR, filename)
+         download_with_token(url, dest)
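Note that `HF_TOKEN` is read at import time, so the environment variable must be set before the module is imported. A small usage sketch; the token value and the extra `vocab.json` entry/URL are hypothetical placeholders, not files in this commit.

```python
import os
os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder token; set before import

import download_models as dm

# Optionally register an extra (hypothetical) file, then fetch everything;
# files already present in zipvoice_finetune/ are skipped.
dm.files["vocab.json"] = "https://huggingface.co/datasets/kjanh/demo_zip/resolve/main/vocab.json"
dm.download_all_models()
```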
egs/zipvoice/README.md ADDED
@@ -0,0 +1,15 @@
+ # ZipVoice Recipe
+
+ This recipe contains the following examples:
+
+ - Training ZipVoice on Emilia from scratch, see [run_emilia.sh](run_emilia.sh).
+ - Training ZipVoice on LibriTTS from scratch, see [run_libritts.sh](run_libritts.sh).
+ - Training ZipVoice on custom datasets (any language) from scratch, see [run_custom.sh](run_custom.sh).
+ - Fine-tuning pre-trained ZipVoice on custom datasets (any language), see [run_finetune.sh](run_finetune.sh).
+ - Evaluating TTS models with the objective metrics reported in the ZipVoice paper, see [run_eval.sh](run_eval.sh).
+
+ > **NOTE:** [run_emilia.sh](run_emilia.sh) is the most complete example; it covers data preparation, ZipVoice training, ZipVoice-Distill training, ONNX export, and inference with both the PyTorch and ONNX models.
+
+ > **NOTE:** For evaluation, first install the packages from [../../requirements_eval.txt](../../requirements_eval.txt):
+ >
+ > `pip install -r ../../requirements_eval.txt`
egs/zipvoice/conf/zipvoice_base.json ADDED
@@ -0,0 +1,26 @@
+ {
+     "model": {
+         "fm_decoder_downsampling_factor": [1, 2, 4, 2, 1],
+         "fm_decoder_num_layers": [2, 2, 4, 4, 4],
+         "fm_decoder_cnn_module_kernel": [31, 15, 7, 15, 31],
+         "fm_decoder_feedforward_dim": 1536,
+         "fm_decoder_num_heads": 4,
+         "fm_decoder_dim": 512,
+         "text_encoder_num_layers": 4,
+         "text_encoder_feedforward_dim": 512,
+         "text_encoder_cnn_module_kernel": 9,
+         "text_encoder_num_heads": 4,
+         "text_encoder_dim": 192,
+         "query_head_dim": 32,
+         "value_head_dim": 12,
+         "pos_head_dim": 4,
+         "pos_dim": 48,
+         "time_embed_dim": 192,
+         "text_embed_dim": 192,
+         "feat_dim": 100
+     },
+     "feature": {
+         "sampling_rate": 24000,
+         "type": "vocos"
+     }
+ }
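The config splits into a `model` block (architecture hyperparameters) and a `feature` block (vocos features at 24 kHz). A minimal sketch of reading it with the standard `json` module; how the training scripts actually consume it is not shown in this diff, so the access pattern below is an assumption.

```python
import json

# Load the base architecture/feature config shipped with the recipe.
with open("egs/zipvoice/conf/zipvoice_base.json") as f:
    cfg = json.load(f)

model_cfg, feat_cfg = cfg["model"], cfg["feature"]
print(feat_cfg["sampling_rate"])           # 24000
print(model_cfg["fm_decoder_num_layers"])  # [2, 2, 4, 4, 4]
```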
egs/zipvoice/local/pinyin.txt ADDED
@@ -0,0 +1,1550 @@
+ a
+ a1
+ a2
+ a3
+ a4
+ ai1
+ ai2
+ ai3
+ ai4
+ an1
+ an2
+ an3
+ an4
+ ang1
+ ang2
+ ang3
+ ang4
+ ao1
+ ao2
+ ao3
+ ao4
+ ba
+ ba1
+ ba2
+ ba3
+ ba4
+ bai
+ bai1
+ bai2
+ bai3
+ bai4
+ ban
+ ban1
+ ban3
+ ban4
+ bang1
+ bang3
+ bang4
+ bao1
+ bao2
+ bao3
+ bao4
+ bei
+ bei1
+ bei3
+ bei4
+ ben1
+ ben3
+ ben4
+ beng
+ beng1
+ beng2
+ beng3
+ beng4
+ bi1
+ bi2
+ bi3
+ bi4
+ bian
+ bian1
+ bian3
+ bian4
+ biang2
+ biao1
+ biao3
+ biao4
+ bie1
+ bie2
+ bie3
+ bie4
+ bin
+ bin1
+ bin3
+ bin4
+ bing1
+ bing3
+ bing4
+ bo
+ bo1
+ bo2
+ bo3
+ bo4
+ bu1
+ bu2
+ bu3
+ bu4
+ ca1
+ ca3
+ ca4
+ cai1
+ cai2
+ cai3
+ cai4
+ can1
+ can2
+ can3
+ can4
+ cang1
+ cang2
+ cang3
+ cang4
+ cao1
+ cao2
+ cao3
+ cao4
+ ce4
+ cei4
+ cen1
+ cen2
+ ceng1
+ ceng2
+ ceng4
+ cha1
+ cha2
+ cha3
+ cha4
+ chai1
+ chai2
+ chai3
+ chai4
+ chan1
+ chan2
+ chan3
+ chan4
+ chang
+ chang1
+ chang2
+ chang3
+ chang4
+ chao1
+ chao2
+ chao3
+ chao4
+ che1
+ che2
+ che3
+ che4
+ chen
+ chen1
+ chen2
+ chen3
+ chen4
+ cheng1
+ cheng2
+ cheng3
+ cheng4
+ chi
+ chi1
+ chi2
+ chi3
+ chi4
+ chong1
+ chong2
+ chong3
+ chong4
+ chou1
+ chou2
+ chou3
+ chou4
+ chu
+ chu1
+ chu2
+ chu3
+ chu4
+ chua1
+ chua3
+ chua4
+ chuai1
+ chuai2
+ chuai3
+ chuai4
+ chuan1
+ chuan2
+ chuan3
+ chuan4
+ chuang1
+ chuang2
+ chuang3
+ chuang4
+ chui1
+ chui2
+ chui3
+ chui4
+ chun1
+ chun2
+ chun3
+ chuo1
+ chuo4
+ ci1
+ ci2
+ ci3
+ ci4
+ cong1
+ cong2
+ cong3
+ cong4
+ cou1
+ cou2
+ cou3
+ cou4
+ cu1
+ cu2
+ cu3
+ cu4
+ cuan1
+ cuan2
+ cuan4
+ cui
+ cui1
+ cui3
+ cui4
+ cun1
+ cun2
+ cun3
+ cun4
+ cuo1
+ cuo2
+ cuo3
+ cuo4
+ da
+ da1
+ da2
+ da3
+ da4
+ dai
+ dai1
+ dai3
+ dai4
+ dan1
+ dan3
+ dan4
+ dang
+ dang1
+ dang3
+ dang4
+ dao1
+ dao2
+ dao3
+ dao4
+ de
+ de1
+ de2
+ dei1
+ dei3
+ den4
+ deng1
+ deng3
+ deng4
+ di1
+ di2
+ di3
+ di4
+ dia3
+ dian1
+ dian2
+ dian3
+ dian4
+ diao1
+ diao3
+ diao4
+ die1
+ die2
+ die3
+ die4
+ din4
+ ding1
+ ding3
+ ding4
+ diu1
+ dong1
+ dong3
+ dong4
+ dou1
+ dou3
+ dou4
+ du1
+ du2
+ du3
+ du4
+ duan1
+ duan3
+ duan4
+ dui1
+ dui3
+ dui4
+ dun1
+ dun3
+ dun4
+ duo
+ duo1
+ duo2
+ duo3
+ duo4
+ e
+ e1
+ e2
+ e3
+ e4
+ ei1
+ ei2
+ ei3
+ ei4
+ en1
+ en3
+ en4
+ eng1
+ er
+ er2
+ er3
+ er4
+ fa
+ fa1
+ fa2
+ fa3
+ fa4
+ fan1
+ fan2
+ fan3
+ fan4
+ fang
+ fang1
+ fang2
+ fang3
+ fang4
+ fei1
+ fei2
+ fei3
+ fei4
+ fen1
+ fen2
+ fen3
+ fen4
+ feng1
+ feng2
+ feng3
+ feng4
+ fiao4
+ fo2
+ fou1
+ fou2
+ fou3
+ fu
+ fu1
+ fu2
+ fu3
+ fu4
+ ga1
+ ga2
+ ga3
+ ga4
+ gai1
+ gai3
+ gai4
+ gan1
+ gan3
+ gan4
+ gang1
+ gang3
+ gang4
+ gao1
+ gao3
+ gao4
+ ge1
+ ge2
+ ge3
+ ge4
+ gei3
+ gen1
+ gen2
+ gen3
+ gen4
+ geng1
+ geng3
+ geng4
+ gong
+ gong1
+ gong3
+ gong4
+ gou1
+ gou3
+ gou4
+ gu
+ gu1
+ gu2
+ gu3
+ gu4
+ gua1
+ gua2
+ gua3
+ gua4
+ guai1
+ guai3
+ guai4
+ guan1
+ guan3
+ guan4
+ guang
+ guang1
+ guang3
+ guang4
+ gui1
+ gui3
+ gui4
+ gun3
+ gun4
+ guo
+ guo1
+ guo2
+ guo3
+ guo4
+ ha1
+ ha2
+ ha3
+ ha4
+ hai
+ hai1
+ hai2
+ hai3
+ hai4
+ han
+ han1
+ han2
+ han3
+ han4
+ hang1
+ hang2
+ hang3
+ hang4
+ hao1
+ hao2
+ hao3
+ hao4
+ he1
+ he2
+ he3
+ he4
+ hei1
+ hen1
+ hen2
+ hen3
+ hen4
+ heng1
+ heng2
+ heng4
+ hm
+ hng
+ hong1
+ hong2
+ hong3
+ hong4
+ hou1
+ hou2
+ hou3
+ hou4
+ hu
+ hu1
+ hu2
+ hu3
+ hu4
+ hua1
+ hua2
+ hua4
+ huai
+ huai2
+ huai4
+ huan1
+ huan2
+ huan3
+ huan4
+ huang
+ huang1
+ huang2
+ huang3
+ huang4
+ hui
+ hui1
+ hui2
+ hui3
+ hui4
+ hun1
+ hun2
+ hun3
+ hun4
+ huo
+ huo1
+ huo2
+ huo3
+ huo4
+ ji1
+ ji2
+ ji3
+ ji4
+ jia
+ jia1
+ jia2
+ jia3
+ jia4
+ jian
+ jian1
+ jian3
+ jian4
+ jiang
+ jiang1
+ jiang3
+ jiang4
+ jiao
+ jiao1
+ jiao2
+ jiao3
+ jiao4
+ jie
+ jie1
+ jie2
+ jie3
+ jie4
+ jin1
+ jin3
+ jin4
+ jing
+ jing1
+ jing3
+ jing4
+ jiong1
+ jiong3
+ jiong4
+ jiu
+ jiu1
+ jiu2
+ jiu3
+ jiu4
+ ju
+ ju1
+ ju2
+ ju3
+ ju4
+ juan1
+ juan3
+ juan4
+ jue1
+ jue2
+ jue3
+ jue4
+ jun1
+ jun3
+ jun4
+ ka1
+ ka3
+ kai1
+ kai3
+ kai4
+ kan1
+ kan3
+ kan4
+ kang1
+ kang2
+ kang3
+ kang4
+ kao1
+ kao3
+ kao4
+ ke
+ ke1
+ ke2
+ ke3
+ ke4
+ kei1
+ ken1
+ ken3
+ ken4
+ keng1
+ keng3
+ kong1
+ kong3
+ kong4
+ kou1
+ kou3
+ kou4
+ ku1
+ ku2
+ ku3
+ ku4
+ kua1
+ kua3
+ kua4
+ kuai3
+ kuai4
+ kuan1
+ kuan3
+ kuang1
+ kuang2
+ kuang3
+ kuang4
+ kui1
+ kui2
+ kui3
+ kui4
+ kun
+ kun1
+ kun3
+ kun4
+ kuo4
+ la
+ la1
+ la2
+ la3
+ la4
+ lai2
+ lai3
+ lai4
+ lan2
+ lan3
+ lan4
+ lang
+ lang1
+ lang2
+ lang3
+ lang4
+ lao
+ lao1
+ lao2
+ lao3
+ lao4
+ le
+ le1
+ le4
+ lei
+ lei1
+ lei2
+ lei3
+ lei4
+ len4
+ leng1
+ leng2
+ leng3
+ leng4
+ li
+ li1
+ li2
+ li3
+ li4
+ lia3
+ lian2
+ lian3
+ lian4
+ liang
+ liang2
+ liang3
+ liang4
+ liao1
+ liao2
+ liao3
+ liao4
+ lie
+ lie1
+ lie2
+ lie3
+ lie4
+ lin1
+ lin2
+ lin3
+ lin4
+ ling
+ ling1
+ ling2
+ ling3
+ ling4
+ liu1
+ liu2
+ liu3
+ liu4
+ lo
+ long1
+ long2
+ long3
+ long4
+ lou
+ lou1
+ lou2
+ lou3
+ lou4
+ lu
+ lu1
+ lu2
+ lu3
+ lu4
+ luan2
+ luan3
+ luan4
+ lun1
+ lun2
+ lun3
+ lun4
+ luo
+ luo1
+ luo2
+ luo3
+ luo4
+ lv2
+ lv3
+ lv4
+ lve3
+ lve4
+ m1
+ m2
+ m4
+ ma
+ ma1
+ ma2
+ ma3
+ ma4
+ mai2
+ mai3
+ mai4
+ man1
+ man2
+ man3
+ man4
+ mang1
+ mang2
+ mang3
+ mang4
+ mao1
+ mao2
+ mao3
+ mao4
+ me
+ me1
+ mei2
+ mei3
+ mei4
+ men
+ men1
+ men2
+ men4
+ meng
+ meng1
+ meng2
+ meng3
+ meng4
+ mi1
+ mi2
+ mi3
+ mi4
+ mian2
+ mian3
+ mian4
+ miao1
+ miao2
+ miao3
+ miao4
+ mie
+ mie1
+ mie2
+ mie4
+ min
+ min2
+ min3
+ ming
+ ming2
+ ming3
+ ming4
+ miu3
+ miu4
+ mo
+ mo1
+ mo2
+ mo3
+ mo4
+ mou1
+ mou2
+ mou3
+ mou4
+ mu2
+ mu3
+ mu4
+ n
+ n2
+ n3
+ n4
+ na
+ na1
+ na2
+ na3
+ na4
+ nai2
+ nai3
+ nai4
+ nan1
+ nan2
+ nan3
+ nan4
+ nang
+ nang1
+ nang2
+ nang3
+ nang4
+ nao1
+ nao2
+ nao3
+ nao4
+ ne
+ ne2
+ ne4
+ nei2
+ nei3
+ nei4
+ nen4
+ neng2
+ neng3
+ neng4
+ ng
+ ng2
+ ng3
+ ng4
+ ni1
+ ni2
+ ni3
+ ni4
+ nia1
+ nian1
+ nian2
+ nian3
+ nian4
+ niang2
+ niang3
+ niang4
+ niao3
+ niao4
+ nie1
+ nie2
+ nie3
+ nie4
+ nin
+ nin2
+ nin3
+ ning2
+ ning3
+ ning4
+ niu1
+ niu2
+ niu3
+ niu4
+ nong2
+ nong3
+ nong4
+ nou2
+ nou3
+ nou4
+ nu2
+ nu3
+ nu4
+ nuan2
+ nuan3
+ nuan4
+ nun2
+ nun4
+ nuo2
+ nuo3
+ nuo4
+ nv2
+ nv3
+ nv4
+ nve4
+ o
+ o1
+ o2
+ o3
+ o4
+ ou
+ ou1
+ ou2
+ ou3
+ ou4
+ pa1
+ pa2
+ pa3
+ pa4
+ pai1
+ pai2
+ pai3
+ pai4
+ pan1
+ pan2
+ pan3
+ pan4
+ pang1
+ pang2
+ pang3
+ pang4
+ pao1
+ pao2
+ pao3
+ pao4
+ pei1
+ pei2
+ pei3
+ pei4
+ pen1
+ pen2
+ pen3
+ pen4
+ peng1
+ peng2
+ peng3
+ peng4
+ pi1
+ pi2
+ pi3
+ pi4
+ pian1
+ pian2
+ pian3
+ pian4
+ piao1
+ piao2
+ piao3
+ piao4
+ pie1
+ pie3
+ pie4
+ pin1
+ pin2
+ pin3
+ pin4
+ ping1
+ ping2
+ ping3
+ ping4
+ po
+ po1
+ po2
+ po3
+ po4
+ pou1
+ pou2
+ pou3
+ pou4
+ pu
+ pu1
+ pu2
+ pu3
+ pu4
+ qi
+ qi1
+ qi2
+ qi3
+ qi4
+ qia1
+ qia2
+ qia3
+ qia4
+ qian
+ qian1
+ qian2
+ qian3
+ qian4
+ qiang1
+ qiang2
+ qiang3
+ qiang4
+ qiao1
+ qiao2
+ qiao3
+ qiao4
+ qie1
+ qie2
+ qie3
+ qie4
+ qin1
+ qin2
+ qin3
+ qin4
+ qing
+ qing1
+ qing2
+ qing3
+ qing4
+ qiong1
+ qiong2
+ qiong4
+ qiu1
+ qiu2
+ qiu3
+ qiu4
+ qu
+ qu1
+ qu2
+ qu3
+ qu4
+ quan
+ quan1
+ quan2
+ quan3
+ quan4
+ que1
+ que2
+ que4
+ qun1
+ qun2
+ qun3
+ ran2
+ ran3
+ ran4
+ rang1
+ rang2
+ rang3
+ rang4
+ rao2
+ rao3
+ rao4
+ re2
+ re3
+ re4
+ ren2
+ ren3
+ ren4
+ reng1
+ reng2
+ reng4
+ ri4
+ rong
+ rong1
+ rong2
+ rong3
+ rong4
+ rou2
+ rou3
+ rou4
+ ru
+ ru2
+ ru3
+ ru4
+ rua2
+ ruan2
+ ruan3
+ ruan4
+ rui2
+ rui3
+ rui4
+ run2
+ run3
+ run4
+ ruo2
+ ruo4
+ sa
+ sa1
+ sa3
+ sa4
+ sai1
+ sai3
+ sai4
+ san
+ san1
+ san3
+ san4
+ sang1
+ sang3
+ sang4
+ sao1
+ sao3
+ sao4
+ se1
+ se4
+ sen1
+ sen3
+ seng1
+ seng4
+ sha
+ sha1
+ sha2
+ sha3
+ sha4
+ shai1
+ shai3
+ shai4
+ shan1
+ shan2
+ shan3
+ shan4
+ shang
+ shang1
+ shang3
+ shang4
+ shao1
+ shao2
+ shao3
+ shao4
+ she1
+ she2
+ she3
+ she4
+ shei2
+ shen1
+ shen2
+ shen3
+ shen4
+ sheng1
+ sheng2
+ sheng3
+ sheng4
+ shi
+ shi1
+ shi2
+ shi3
+ shi4
+ shou
+ shou1
+ shou2
+ shou3
+ shou4
+ shu1
+ shu2
+ shu3
+ shu4
+ shua1
+ shua3
+ shua4
+ shuai1
+ shuai3
+ shuai4
+ shuan1
+ shuan4
+ shuang1
+ shuang3
+ shuang4
+ shui
+ shui2
+ shui3
+ shui4
+ shun3
+ shun4
+ shuo1
+ shuo2
+ shuo4
+ si
+ si1
+ si2
+ si3
+ si4
+ song1
+ song2
+ song3
+ song4
+ sou1
+ sou3
+ sou4
+ su1
+ su2
+ su3
+ su4
+ suan1
+ suan3
+ suan4
+ sui1
+ sui2
+ sui3
+ sui4
+ sun1
+ sun3
+ sun4
+ suo
+ suo1
+ suo2
+ suo3
+ suo4
+ ta
+ ta1
+ ta2
+ ta3
+ ta4
+ tai
+ tai1
+ tai2
+ tai3
+ tai4
+ tan1
+ tan2
+ tan3
+ tan4
+ tang1
+ tang2
+ tang3
+ tang4
+ tao1
+ tao2
+ tao3
+ tao4
+ te
+ te4
+ tei1
+ teng1
+ teng2
+ teng4
+ ti
+ ti1
+ ti2
+ ti3
+ ti4
+ tian1
+ tian2
+ tian3
+ tian4
+ tiao
+ tiao1
+ tiao2
+ tiao3
+ tiao4
+ tie1
+ tie2
+ tie3
+ tie4
+ ting1
+ ting2
+ ting3
+ ting4
+ tong1
+ tong2
+ tong3
+ tong4
+ tou
+ tou1
+ tou2
+ tou3
+ tou4
+ tu
+ tu1
+ tu2
+ tu3
+ tu4
+ tuan1
+ tuan2
+ tuan3
+ tuan4
+ tui1
+ tui2
+ tui3
+ tui4
+ tun1
+ tun2
+ tun3
+ tun4
+ tuo1
+ tuo2
+ tuo3
+ tuo4
+ wa
+ wa1
+ wa2
+ wa3
+ wa4
+ wai
+ wai1
+ wai3
+ wai4
+ wan1
+ wan2
+ wan3
+ wan4
+ wang1
+ wang2
+ wang3
+ wang4
+ wei
+ wei1
+ wei2
+ wei3
+ wei4
+ wen
+ wen1
+ wen2
+ wen3
+ wen4
+ weng1
+ weng3
+ weng4
+ wo1
+ wo3
+ wo4
+ wong4
+ wu
+ wu1
+ wu2
+ wu3
+ wu4
+ xi1
+ xi2
+ xi3
+ xi4
+ xia1
+ xia2
+ xia3
+ xia4
+ xian
+ xian1
+ xian2
+ xian3
+ xian4
+ xiang1
+ xiang2
+ xiang3
+ xiang4
+ xiao
+ xiao1
+ xiao2
+ xiao3
+ xiao4
+ xie1
+ xie2
+ xie3
+ xie4
+ xin
+ xin1
+ xin2
+ xin3
+ xin4
+ xing
+ xing1
+ xing2
+ xing3
+ xing4
+ xiong1
+ xiong2
+ xiong3
+ xiong4
+ xiu1
+ xiu2
+ xiu3
+ xiu4
+ xu
+ xu1
+ xu2
+ xu3
+ xu4
+ xuan1
+ xuan2
+ xuan3
+ xuan4
+ xue1
+ xue2
+ xue3
+ xue4
+ xun1
+ xun2
+ xun4
+ ya
+ ya1
+ ya2
+ ya3
+ ya4
+ yan1
+ yan2
+ yan3
+ yan4
+ yang
+ yang1
+ yang2
+ yang3
+ yang4
+ yao1
+ yao2
+ yao3
+ yao4
+ ye
+ ye1
+ ye2
+ ye3
+ ye4
+ yi
+ yi1
+ yi2
+ yi3
+ yi4
+ yin
+ yin1
+ yin2
+ yin3
+ yin4
+ ying1
+ ying2
+ ying3
+ ying4
+ yo
+ yo1
+ yong1
+ yong2
+ yong3
+ yong4
+ you
+ you1
+ you2
+ you3
+ you4
+ yu
+ yu1
+ yu2
+ yu3
+ yu4
+ yuan1
+ yuan2
+ yuan3
+ yuan4
+ yue1
+ yue2
+ yue3
+ yue4
+ yun
+ yun1
+ yun2
+ yun3
+ yun4
+ za1
+ za2
+ za3
+ za4
+ zai1
+ zai3
+ zai4
+ zan
+ zan1
+ zan2
+ zan3
+ zan4
+ zang1
+ zang3
+ zang4
+ zao1
+ zao2
+ zao3
+ zao4
+ ze
+ ze2
+ ze4
+ zei2
+ zen
+ zen1
+ zen3
+ zen4
+ zeng1
+ zeng3
+ zeng4
+ zha
+ zha1
+ zha2
+ zha3
+ zha4
+ zhai1
+ zhai2
+ zhai3
+ zhai4
+ zhan1
+ zhan2
+ zhan3
+ zhan4
+ zhang
+ zhang1
+ zhang3
+ zhang4
+ zhao
+ zhao1
+ zhao2
+ zhao3
+ zhao4
+ zhe
+ zhe1
+ zhe2
+ zhe3
+ zhe4
+ zhei4
+ zhen1
+ zhen2
+ zhen3
+ zhen4
+ zheng1
+ zheng3
+ zheng4
+ zhi
+ zhi1
+ zhi2
+ zhi3
+ zhi4
+ zhong1
+ zhong3
+ zhong4
+ zhou1
+ zhou2
+ zhou3
+ zhou4
+ zhu1
+ zhu2
+ zhu3
+ zhu4
+ zhua1
+ zhua3
+ zhuai1
+ zhuai3
+ zhuai4
+ zhuan1
+ zhuan2
+ zhuan3
+ zhuan4
+ zhuang1
+ zhuang3
+ zhuang4
+ zhui1
+ zhui3
+ zhui4
+ zhun1
+ zhun3
+ zhun4
+ zhuo
+ zhuo1
+ zhuo2
+ zhuo4
+ zi
+ zi1
+ zi2
+ zi3
+ zi4
+ zong
+ zong1
+ zong3
+ zong4
+ zou1
+ zou3
+ zou4
+ zu1
+ zu2
+ zu3
+ zu4
+ zuan1
+ zuan3
+ zuan4
+ zui
+ zui1
+ zui2
+ zui3
+ zui4
+ zun1
+ zun2
+ zun3
+ zun4
+ zuo
+ zuo1
+ zuo2
+ zuo3
+ zuo4
+ Γͺ1
+ Γͺ2
+ Γͺ3
+ Γͺ4
egs/zipvoice/local/prepare_emilia.sh ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
4
+ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
5
+ export PYTHONPATH=../../:$PYTHONPATH
6
+
7
+ set -eou pipefail
8
+
9
+ stage=0
10
+ stop_stage=5
11
+ sampling_rate=24000
12
+ nj=32
13
+
14
+ dl_dir=$PWD/download
15
+
16
+ . scripts/parse_options.sh || exit 1
17
+
18
+ # All files generated by this script are saved in "data".
19
+ # You can safely remove "data" and rerun this script to regenerate it.
20
+ mkdir -p data
21
+
22
+ log() {
23
+ # This function is from espnet
24
+ local fname=${BASH_SOURCE[1]##*/}
25
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
26
+ }
27
+
28
+ log "dl_dir: $dl_dir"
29
+
30
+ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
31
+ log "Stage 0: Download data"
32
+
33
+ # Your download directory should look like this:
34
+ #
35
+ # download/Amphion___Emilia
36
+ # β”œβ”€β”€ metafile.yaml
37
+ # β”œβ”€β”€ raw
38
+ # β”‚ β”œβ”€β”€ DE
39
+ # β”‚ β”œβ”€β”€ EN
40
+ # β”‚ β”œβ”€β”€ FR
41
+ # β”‚ β”œβ”€β”€ JA
42
+ # β”‚ β”œβ”€β”€ KO
43
+ # β”‚ β”œβ”€β”€ openemilia_45batches.tar.gz
44
+ # β”‚ β”œβ”€β”€ openemilia_all.tar.gz
45
+ # β”‚ └── ZH
46
+ # └── README.md
47
+
48
+ if [ ! -d $dl_dir/Amphion___Emilia/raw ]; then
49
+ log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
50
+ exit(-1)
51
+ fi
52
+
53
+ fi
54
+
55
+ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
56
+ log "Stage 1: Prepare emilia manifests (EN and ZH only)"
57
+ # We assume that you have downloaded the Emilia corpus
58
+ # to $dl_dir/Amphion___Emilia
59
+ # see stage 0 for the directory structure
60
+ mkdir -p data/manifests
61
+ if [ ! -e data/manifests/.emilia.done ]; then
62
+ lhotse prepare emilia --lang en --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
63
+ lhotse prepare emilia --lang zh --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
64
+ touch data/manifests/.emilia.done
65
+ fi
66
+ fi
67
+
68
+ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
69
+ log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"
70
+ mkdir -p data/manifests/splits_raw
71
+ if [ ! -e data/manifests/split_raw/.emilia.split.done ]; then
72
+ lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
73
+ lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
74
+ touch data/manifests/splits_raw/.emilia.split.done
75
+ fi
76
+
77
+ mkdir -p data/manifests/splits
78
+
79
+ if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
80
+ python3 local/preprocess_emilia.py --subset EN
81
+ python3 local/preprocess_emilia.py --subset ZH
82
+ touch data/manifests/splits/.emilia.preprocess.done
83
+ fi
84
+
85
+ fi
86
+
87
+
88
+ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
89
+ log "Stage 3: Add tokens to manifests"
90
+
91
+ mkdir -p data/manifests/tokenized_splits
92
+
93
+ if [ ! -e data/manifests/tokenized_splits/.emilia.preprocess.done ]; then
94
+ for subset in EN ZH; do
95
+ log "Tokenizing Emilia ${subset}"
96
+ python3 local/prepare_tokens_emilia.py \
97
+ --subset ${subset} \
98
+ --jobs ${nj} \
99
+ --source-dir data/manifests/splits/ \
100
+ --dest-dir data/manifests/tokenized_splits/
101
+ done
102
+ touch data/manifests/tokenized_splits/.emilia.preprocess.done
103
+ fi
104
+
105
+ fi
106
+
107
+ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
108
+ log "Stage 4: Extract Fbank for Emilia"
109
+ mkdir -p data/fbank/emilia_splits
110
+ if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
111
+ # You can speed up the extraction by distributing splits to multiple machines.
112
+ for subset in EN ZH; do
113
+ python3 -m zipvoice.bin.compute_fbank \
114
+ --source-dir data/manifests/tokenized_splits \
115
+ --dest-dir data/fbank/emilia_splits \
116
+ --dataset emilia \
117
+ --subset ${subset} \
118
+ --splits-cuts 1 \
119
+ --split-begin 0 \
120
+ --split-end 2000 \
121
+ --num-jobs ${nj}
122
+ done
123
+ touch data/fbank/emilia_splits/.emilia.fbank.done
124
+ fi
125
+
126
+ if [ ! -e data/fbank/emilia_cuts_EN.jsonl.gz ]; then
127
+ log "Combining EN fbank cuts and spliting EN dev set"
128
+ gunzip -c data/fbank/emilia_splits/emilia_cuts_EN.*.jsonl.gz > data/fbank/emilia_cuts_EN.jsonl
129
+ head -n 1500 data/fbank/emilia_cuts_EN.jsonl | gzip -c > data/fbank/emilia_cuts_EN_dev.jsonl.gz
130
+ sed -i '1,1500d' data/fbank/emilia_cuts_EN.jsonl
131
+ gzip data/fbank/emilia_cuts_EN.jsonl
132
+ fi
133
+
134
+ if [ ! -e data/fbank/emilia_cuts_ZH.jsonl.gz ]; then
135
+ log "Combining ZH fbank cuts and spliting ZH dev set"
136
+ gunzip -c data/fbank/emilia_splits/emilia_cuts_ZH.*.jsonl.gz > data/fbank/emilia_cuts_ZH.jsonl
137
+ head -n 1500 data/fbank/emilia_cuts_ZH.jsonl | gzip -c > data/fbank/emilia_cuts_ZH_dev.jsonl.gz
138
+ sed -i '1,1500d' data/fbank/emilia_cuts_ZH.jsonl
139
+ gzip data/fbank/emilia_cuts_ZH.jsonl
140
+ fi
141
+
142
+ fi
143
+
144
+ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
145
+ log "Stage 5: Generate token file"
146
+ if [ ! -e data/tokens_emilia.txt ]; then
147
+ python3 ./local/prepare_token_file_emilia.py --pinyin local/pinyin.txt --tokens data/tokens_emilia.txt
148
+ fi
149
+ fi
egs/zipvoice/local/prepare_libritts.sh ADDED
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
4
+ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
5
+ export PYTHONPATH=../../:$PYTHONPATH
6
+
7
+ set -eou pipefail
8
+
9
+ stage=0
10
+ stop_stage=5
11
+ sampling_rate=24000
12
+ nj=20
13
+
14
+ dl_dir=$PWD/download
15
+
16
+ . utils/parse_options.sh || exit 1
17
+
18
+ # All files generated by this script are saved in "data".
19
+ # You can safely remove "data" and rerun this script to regenerate it.
20
+ mkdir -p data
21
+
22
+ log() {
23
+ # This function is from espnet
24
+ local fname=${BASH_SOURCE[1]##*/}
25
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
26
+ }
27
+
28
+ log "dl_dir: $dl_dir"
29
+
30
+ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
31
+ log "Stage 0: Download data"
32
+
33
+ # If you have pre-downloaded it to /path/to/LibriTTS,
34
+ # you can create a symlink
35
+ #
36
+ # ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
37
+ #
38
+ if [ ! -d $dl_dir/LibriTTS ]; then
39
+ lhotse download libritts $dl_dir
40
+ fi
41
+
42
+ fi
43
+
44
+ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
45
+ log "Stage 1: Prepare LibriTTS manifest"
46
+ # We assume that you have downloaded the LibriTTS corpus
47
+ # to $dl_dir/LibriTTS
48
+
49
+ # We did not add tokens to this manifest, as on-the-fly
50
+ # tokenization with LibriTTSTokenizer is not slow.
51
+ mkdir -p data/manifests
52
+ if [ ! -e data/manifests/.libritts.done ]; then
53
+ lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
54
+ touch data/manifests/.libritts.done
55
+ fi
56
+ fi
57
+
58
+ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
59
+ log "Stage 2: Compute Fbank for LibriTTS"
60
+ mkdir -p data/fbank
61
+
62
+ if [ ! -e data/fbank/.libritts.done ]; then
63
+ for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
64
+ python3 -m zipvoice.bin.compute_fbank \
65
+ --source-dir data/manifests \
66
+ --dest-dir data/fbank \
67
+ --dataset libritts \
68
+ --subset ${subset} \
69
+ --sampling-rate $sampling_rate \
70
+ --num-jobs ${nj}
71
+ done
72
+ touch data/fbank/.libritts.done
73
+ fi
74
+
75
+ # Here we shuffle and combine the train-clean-100, train-clean-360 and
76
+ # train-other-500 together to form the training set.
77
+ if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
78
+ cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
79
+ <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
80
+ <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
81
+ shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
82
+ fi
83
+
84
+
85
+ if [ ! -e data/fbank/.libritts-validated.done ]; then
86
+ log "Validating data/fbank for LibriTTS"
87
+ python3 ./utils/validate_manifest.py \
88
+ data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
89
+ touch data/fbank/.libritts-validated.done
90
+ fi
91
+ fi
92
+
93
+ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
94
+ log "Stage 3: Generate token file"
95
+ if [ ! -e data/tokens_libritts.txt ]; then
96
+ python3 ./local/prepare_token_file_char.py \
97
+ --manifest data/fbank/libritts_cuts_train-all-shuf.jsonl.gz \
98
+ --tokens data/tokens_libritts.txt
99
+ fi
100
+ fi
egs/zipvoice/local/prepare_token_file_char.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024-2025 Xiaomi Corp. (authors: Wei Kang)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import argparse
19
+ import re
20
+ from collections import Counter
21
+ from pathlib import Path
22
+
23
+ from lhotse import load_manifest_lazy
24
+
25
+
26
+ def get_args():
27
+ parser = argparse.ArgumentParser()
28
+
29
+ parser.add_argument(
30
+ "--tokens",
31
+ type=Path,
32
+ help="Path to the dict that maps the text tokens to IDs",
33
+ )
34
+
35
+ parser.add_argument(
36
+ "--manifest",
37
+ type=Path,
38
+ help="Path to the manifest file",
39
+ )
40
+
41
+ return parser.parse_args()
42
+
43
+
44
+ def prepare_tokens(manifest_file, token_file):
45
+ counter = Counter()
46
+ manifest = load_manifest_lazy(manifest_file)
47
+ for cut in manifest:
48
+ line = re.sub(r"\s+", " ", cut.supervisions[0].text)
49
+ counter.update(line)
50
+
51
+ unique_chars = set(counter.keys())
52
+
53
+ if "_" in unique_chars:
54
+ unique_chars.remove("_")
55
+
56
+ sorted_chars = sorted(unique_chars, key=lambda char: counter[char], reverse=True)
57
+
58
+ result = ["_"] + sorted_chars
59
+
60
+ with open(token_file, "w", encoding="utf-8") as file:
61
+ for index, char in enumerate(result):
62
+ file.write(f"{char}\t{index}\n")
63
+
64
+
65
+ if __name__ == "__main__":
66
+ args = get_args()
67
+ prepare_tokens(args.manifest, args.tokens)
egs/zipvoice/local/prepare_token_file_emilia.py ADDED
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Xiaomi Corp. (authors: Zengwei Yao,
3
+ # Wei Kang)
4
+ #
5
+ # See ../../../../LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+
20
+ """
21
+ This file generates the file that maps tokens to IDs.
22
+ """
23
+
24
+ import argparse
25
+ import logging
26
+ from pathlib import Path
27
+ from typing import List
28
+
29
+ from piper_phonemize import get_espeak_map
30
+ from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials
31
+
32
+
33
+ def get_args():
34
+ parser = argparse.ArgumentParser()
35
+
36
+ parser.add_argument(
37
+ "--tokens",
38
+ type=Path,
39
+ default=Path("data/tokens_emilia.txt"),
40
+ help="Path to the dict that maps the text tokens to IDs",
41
+ )
42
+
43
+ parser.add_argument(
44
+ "--pinyin",
45
+ type=Path,
46
+ default=Path("resources/pinyin.txt"),
47
+ help="Path to the all unique pinyin",
48
+ )
49
+
50
+ return parser.parse_args()
51
+
52
+
53
+ def get_pinyin_tokens(pinyin: Path) -> List[str]:
54
+ phones = set()
55
+ with open(pinyin, "r") as f:
56
+ for line in f:
57
+ x = line.strip()
58
+ initial = to_initials(x, strict=False)
59
+ # don't want to share tokens with espeak tokens, so use tone3 style
60
+ finals = to_finals_tone3(x, strict=False, neutral_tone_with_five=True)
61
+ if initial != "":
62
+ # don't want to share tokens with espeak tokens,
63
+ # so add a '0' after each initial
64
+ phones.add(initial + "0")
65
+ if finals != "":
66
+ phones.add(finals)
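+ # e.g., assuming standard pypinyin behavior: "zhuan4" yields the initial
+ # token "zh0" and the final token "uan4", while the neutral-tone syllable
+ # "zhuo" yields "zh0" and "uo5" (neutral_tone_with_five=True marks the
+ # neutral tone with a trailing "5")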
67
+ return sorted(phones)
68
+
69
+
70
+ def get_token2id(args):
71
+ """Get a dict that maps token to IDs, and save it to the given filename."""
72
+ all_tokens = get_espeak_map() # token: [token_id]
73
+ all_tokens = {token: token_id[0] for token, token_id in all_tokens.items()}
74
+ # sort by token_id
75
+ all_tokens = sorted(all_tokens.items(), key=lambda x: x[1])
76
+
77
+ all_pinyin = get_pinyin_tokens(args.pinyin)
78
+ with open(args.tokens, "w", encoding="utf-8") as f:
79
+ for token, token_id in all_tokens:
80
+ f.write(f"{token}\t{token_id}\n")
81
+ num_espeak_tokens = len(all_tokens)
82
+ for i, pinyin in enumerate(all_pinyin):
83
+ f.write(f"{pinyin}\t{num_espeak_tokens + i}\n")
84
+
85
+
86
+ if __name__ == "__main__":
87
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
88
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
89
+
90
+ args = get_args()
91
+ get_token2id(args)
egs/zipvoice/local/prepare_tokens_emilia.py ADDED
@@ -0,0 +1,88 @@
1
+ """
2
+ This file reads the texts in the given manifest and saves the new cuts with phoneme tokens.
3
+ """
4
+
5
+ import argparse
6
+ import glob
7
+ import logging
8
+ from concurrent.futures import ProcessPoolExecutor as Pool
9
+ from pathlib import Path
10
+
11
+ from lhotse import load_manifest_lazy
12
+
13
+ from zipvoice.tokenizer.tokenizer import add_tokens
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser()
18
+
19
+ parser.add_argument(
20
+ "--subset",
21
+ type=str,
22
+ help="Subset of emilia, (ZH, EN, etc.)",
23
+ )
24
+
25
+ parser.add_argument(
26
+ "--jobs",
27
+ type=int,
28
+ default=50,
29
+ help="Number of jobs to processing.",
30
+ )
31
+
32
+ parser.add_argument(
33
+ "--source-dir",
34
+ type=str,
35
+ default="data/manifests/splits",
36
+ help="The source directory of manifest files.",
37
+ )
38
+
39
+ parser.add_argument(
40
+ "--dest-dir",
41
+ type=str,
42
+ help="The destination directory of manifest files.",
43
+ )
44
+
45
+ return parser.parse_args()
46
+
47
+
48
+ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
49
+ logging.info(f"Processing {file_name}")
50
+ if (output_dir / file_name).is_file():
51
+ logging.info(f"{file_name} exists, skipping.")
52
+ return
53
+
54
+ try:
55
+ cut_set = load_manifest_lazy(input_dir / file_name)
56
+ cut_set = add_tokens(cut_set=cut_set, tokenizer="emilia")
57
+ cut_set.to_file(output_dir / file_name)
58
+ except Exception as e:
59
+ logging.error(f"Manifest {file_name} failed with error: {e}")
60
+ raise
61
+
62
+
63
+ if __name__ == "__main__":
64
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
65
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
66
+
67
+ args = get_args()
68
+
69
+ input_dir = Path(args.source_dir)
70
+ output_dir = Path(args.dest_dir)
71
+ output_dir.mkdir(parents=True, exist_ok=True)
72
+
73
+ cut_files = glob.glob(f"{args.source_dir}/emilia_cuts_{args.subset}.*.jsonl.gz")
74
+
75
+ with Pool(max_workers=args.jobs) as pool:
76
+ futures = [
77
+ pool.submit(
78
+ prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir
79
+ )
80
+ for filename in cut_files
81
+ ]
82
+ for f in futures:
83
+ try:
84
+ f.result()
86
+ except Exception as e:
87
+ logging.error(f"Future failed with error: {e}")
88
+ logging.info("Processing done.")
egs/zipvoice/local/preprocess_emilia.py ADDED
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024-2025 Xiaomi Corp. (authors: Wei Kang)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+
19
+ """
20
+ This file reads the texts in the given manifest and saves the cleaned cuts.
21
+ """
22
+
23
+ import argparse
24
+ import glob
25
+ import logging
26
+ import os
27
+ import re
28
+ import unicodedata
29
+ from concurrent.futures import ProcessPoolExecutor as Pool
30
+ from pathlib import Path
31
+
32
+ from lhotse import load_manifest_lazy
33
+
34
+
35
+ def get_args():
36
+ parser = argparse.ArgumentParser()
37
+
38
+ parser.add_argument(
39
+ "--subset",
40
+ type=str,
41
+ help="Subset of emilia, (ZH, EN, etc.)",
42
+ )
43
+
44
+ parser.add_argument(
45
+ "--jobs",
46
+ type=int,
47
+ default=20,
48
+ help="Number of jobs to processing.",
49
+ )
50
+
51
+ parser.add_argument(
52
+ "--source-dir",
53
+ type=str,
54
+ default="data/manifests/splits_raw",
55
+ help="The source directory of manifest files.",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "--dest-dir",
60
+ type=str,
61
+ default="data/manifests/splits",
62
+ help="The destination directory of manifest files.",
63
+ )
64
+
65
+ return parser.parse_args()
66
+
67
+
68
+ def tokenize_by_CJK_char(text: str) -> list:
69
+ """
70
+ Tokenize a line of text with CJK char.
71
+
72
+ Example:
73
+ input = "δ½ ε₯½δΈ–η•Œζ˜― hello world ηš„δΈ­ζ–‡"
74
+ output = ["δ½ ", "ε₯½", "δΈ–", "η•Œ", "是", "hello", "world", "ηš„", "δΈ­", "ζ–‡"]
75
+ """
76
+ pattern = re.compile(
77
+ r"([\u1100-\u11ff"
78
+ r"\u2e80-\ua4cf"
79
+ r"\ua840-\uD7AF"
80
+ r"\uF900-\uFAFF"
81
+ r"\uFE30-\uFE4F"
82
+ r"\uFF65-\uFFDC"
83
+ r"\U00020000-\U0002FFFF])"
84
+ )
85
+ chars = pattern.split(text.strip())
86
+ merged = " ".join([w.strip() for w in chars if w.strip()])
87
+ return merged.split()
88
+
89
+
90
+ def is_hangul(char):
91
+ letters = unicodedata.normalize("NFD", char)
92
+ return all(
93
+ ["\u1100" <= c <= "\u11ff" or "\u3131" <= c <= "\u318e" for c in letters]
94
+ )
95
+
96
+
97
+ def is_japanese(char):
98
+ return any(
99
+ [
100
+ start <= char <= end
101
+ for start, end in [
102
+ ("\u3041", "\u3096"),
103
+ ("\u30a0", "\u30ff"),
104
+ ("\uff5f", "\uff9f"),
105
+ ("\u31f0", "\u31ff"),
106
+ ("\u3220", "\u3243"),
107
+ ("\u3280", "\u337f"),
108
+ ]
109
+ ]
110
+ )
111
+
112
+
113
+ def is_chinese(char):
114
+ if char >= "\u4e00" and char <= "\u9fa5":
115
+ return True
116
+ else:
117
+ return False
118
+
119
+
120
+ def is_alphabet(char):
121
+ if (char >= "\u0041" and char <= "\u005a") or (
122
+ char >= "\u0061" and char <= "\u007a"
123
+ ):
124
+ return True
125
+ else:
126
+ return False
127
+
128
+
129
+ def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path):
130
+ logging.info(f"Processing {file_name}")
131
+ if (output_dir / file_name).is_file():
132
+ logging.info(f"{file_name} exists, skipping.")
133
+ return
134
+
135
+ def _filter_cut(cut):
136
+ text = cut.supervisions[0].text
137
+ duration = cut.supervisions[0].duration
138
+ chinese = []
139
+ english = []
140
+
141
+ # only contains chinese and space and alphabets
142
+ clean_chars = []
143
+ for x in text:
144
+ if is_hangul(x):
145
+ logging.warning(f"Delete cut with text containing Korean : {text}")
146
+ return False
147
+ if is_japanese(x):
148
+ logging.warning(f"Delete cut with text containing Japanese : {text}")
149
+ return False
150
+ if is_chinese(x):
151
+ chinese.append(x)
152
+ clean_chars.append(x)
153
+ if is_alphabet(x):
154
+ english.append(x)
155
+ clean_chars.append(x)
156
+ if x == " ":
157
+ clean_chars.append(x)
158
+ if len(english) + len(chinese) == 0:
159
+ logging.warning(f"Delete cut with text has no valid chars : {text}")
160
+ return False
161
+
162
+ words = tokenize_by_CJK_char("".join(clean_chars))
163
+ for i in range(len(words) - 10):
164
+ if words[i : i + 10].count(words[i]) == 10:
165
+ logging.warning(f"Delete cut with text with too much repeats : {text}")
166
+ return False
167
+ # word speed, 20 - 600 / minute
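+ # e.g. a 30-word cut must last between 30 / 600 * 60 = 3 s and
+ # 30 / 20 * 60 = 90 s; anything outside that range is treated as a
+ # text-audio mismatch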
168
+ if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
169
+ logging.warning(
170
+ f"Delete cut with audio text mismatch, duration : {duration}s, "
171
+ f"words : {len(words)}, text : {text}"
172
+ )
173
+ return False
174
+ return True
175
+
176
+ try:
177
+ cut_set = load_manifest_lazy(input_dir / file_name)
178
+ cut_set = cut_set.filter(_filter_cut)
179
+ cut_set.to_file(output_dir / file_name)
180
+ except Exception as e:
181
+ logging.error(f"Manifest {file_name} failed with error: {e}")
182
+ # Remove the possibly half-written output so that a rerun redoes this split.
+ if (output_dir / file_name).is_file():
+ os.remove(str(output_dir / file_name))
183
+
184
+
185
+ if __name__ == "__main__":
186
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
187
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
188
+
189
+ args = get_args()
190
+
191
+ input_dir = Path(args.source_dir)
192
+ output_dir = Path(args.dest_dir)
193
+ output_dir.mkdir(parents=True, exist_ok=True)
194
+
195
+ cut_files = glob.glob(f"{args.source_dir}/emilia_cuts_{args.subset}.*.jsonl.gz")
196
+
197
+ with Pool(max_workers=args.jobs) as pool:
198
+ futures = [
199
+ pool.submit(
200
+ preprocess_emilia,
201
+ filename.split("/")[-1],
202
+ input_dir,
203
+ output_dir,
204
+ )
205
+ for filename in cut_files
206
+ ]
207
+ for f in futures:
208
+ f.result()
210
+ logging.info("Processing done.")
egs/zipvoice/run_custom.sh ADDED
@@ -0,0 +1,138 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of training ZipVoice on your custom datasets from scratch.
4
+
5
+ # Add project root to PYTHONPATH
6
+ export PYTHONPATH=../../:$PYTHONPATH
7
+
8
+ # Set bash to 'debug' mode, it will exit on:
9
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
10
+ set -e
11
+ set -u
12
+ set -o pipefail
13
+
14
+ stage=1
15
+ stop_stage=6
16
+
17
+ # Number of jobs for data preparation
18
+ nj=20
19
+
20
+ # You can set `train_hours` and `max_len` according to statistics from
21
+ # the command `lhotse cut describe data/fbank/custom_cuts_train.jsonl.gz`.
22
+ # Set `train_hours` to "Total speech duration", and set `max_len` to 99% duration.
23
+
24
+ # Number of hours in training set, will affect the learning rate schedule
25
+ train_hours=500
26
+ # Maximum length (seconds) of the training utterance, will filter out longer utterances
27
+ max_len=20
28
+
29
+ # We suppose you have two TSV files: "data/raw/custom_train.tsv" and
30
+ # "data/raw/custom_dev.tsv", where "custom" is your dataset name,
31
+ # "train"/"dev" are used for training and validation respectively.
32
+
33
+ # Each line of the TSV files should be in one of the following formats:
34
+ # (1) `{uniq_id}\t{text}\t{wav_path}` if the text corresponds to the full wav,
35
+ # (2) `{uniq_id}\t{text}\t{wav_path}\t{start_time}\t{end_time}` if text corresponds
36
+ # to part of the wav. The start_time and end_time specify the start and end
37
+ # times of the text within the wav, which should be in seconds.
38
+ # > Note: {uniq_id} must be unique for each line.
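+ # A hypothetical example (with \t denoting the TAB separator; paths are
+ # placeholders):
+ #   utt_0001\tfirst sentence text\t/path/to/utt_0001.wav
+ #   utt_0002\tsecond sentence text\t/path/to/long_recording.wav\t3.25\t7.80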
39
+ for subset in train dev;do
40
+ file_path=data/raw/custom_${subset}.tsv
41
+ [ -f "$file_path" ] || { echo "Error: expect $file_path !" >&2; exit 1; }
42
+ done
43
+
44
+ ### Prepare the training data (1 - 3)
45
+
46
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
47
+ echo "Stage 1: Prepare manifests for custom dataset from tsv files"
48
+
49
+ for subset in train dev;do
50
+ python3 -m zipvoice.bin.prepare_dataset \
51
+ --tsv-path data/raw/custom_${subset}.tsv \
52
+ --prefix custom \
53
+ --subset ${subset} \
54
+ --num-jobs ${nj} \
55
+ --output-dir data/manifests
56
+ done
57
+ # The output manifest files are "data/manifests/custom_cuts_train.jsonl.gz".
58
+ # and "data/manifests/custom_cuts_dev.jsonl.gz".
59
+
60
+ # We did not add tokens to the manifests, as on-the-fly tokenization
61
+ # with the simple tokenizer used in this example is not slow.
62
+ # If you change to a complex tokenizer, e.g., with g2p and heavy text normalization,
63
+ # you may need to add tokens to the manifests to speed up the training.
64
+ # Refer to the fine-tuning example for adding tokens to the manifests.
65
+ fi
66
+
67
+ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
68
+ echo "Stage 2: Compute Fbank for custom dataset"
69
+ # You can skip this step and use `--on-the-fly-feats 1` in training stage
70
+ for subset in train dev; do
71
+ python3 -m zipvoice.bin.compute_fbank \
72
+ --source-dir data/manifests \
73
+ --dest-dir data/fbank \
74
+ --dataset custom \
75
+ --subset ${subset} \
76
+ --num-jobs ${nj}
77
+ done
78
+ fi
79
+
80
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
81
+ echo "Stage 3: Prepare tokens file for custom dataset"
82
+ # In this example, we use the simplest tokenizer that
83
+ # treat every character as a token.
84
+ python3 ./local/prepare_token_file_char.py \
85
+ --manifest data/manifests/custom_cuts_train.jsonl.gz \
86
+ --tokens data/tokens_custom.txt
87
+ fi
88
+
89
+
90
+ ### Training (4 - 5)
91
+
92
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
93
+ echo "Stage 4: Train the ZipVoice model"
94
+
95
+ [ -z "$train_hours" ] && { echo "Error: train_hours is not set!" >&2; exit 1; }
96
+ [ -z "$max_len" ] && { echo "Error: max_len is not set!" >&2; exit 1; }
97
+
98
+ # lr-hours will be set according to the `train_hours`,
99
+ # i.e., lr_hours = 1000 * (train_hours ** 0.3).
100
+ lr_hours=$(python3 -c "print(round(1000 * ($train_hours ** 0.3)))" )
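+ # e.g. with the default train_hours=500 above: round(1000 * 500 ** 0.3) = 6452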
101
+ python3 -m zipvoice.bin.train_zipvoice \
102
+ --world-size 4 \
103
+ --use-fp16 1 \
104
+ --num-iters 60000 \
105
+ --max-duration 500 \
106
+ --lr-hours ${lr_hours} \
107
+ --max-len ${max_len} \
108
+ --model-config conf/zipvoice_base.json \
109
+ --tokenizer simple \
110
+ --token-file data/tokens_custom.txt \
111
+ --dataset custom \
112
+ --train-manifest data/fbank/custom_cuts_train.jsonl.gz \
113
+ --dev-manifest data/fbank/custom_cuts_dev.jsonl.gz \
114
+ --exp-dir exp/zipvoice_custom
115
+ fi
116
+
117
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
118
+ echo "Stage 5: Average the checkpoints for ZipVoice"
119
+ python3 -m zipvoice.bin.generate_averaged_model \
120
+ --iter 60000 \
121
+ --avg 2 \
122
+ --model-name zipvoice \
123
+ --exp-dir exp/zipvoice_custom
124
+ # The generated model is exp/zipvoice_custom/iter-60000-avg-2.pt
125
+ fi
126
+
127
+ ### Inference with PyTorch models (6)
128
+
129
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
130
+ echo "Stage 6: Inference of the ZipVoice model"
131
+ python3 -m zipvoice.bin.infer_zipvoice \
132
+ --model-name zipvoice \
133
+ --model-dir exp/zipvoice_custom \
134
+ --checkpoint-name iter-60000-avg-2.pt \
135
+ --tokenizer simple \
136
+ --test-list test.tsv \
137
+ --res-dir results/test_custom
138
+ fi
egs/zipvoice/run_emilia.sh ADDED
@@ -0,0 +1,178 @@
1
+ #!/bin/bash
2
+
3
+ # This is an example script for training ZipVoice on the Emilia dataset.
4
+
5
+ # This script covers data preparation, ZipVoice training,
6
+ # ZipVoice-Distill training, ONNX export, and
7
+ # inference with all PyTorch and ONNX models.
8
+
9
+
10
+ # Add project root to PYTHONPATH
11
+ export PYTHONPATH=../../:$PYTHONPATH
12
+
13
+ # Set bash to 'debug' mode, it will exit on :
14
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
15
+ set -e
16
+ set -u
17
+ set -o pipefail
18
+
19
+ stage=1
20
+ stop_stage=12
21
+
22
+ #### Prepare datasets (1)
23
+
24
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
25
+ echo "Stage 1: Data Preparation for Emilia dataset"
26
+ bash local/prepare_emilia.sh
27
+ fi
28
+
29
+ ### Training ZipVoice (2 - 3)
30
+
31
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
32
+ echo "Stage 2: Train the ZipVoice model"
33
+ python3 -m zipvoice.bin.train_zipvoice \
34
+ --world-size 8 \
35
+ --use-fp16 1 \
36
+ --num-epochs 11 \
37
+ --max-duration 500 \
38
+ --lr-hours 30000 \
39
+ --model-config conf/zipvoice_base.json \
40
+ --tokenizer emilia \
41
+ --token-file data/tokens_emilia.txt \
42
+ --dataset emilia \
43
+ --manifest-dir data/fbank \
44
+ --exp-dir exp/zipvoice
45
+ fi
46
+
47
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
48
+ echo "Stage 3: Average the checkpoints for ZipVoice"
49
+ python3 -m zipvoice.bin.generate_averaged_model \
50
+ --epoch 11 \
51
+ --avg 4 \
52
+ --model-name zipvoice \
53
+ --exp-dir exp/zipvoice
54
+ # The generated model is exp/zipvoice/epoch-11-avg-4.pt
55
+ fi
56
+
57
+ #### (Optional) Training ZipVoice-Distill model (4 - 6)
58
+
59
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
60
+ echo "Stage 4: Train the ZipVoice-Distill model (first stage)"
61
+ python3 -m zipvoice.bin.train_zipvoice_distill \
62
+ --world-size 8 \
63
+ --use-fp16 1 \
64
+ --num-iters 60000 \
65
+ --max-duration 500 \
66
+ --base-lr 0.0005 \
67
+ --tokenizer emilia \
68
+ --token-file data/tokens_emilia.txt \
69
+ --dataset emilia \
70
+ --manifest-dir data/fbank \
71
+ --teacher-model exp/zipvoice/epoch-11-avg-4.pt \
72
+ --distill-stage first \
73
+ --exp-dir exp/zipvoice_distill_1stage
74
+ fi
75
+
76
+
77
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
78
+ echo "Stage 5: Average the checkpoints for ZipVoice-Distill (first stage)"
79
+ python3 -m zipvoice.bin.generate_averaged_model \
80
+ --iter 60000 \
81
+ --avg 7 \
82
+ --model-name zipvoice_distill \
83
+ --exp-dir exp/zipvoice_distill_1stage
84
+ # The generated model is exp/zipvoice_distill_1stage/iter-60000-avg-7.pt
85
+ fi
86
+
87
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
88
+ echo "Stage 6: Train the ZipVoice-Distill model (second stage)"
89
+
90
+ python3 -m zipvoice.bin.train_zipvoice_distill \
91
+ --world-size 8 \
92
+ --use-fp16 1 \
93
+ --num-iters 2000 \
94
+ --save-every-n 1000 \
95
+ --max-duration 500 \
96
+ --base-lr 0.0001 \
97
+ --model-config conf/zipvoice_base.json \
98
+ --tokenizer emilia \
99
+ --token-file data/tokens_emilia.txt \
100
+ --dataset emilia \
101
+ --manifest-dir data/fbank \
102
+ --teacher-model exp/zipvoice_distill_1stage/iter-60000-avg-7.pt \
103
+ --distill-stage second \
104
+ --exp-dir exp/zipvoice_distill
105
+ fi
106
+
107
+ ### Export ONNX model (7 - 8)
108
+
109
+ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
110
+ echo "Stage 7: Export ZipVoice ONNX model"
111
+ python3 -m zipvoice.bin.onnx_export \
112
+ --model-name zipvoice \
113
+ --model-dir exp/zipvoice/ \
114
+ --checkpoint-name epoch-11-avg-4.pt \
115
+ --onnx-model-dir exp/zipvoice/
116
+ fi
117
+
118
+ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
119
+ echo "Stage 8: Export ZipVoice-Distill ONNX model"
120
+ python3 -m zipvoice.bin.onnx_export \
121
+ --model-name zipvoice_distill \
122
+ --model-dir exp/zipvoice_distill/ \
123
+ --checkpoint-name checkpoint-2000.pt \
124
+ --onnx-model-dir exp/zipvoice_distill/
125
+ fi
126
+
127
+
128
+ ### Inference with PyTorch and ONNX models (9 - 12)
129
+
130
+ if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
131
+ echo "Stage 9: Inference of the ZipVoice model"
132
+ python3 -m zipvoice.bin.infer_zipvoice \
133
+ --model-name zipvoice \
134
+ --model-dir exp/zipvoice/ \
135
+ --checkpoint-name epoch-11-avg-4.pt \
136
+ --tokenizer emilia \
137
+ --test-list test.tsv \
138
+ --res-dir results/test \
139
+ --num-step 16 \
140
+ --guidance-scale 1
141
+ fi
142
+
143
+
144
+ if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
145
+ echo "Stage 10: Inference of the ZipVoice-Distill model"
146
+ python3 -m zipvoice.bin.infer_zipvoice \
147
+ --model-name zipvoice_distill \
148
+ --model-dir exp/zipvoice_distill/ \
149
+ --checkpoint-name checkpoint-2000.pt \
150
+ --tokenizer emilia \
151
+ --test-list test.tsv \
152
+ --res-dir results/test_distill \
153
+ --num-step 8 \
154
+ --guidance-scale 3
155
+ fi
156
+
157
+
158
+ if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ]; then
159
+ echo "Stage 11: Inference with ZipVoice ONNX model"
160
+ python3 -m zipvoice.bin.infer_zipvoice_onnx \
161
+ --model-name zipvoice \
162
+ --onnx-int8 False \
163
+ --model-dir exp/zipvoice \
164
+ --tokenizer emilia \
165
+ --test-list test.tsv \
166
+ --res-dir results/test_onnx
167
+ fi
168
+
169
+ if [ ${stage} -le 12 ] && [ ${stop_stage} -ge 12 ]; then
170
+ echo "Stage 12: Inference with ZipVoic-Distill ONNX model"
171
+ python3 -m zipvoice.bin.infer_zipvoice_onnx \
172
+ --model-name zipvoice_distill \
173
+ --onnx-int8 False \
174
+ --model-dir exp/zipvoice_distill \
175
+ --tokenizer emilia \
176
+ --test-list test.tsv \
177
+ --res-dir results/test_distill_onnx
178
+ fi
egs/zipvoice/run_eval.sh ADDED
@@ -0,0 +1,142 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of evaluating TTS models with the objective metrics reported in the ZipVoice paper.
4
+
5
+ # Add project root to PYTHONPATH
6
+ export PYTHONPATH=../../:$PYTHONPATH
7
+
8
+ # Set bash to 'debug' mode, it will exit on:
9
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
10
+ set -e
11
+ set -u
12
+ set -o pipefail
13
+
14
+ stage=1
15
+ stop_stage=7
16
+
17
+ download_dir=download/
18
+
19
+ # Uncomment this line to use HF mirror
20
+ # export HF_ENDPOINT=https://hf-mirror.com
21
+
22
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
23
+ echo "Stage 1: Download test sets (LibriSpeech-PC and Seed-TTS)"
24
+
25
+ hf_repo=k2-fsa/TTS_eval_datasets
26
+ mkdir -p ${download_dir}/
27
+ for file in librispeech_pc_testset.tar.gz seedtts_testset.tar.gz; do
28
+ echo "Downloading ${file}..."
29
+ huggingface-cli download \
30
+ --repo-type dataset \
31
+ --local-dir ${download_dir}/ \
32
+ ${hf_repo} \
33
+ ${file}
34
+ echo "Extracting ${file}..."
35
+ tar -xzf ${download_dir}/${file} -C ${download_dir}/
36
+ done
37
+ fi
38
+
39
+
40
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
41
+ echo "Stage 2: Download all required evaluation models"
42
+ hf_repo=k2-fsa/TTS_eval_models
43
+ mkdir -p ${download_dir}/tts_eval_models
44
+ huggingface-cli download \
45
+ --local-dir ${download_dir}/tts_eval_models \
46
+ ${hf_repo}
47
+ fi
48
+
49
+
50
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
51
+ echo "Stage 3: Inference with the pre-trained ZipVoice model from huggingface"
52
+
53
+ for testset in librispeech_pc seedtts_en seedtts_zh; do
54
+
55
+ if [ "$testset" = "librispeech_pc" ]; then
56
+ test_tsv=${download_dir}/librispeech_pc_testset/test.tsv
57
+
58
+ elif [ "$testset" = "seedtts_en" ]; then
59
+ test_tsv=${download_dir}/seedtts_testset/en/test.tsv
60
+ elif [ "$testset" = "seedtts_zh" ]; then
61
+ test_tsv=${download_dir}/seedtts_testset/zh/test.tsv
62
+ else
63
+ echo "Error: unknown testset ${testset}" >&2
64
+ exit 1
65
+ fi
66
+ echo "Inference on tetset ${testset}..."
67
+ python3 -m zipvoice.bin.infer_zipvoice \
68
+ --model-name zipvoice \
69
+ --test-list ${test_tsv} \
70
+ --res-dir results/${testset}
71
+ done
72
+ fi
73
+
74
+
75
+
76
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
77
+ echo "Stage 4: Evaluation on LibriSpeech-PC"
78
+ model_path=${download_dir}/tts_eval_models
79
+ wav_path=results/librispeech_pc
80
+ test_tsv=${download_dir}/librispeech_pc_testset/test.tsv
81
+ # Use LibriSpeech style transcripts for WER evaluation
82
+ transcript_tsv=${download_dir}/librispeech_pc_testset/transcript.tsv
83
+
84
+ python3 -m zipvoice.eval.speaker_similarity.sim \
85
+ --wav-path ${wav_path} \
86
+ --test-list ${test_tsv} \
87
+ --model-dir ${model_path}
88
+
89
+ python3 -m zipvoice.eval.wer.hubert \
90
+ --wav-path ${wav_path} \
91
+ --test-list ${transcript_tsv} \
92
+ --model-dir ${model_path}
93
+
94
+ python3 -m zipvoice.eval.mos.utmos \
95
+ --wav-path ${wav_path} \
96
+ --model-dir ${model_path}
97
+ fi
98
+
99
+
100
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
101
+ echo "Stage 5: Evaluation on Seed-TTS test en"
102
+ model_path=${download_dir}/tts_eval_models
103
+ wav_path=results/seedtts_en
104
+ test_tsv=${download_dir}/seedtts_testset/en/test.tsv
105
+
106
+ python3 -m zipvoice.eval.speaker_similarity.sim \
107
+ --wav-path ${wav_path} \
108
+ --test-list ${test_tsv} \
109
+ --model-dir ${model_path}
110
+
111
+ python3 -m zipvoice.eval.wer.seedtts \
112
+ --wav-path ${wav_path} \
113
+ --test-list ${test_tsv} \
114
+ --model-dir ${model_path} \
115
+ --lang en
116
+
117
+ python3 -m zipvoice.eval.mos.utmos \
118
+ --wav-path ${wav_path} \
119
+ --model-dir ${model_path}
120
+ fi
121
+
122
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
123
+ echo "Stage 6: Evaluation on Seed-TTS test en"
124
+ model_path=${download_dir}/tts_eval_models
125
+ wav_path=results/seedtts_zh
126
+ test_tsv=${download_dir}/seedtts_testset/zh/test.tsv
127
+
128
+ python3 -m zipvoice.eval.speaker_similarity.sim \
129
+ --wav-path ${wav_path} \
130
+ --test-list ${test_tsv} \
131
+ --model-dir ${model_path}
132
+
133
+ python3 -m zipvoice.eval.wer.seedtts \
134
+ --wav-path ${wav_path} \
135
+ --test-list ${test_tsv} \
136
+ --model-dir ${model_path} \
137
+ --lang zh
138
+
139
+ python3 -m zipvoice.eval.mos.utmos \
140
+ --wav-path ${wav_path} \
141
+ --model-dir ${model_path}
142
+ fi
egs/zipvoice/run_finetune.sh ADDED
@@ -0,0 +1,175 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of fine-tuning ZipVoice on your custom datasets.
4
+
5
+ # Add project root to PYTHONPATH
6
+ # export PYTHONPATH=../../:$PYTHONPATH
7
+
8
+ # Set bash to 'debug' mode, it will exit on:
9
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
10
+ set -e
11
+ set -u
12
+ set -o pipefail
13
+
14
+ stage=1
15
+ stop_stage=6
16
+
17
+ # Number of jobs for data preparation
18
+ nj=4
19
+
20
+ # Whether the language of the training data is Chinese or English
21
+ is_zh_en=0
22
+
23
+ # Language identifier, used when language is not Chinese or English
24
+ # see https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md
25
+ # Example of French: lang=fr
26
+ lang=vi
27
+
28
+ if [ $is_zh_en -eq 1 ]; then
29
+ tokenizer=emilia
30
+ else
31
+ tokenizer=espeak
32
+ [ "$lang" = "default" ] && { echo "Error: lang is not set!" >&2; exit 1; }
33
+ fi
34
+
35
+ # You can set `max_len` according to statistics from the command
36
+ # `lhotse cut describe data/fbank/custom_cuts_train.jsonl.gz`.
37
+ # Set `max_len` to the 99th-percentile (99%) duration.
38
+
39
+ # Maximum length (seconds) of the training utterance, will filter out longer utterances
40
+ max_len=25
41
+
42
+ # Download directory for pre-trained models
43
+ download_dir=download
44
+
45
+ # We suppose you have two TSV files: "data/raw/custom_train.tsv" and
46
+ # "data/raw/custom_dev.tsv", where "custom" is your dataset name,
47
+ # "train"/"dev" are used for training and validation respectively.
48
+
49
+ # Each line of the TSV files should be in one of the following formats:
50
+ # (1) `{uniq_id}\t{text}\t{wav_path}` if the text corresponds to the full wav,
51
+ # (2) `{uniq_id}\t{text}\t{wav_path}\t{start_time}\t{end_time}` if text corresponds
52
+ # to part of the wav. The start_time and end_time specify the start and end
53
+ # times of the text within the wav, which should be in seconds.
54
+ # > Note: {uniq_id} must be unique for each line.
55
+ # for subset in train dev;do
56
+ # file_path=data/raw/custom_${subset}.tsv
57
+ # [ -f "$file_path" ] || { echo "Error: expect $file_path !" >&2; exit 1; }
58
+ # done
59
+
60
+ # ### Prepare the training data (1 - 4)
61
+
62
+ # if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
63
+ # echo "Stage 1: Prepare manifests for custom dataset from tsv files"
64
+
65
+ # for subset in train dev;do
66
+ # python3 -m zipvoice.bin.prepare_dataset \
67
+ # --tsv-path data/raw/custom_${subset}.tsv \
68
+ # --prefix custom-finetune \
69
+ # --subset raw_${subset} \
70
+ # --num-jobs ${nj} \
71
+ # --output-dir data/manifests
72
+ # done
73
+ # # The output manifest files are "data/manifests/custom-finetune_cuts_raw_train.jsonl.gz".
74
+ # # and "data/manifests/custom-finetune_cuts_raw_dev.jsonl.gz".
75
+ # fi
76
+
77
+
78
+ # if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
79
+ # echo "Stage 2: Add tokens to manifests"
80
+ # # For "emilia" and "espeak" tokenizers, it's better to prepare the tokens
81
+ # # before training. Otherwise, the on-the-fly tokenization can significantly
82
+ # # slow down the training.
83
+ # for subset in train dev;do
84
+ # python3 -m zipvoice.bin.prepare_tokens \
85
+ # --input-file data/manifests/custom-finetune_cuts_raw_${subset}.jsonl.gz \
86
+ # --output-file data/manifests/custom-finetune_cuts_${subset}.jsonl.gz \
87
+ # --tokenizer ${tokenizer} \
88
+ # --lang ${lang}
89
+ # done
90
+ # # The output manifest files are "data/manifests/custom-finetune_cuts_train.jsonl.gz".
91
+ # # and "data/manifests/custom-finetune_cuts_dev.jsonl.gz".
92
+ # fi
93
+
94
+ # if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
95
+ # echo "Stage 3: Compute Fbank for custom dataset"
96
+ # # You can skip this step and use `--on-the-fly-feats 1` in training stage
97
+ # for subset in train dev; do
98
+ # python3 -m zipvoice.bin.compute_fbank \
99
+ # --source-dir data/manifests \
100
+ # --dest-dir data/fbank \
101
+ # --dataset custom-finetune \
102
+ # --subset ${subset} \
103
+ # --num-jobs ${nj}
104
+ # done
105
+ # fi
106
+
107
+ # # if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
108
+ # # echo "Stage 4: Download pre-trained model, tokens file, and model config"
109
+ # # # Uncomment this line to use HF mirror
110
+ # # # export HF_ENDPOINT=https://hf-mirror.com
111
+ # # hf_repo=k2-fsa/ZipVoice
112
+ # # mkdir -p ${download_dir}
113
+ # # for file in model.pt tokens.txt model.json; do
114
+ # # huggingface-cli download \
115
+ # # --local-dir ${download_dir} \
116
+ # # ${hf_repo} \
117
+ # # zipvoice/${file}
118
+ # # done
119
+ # # fi
120
+
121
+ # # ### Training ZipVoice (5 - 6)
122
+
123
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
124
+ echo "Stage 5: Fine-tune the ZipVoice model"
125
+
126
+ [ -z "$max_len" ] && { echo "Error: max_len is not set!" >&2; exit 1; }
127
+
128
+ python3 -m zipvoice.bin.train_zipvoice \
129
+ --world-size 1 \
130
+ --use-fp16 1 \
131
+ --finetune 1 \
132
+ --base-lr 0.00006 \
133
+ --num-epochs 2 \
134
+ --save-every-n 1000 \
135
+ --keep-last-k 4 \
136
+ --max-duration 650 \
137
+ --max-len ${max_len} \
138
+ --min-len 1 \
139
+ --model-config ${download_dir}/zipvoice/model.json \
140
+ --checkpoint ${download_dir}/zipvoice/model.pt \
141
+ --tokenizer ${tokenizer} \
142
+ --lang ${lang} \
143
+ --token-file ${download_dir}/zipvoice/tokens.txt \
144
+ --dataset custom \
145
+ --train-manifest data/fbank/train_all.jsonl.gz \
146
+ --dev-manifest data/fbank/dev_all.jsonl.gz \
147
+ --exp-dir exp/zipvoice_finetune
148
+
149
+ fi
150
+
151
+ # if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
152
+ # echo "Stage 6: Average the checkpoints for ZipVoice"
153
+ # python3 -m zipvoice.bin.generate_averaged_model \
154
+ # --iter 10000 \
155
+ # --avg 2 \
156
+ # --model-name zipvoice \
157
+ # --exp-dir exp/zipvoice_finetune
158
+ # # The generated model is exp/zipvoice_finetune/iter-10000-avg-2.pt
159
+ # fi
160
+
161
+ # ### Inference with PyTorch models (7)
162
+
163
+ # if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
164
+ # echo "Stage 7: Inference of the ZipVoice model"
165
+
166
+ # python3 -m zipvoice.bin.infer_zipvoice \
167
+ # --model-name zipvoice \
168
+ # --model-dir exp/zipvoice_finetune/ \
169
+ # --checkpoint-name iter-10000-avg-2.pt \
170
+ # --tokenizer ${tokenizer} \
171
+ # --lang ${lang} \
172
+ # --test-list test.tsv \
173
+ # --res-dir results/test_finetune\
174
+ # --num-step 16
175
+ # fi
egs/zipvoice/run_libritts.sh ADDED
@@ -0,0 +1,148 @@
1
+ #!/bin/bash
2
+
3
+ # This is an example script for training ZipVoice on the LibriTTS dataset.
4
+
5
+ # Add project root to PYTHONPATH
6
+ export PYTHONPATH=../../:$PYTHONPATH
7
+
8
+ # Set bash to 'debug' mode, it will exit on :
9
+ # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
10
+ set -e
11
+ set -u
12
+ set -o pipefail
13
+
14
+ stage=1
15
+ stop_stage=9
16
+
17
+ #### Prepare datasets (1)
18
+
19
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
20
+ echo "Stage 1: Data Preparation for LibriTTS dataset"
21
+ bash local/prepare_libritts.sh
22
+ fi
23
+
24
+ ### Training ZipVoice (2 - 3)
25
+
26
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
27
+ echo "Stage 2: Train the ZipVoice model"
28
+ python3 -m zipvoice.bin.train_zipvoice \
29
+ --world-size 8 \
30
+ --use-fp16 0 \
31
+ --num-epochs 60 \
32
+ --max-duration 250 \
33
+ --lr-epochs 10 \
34
+ --max-len 20 \
35
+ --valid-by-epoch 1 \
36
+ --model-config conf/zipvoice_base.json \
37
+ --tokenizer libritts \
38
+ --token-file data/tokens_libritts.txt \
39
+ --dataset libritts \
40
+ --manifest-dir data/fbank \
41
+ --exp-dir exp/zipvoice_libritts
42
+ fi
43
+
44
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
45
+ echo "Stage 3: Average the checkpoints for ZipVoice"
46
+ python3 -m zipvoice.bin.generate_averaged_model \
47
+ --epoch 60 \
48
+ --avg 10 \
49
+ --model-name zipvoice \
50
+ --exp-dir exp/zipvoice_libritts
51
+ # The generated model is exp/zipvoice_libritts/epoch-60-avg-10.pt
52
+ fi
53
+
54
+ #### (Optional) Training ZipVoice-Distill model (4 - 7)
55
+
56
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
57
+ echo "Stage 4: Train the ZipVoice-Distill model (first stage)"
58
+ python3 -m zipvoice.bin.train_zipvoice_distill \
59
+ --world-size 8 \
60
+ --use-fp16 0 \
61
+ --num-epochs 6 \
62
+ --max-duration 250 \
63
+ --base-lr 0.001 \
64
+ --max-len 20 \
65
+ --valid-by-epoch 1 \
66
+ --model-config conf/zipvoice_base.json \
67
+ --tokenizer libritts \
68
+ --token-file data/tokens_libritts.txt \
69
+ --dataset "libritts" \
70
+ --manifest-dir "data/fbank" \
71
+ --teacher-model exp/zipvoice_libritts/epoch-60-avg-10.pt \
72
+ --distill-stage "first" \
73
+ --exp-dir exp/zipvoice_distill_1stage_libritts
74
+ fi
75
+
76
+
77
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
78
+ echo "Stage 5: Average the checkpoints for ZipVoice-Distill (first stage)"
79
+ python3 -m zipvoice.bin.generate_averaged_model \
80
+ --epoch 6 \
81
+ --avg 3 \
82
+ --model-name zipvoice_distill \
83
+ --exp-dir exp/zipvoice_distill_1stage_libritts
84
+ # The generated model is exp/zipvoice_distill_1stage_libritts/epoch-6-avg-3.pt
85
+ fi
86
+
87
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
88
+ echo "Stage 6: Train the ZipVoice-Distill model (second stage)"
89
+
90
+ python3 -m zipvoice.bin.train_zipvoice_distill \
91
+ --world-size 8 \
92
+ --use-fp16 1 \
93
+ --num-epochs 6 \
94
+ --max-duration 250 \
95
+ --base-lr 0.001 \
96
+ --max-len 20 \
97
+ --valid-by-epoch 1 \
98
+ --model-config conf/zipvoice_base.json \
99
+ --tokenizer libritts \
100
+ --token-file data/tokens_libritts.txt \
101
+ --dataset libritts \
102
+ --manifest-dir data/fbank \
103
+ --teacher-model exp/zipvoice_distill_1stage_libritts/epoch-6-avg-3.pt \
104
+ --distill-stage second \
105
+ --exp-dir exp/zipvoice_distill_libritts
106
+ fi
107
+
108
+
109
+ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
110
+ echo "Stage 7: Average the checkpoints for ZipVoice-Distill (second stage)"
111
+ python3 -m zipvoice.bin.generate_averaged_model \
112
+ --epoch 6 \
113
+ --avg 3 \
114
+ --model-name zipvoice_distill \
115
+ --exp-dir exp/zipvoice_distill_libritts
116
+ # The generated model is exp/zipvoice_distill_libritts/epoch-6-avg-3.pt
117
+ fi
118
+
119
+ ### Inference with PyTorch models (8 - 9)
120
+
121
+ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
122
+ echo "Stage 8: Inference of the ZipVoice model"
123
+ python3 -m zipvoice.bin.infer_zipvoice \
124
+ --model-name zipvoice \
125
+ --model-dir exp/zipvoice_libritts \
126
+ --checkpoint-name epoch-60-avg-10.pt \
127
+ --tokenizer libritts \
128
+ --test-list test.tsv \
129
+ --res-dir results/test_libritts \
130
+ --num-step 8 \
131
+ --guidance-scale 1 \
132
+ --t-shift 0.7
133
+ fi
134
+
135
+
136
+ if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
137
+ echo "Stage 9: Inference of the ZipVoice-Distill model"
138
+ python3 -m zipvoice.bin.infer_zipvoice \
139
+ --model-name zipvoice_distill \
140
+ --model-dir exp/zipvoice_distill_libritts \
141
+ --checkpoint-name epoch-6-avg-3.pt \
142
+ --tokenizer libritts \
143
+ --test-list test.tsv \
144
+ --res-dir results/test_distill_libritts \
145
+ --num-step 4 \
146
+ --guidance-scale 3 \
147
+ --t-shift 0.7
148
+ fi
egs/zipvoice/utils/parse_options.sh ADDED
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
4
+ # Arnab Ghoshal, Karel Vesely
5
+
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13
+ # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14
+ # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15
+ # MERCHANTABLITY OR NON-INFRINGEMENT.
16
+ # See the Apache 2 License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+
20
+ # Parse command-line options.
21
+ # To be sourced by another script (as in ". parse_options.sh").
22
+ # Option format is: --option-name arg
23
+ # and shell variable "option_name" gets set to value "arg."
24
+ # The exception is --help, which takes no arguments, but prints the
25
+ # $help_message variable (if defined).
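+ # For example, if the sourcing script defines "stage=0" and "nj=20",
+ # invoking it as "./run.sh --stage 2 --nj 8" sets stage=2 and nj=8
+ # (variables must already be defined before this file is sourced).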
26
+
27
+
28
+ ###
29
+ ### The --config file options have lower priority to command line
30
+ ### options, so we need to import them first...
31
+ ###
32
+
33
+ # Now import all the configs specified by command-line, in left-to-right order
34
+ for ((argpos=1; argpos<$#; argpos++)); do
35
+ if [ "${!argpos}" == "--config" ]; then
36
+ argpos_plus1=$((argpos+1))
37
+ config=${!argpos_plus1}
38
+ [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
39
+ . $config # source the config file.
40
+ fi
41
+ done
42
+
43
+
44
+ ###
45
+ ### Now we process the command line options
46
+ ###
47
+ while true; do
48
+ [ -z "${1:-}" ] && break; # break if there are no arguments
49
+ case "$1" in
50
+ # If the enclosing script is called with --help option, print the help
51
+ # message and exit. Scripts should put help messages in $help_message
52
+ --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53
+ else printf "$help_message\n" 1>&2 ; fi;
54
+ exit 0 ;;
55
+ --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56
+ exit 1 ;;
57
+ # If the first command-line argument begins with "--" (e.g. --foo-bar),
58
+ # then work out the variable name as $name, which will equal "foo_bar".
59
+ --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
60
+ # Next we test whether the variable in question is undefined -- if so it's
61
+ # an invalid option and we die. Note: $0 evaluates to the name of the
62
+ # enclosing script.
63
+ # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64
+ # is undefined. We then have to wrap this test inside "eval" because
65
+ # foo_bar is itself inside a variable ($name).
66
+ eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67
+
68
+ oldval="`eval echo \\$$name`";
69
+ # Work out whether we seem to be expecting a Boolean argument.
70
+ if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
71
+ was_bool=true;
72
+ else
73
+ was_bool=false;
74
+ fi
75
+
76
+ # Set the variable to the right value-- the escaped quotes make it work if
77
+ # the option had spaces, like --cmd "queue.pl -sync y"
78
+ eval $name=\"$2\";
79
+
80
+ # Check that Boolean-valued arguments are really Boolean.
81
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
82
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
83
+ exit 1;
84
+ fi
85
+ shift 2;
86
+ ;;
87
+ *) break;
88
+ esac
89
+ done
90
+
91
+
92
+ # Check for an empty argument to the --cmd option, which can easily occur as a
93
+ # result of scripting errors.
94
+ [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
95
+
96
+
97
+ true; # so this script returns exit code 0.
egs/zipvoice/utils/validate_manifest.py ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang,
3
+ # Zengwei Yao)
4
+ #
5
+ # See ../../../../LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ This script checks the following assumptions of the generated manifest:
20
+
21
+ - Single supervision per cut
22
+
23
+ We will add more checks later if needed.
24
+
25
+ Usage example:
26
+
27
+ python3 ./utils/validate_manifest.py \
28
+ ./data/spectrogram/ljspeech_cuts_all.jsonl.gz
29
+
30
+ """
31
+
32
+ import argparse
33
+ import logging
34
+ from pathlib import Path
35
+
36
+ from lhotse import CutSet, load_manifest_lazy
37
+ from lhotse.dataset.speech_synthesis import validate_for_tts
38
+
39
+
40
+ def get_args():
41
+ parser = argparse.ArgumentParser()
42
+
43
+ parser.add_argument(
44
+ "manifest",
45
+ type=Path,
46
+ help="Path to the manifest file",
47
+ )
48
+
49
+ return parser.parse_args()
50
+
51
+
52
+ def main():
53
+ args = get_args()
54
+
55
+ manifest = args.manifest
56
+ logging.info(f"Validating {manifest}")
57
+
58
+ assert manifest.is_file(), f"{manifest} does not exist"
59
+ cut_set = load_manifest_lazy(manifest)
60
+ assert isinstance(cut_set, CutSet), type(cut_set)
61
+
62
+ validate_for_tts(cut_set)
63
+
64
+
65
+ if __name__ == "__main__":
66
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
67
+
68
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
69
+
70
+ main()
egs/zipvoice_dialog/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # ZipVoice-Dialog Recipe
2
+
3
+ This recipe contains the following examples:
4
+
5
+ - Training ZipVoice-Dialog on the OpenDialog dataset, see [run_opendialog.sh](run_opendialog.sh)
6
+ - Training ZipVoice-Dialog on custom datasets (Chinese/English), see [run_custom.sh](run_custom.sh).
7
+ - Fine-tuning pre-trained ZipVoice-Dialog on custom datasets (Chinese/English), see [run_finetune.sh](run_finetune.sh).
8
+ - Evaluating models with the objective metrics reported in the ZipVoice-Dialog paper, see [run_eval.sh](run_eval.sh).
9
+
10
+ > **NOTE:** For evaluation, first install packages from [../../requirements_eval.txt](../../requirements_eval.txt)
11
+ >
12
+ > `pip install -r ../../requirements_eval.txt`
egs/zipvoice_dialog/local/prepare_opendialog.py ADDED
@@ -0,0 +1,262 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2025 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ This script prepares lhotse manifest files from the raw OpenDialog datasets.
20
+
21
+ We assume that you have downloaded the OpenDialog dataset and untarred the
22
+ tar files in audio/en and audio/zh so that the mp3 files are placed under
23
+ these two directories.
24
+
25
+ Download OpenDialog at https://huggingface.co/datasets/k2-fsa/OpenDialog
26
+ or https://www.modelscope.cn/datasets/k2-fsa/OpenDialog
27
+
28
+ """
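+
+ # A minimal example invocation; the paths below are assumptions that match
+ # stage 1 of ../run_opendialog.sh, not requirements:
+ #
+ # python3 local/prepare_opendialog.py \
+ # --dataset-path download/OpenDialog \
+ # --num-jobs 20 \
+ # --output-dir data/manifests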
29
+
30
+ import argparse
31
+ import json
32
+ import logging
33
+ import math
34
+ import re
35
+ from concurrent.futures import ThreadPoolExecutor
36
+ from functools import partial
37
+ from pathlib import Path
38
+ from typing import List, Optional, Tuple
39
+
40
+ from lhotse import CutSet, validate_recordings_and_supervisions
41
+ from lhotse.audio import Recording, RecordingSet
42
+ from lhotse.cut import Cut
43
+ from lhotse.qa import fix_manifests
44
+ from lhotse.supervision import SupervisionSegment, SupervisionSet
45
+ from lhotse.utils import Pathlike
46
+ from tqdm.auto import tqdm
47
+
48
+
49
+ def get_args():
50
+ parser = argparse.ArgumentParser()
51
+
52
+ parser.add_argument(
53
+ "--dataset-path",
54
+ type=str,
55
+ help="The path of OpenDialog dataset.",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "--num-jobs",
60
+ type=int,
61
+ default=20,
62
+ help="Number of jobs for parallel processing.",
63
+ )
64
+
65
+ parser.add_argument(
66
+ "--output-dir",
67
+ type=str,
68
+ default="data/manifests",
69
+ help="The destination directory of manifest files.",
70
+ )
71
+ parser.add_argument(
72
+ "--sampling-rate",
73
+ type=int,
74
+ default=24000,
75
+ help="The target sampling rate.",
76
+ )
77
+ return parser.parse_args()
78
+
79
+
80
+ def _parse_recording(
81
+ wav_path: str,
82
+ ) -> Tuple[Recording, str]:
83
+ """
84
+ :param wav_path: Path to the audio file
85
+ :return: a tuple of "recording" and "recording_id"
86
+ """
87
+
88
+ recording_id = Path(wav_path).stem
89
+ recording = Recording.from_file(path=wav_path, recording_id=recording_id)
90
+
91
+ return recording, recording_id
92
+
93
+
94
+ def _parse_supervision(
95
+ supervision: List, recording_dict: dict
96
+ ) -> Optional[SupervisionSegment]:
97
+ """
98
+ :param supervision: A (uniq_id, text, wav_path, start, end) tuple from the manifest
99
+ :param recording_dict: Dictionary mapping recording IDs to Recording objects
100
+ :return: A SupervisionSegment object
101
+ """
102
+
103
+ def _round_down(num, ndigits=0):
104
+ factor = 10**ndigits
105
+ return math.floor(num * factor) / factor
106
+
107
+ uniq_id, text, wav_path, start, end = supervision
108
+ try:
109
+ recording_id = Path(wav_path).stem
110
+
111
+ recording = recording_dict[recording_id]
112
+ duration = (
113
+ _round_down(end - start, ndigits=8)
114
+ if end is not None
115
+ else _round_down(recording.duration, ndigits=8)
116
+ )
117
+ assert duration <= recording.duration, f"Duration {duration} is greater than " \
118
+ f"recording duration {recording.duration}"
119
+
120
+ text = re.sub("_", " ", text) # "_" is treated as padding symbol
121
+ text = re.sub(r"\s+", " ", text) # remove extra whitespace
122
+
123
+ return SupervisionSegment(
124
+ id=f"{uniq_id}",
125
+ recording_id=recording.id,
126
+ start=start,
127
+ duration=duration,
128
+ channel=recording.channel_ids,
129
+ text=text.strip(),
130
+ )
131
+ except Exception as e:
132
+ logging.warning(f"Error processing supervision: {e}")
133
+ return None
134
+
135
+
136
+ def prepare_subset(
137
+ jsonl_path: Pathlike,
138
+ lang: str,
139
+ sampling_rate: int,
140
+ num_jobs: int,
141
+ output_dir: Pathlike,
142
+ ):
143
+ """
144
+ Returns the manifests which consist of the Recordings and Supervisions
145
+
146
+ :param jsonl_path: Path to the jsonl file
147
+ :param lang: Language of the subset
148
+ :param sampling_rate: Target sampling rate of the audio
149
+ :param num_jobs: Number of processes for parallel processing
150
+ :param output_dir: Path where to write the manifests
151
+ """
152
+ logging.info(f"Preparing {lang} subset")
153
+
154
+ # Step 1: Read all unique recording paths
155
+ logging.info(f"Reading {jsonl_path}")
156
+ recordings_path_set = set()
157
+ supervision_list = list()
158
+ with open(jsonl_path, "r") as fr:
159
+ for line in fr:
160
+ try:
161
+ items = json.loads(line)
162
+ uniq_id, text, wav_path = items["id"], items["text"], items["path"]
163
+ start, end = 0, None
164
+ recordings_path_set.add(jsonl_path.parent / wav_path)
165
+ supervision_list.append((uniq_id, text, wav_path, start, end))
166
+ except Exception as e:
167
+ logging.warning(f"Error {e} when decoding JSON line: {line}")
168
+ continue
169
+ logging.info("Starting to process recordings...")
170
+ # Step 2: Process recordings
171
+ futures = []
172
+ recording_dict = {}
173
+ with ThreadPoolExecutor(max_workers=num_jobs) as ex:
174
+ for wav_path in tqdm(recordings_path_set, desc="Submitting jobs"):
175
+ futures.append(ex.submit(_parse_recording, wav_path))
176
+
177
+ for future in tqdm(futures, desc="Processing recordings"):
178
+ try:
179
+ recording, recording_id = future.result()
180
+ recording_dict[recording_id] = recording
181
+ except Exception as e:
182
+ logging.warning(
183
+ f"Error processing a recording: {e}"
184
+ )
185
+
186
+ recording_set = RecordingSet.from_recordings(recording_dict.values())
187
+
188
+ logging.info("Starting to process supervisions...")
189
+ # Step 3: Process supervisions
190
+ supervisions = []
191
+ for supervision in tqdm(supervision_list, desc="Processing supervisions"):
192
+ seg = _parse_supervision(supervision, recording_dict)
193
+ if seg is not None:
194
+ supervisions.append(seg)
195
+
196
+ logging.info("Processing Cuts...")
197
+
198
+ # Step 4: Create and validate manifests
199
+ supervision_set = SupervisionSet.from_segments(supervisions)
200
+
201
+ recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
202
+ validate_recordings_and_supervisions(recording_set, supervision_set)
203
+
204
+ cut_set = CutSet.from_manifests(
205
+ recordings=recording_set, supervisions=supervision_set
206
+ )
207
+ cut_set = cut_set.sort_by_recording_id()
208
+ if sampling_rate != 24000:
209
+ # All OpenDialog audios are 24kHz
210
+ cut_set = cut_set.resample(sampling_rate)
211
+ cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
212
+
213
+ logging.info("Saving cuts to disk...")
214
+ # Step 5: Write manifests to disk
215
+ cut_set.to_file(output_dir / f"opendialog_cuts_raw_{lang.upper()}-all.jsonl.gz")
216
+ dev_cut_set = cut_set.subset(first=1000)
217
+ dev_cut_set.to_file(output_dir / f"opendialog_cuts_raw_{lang.upper()}-dev.jsonl.gz")
218
+
219
+ def remove_dev(c: Cut, dev_ids: set) -> bool:
220
+ # keep only cuts whose ids are not in the dev subset
221
+ return c.id not in dev_ids
223
+
224
+ _remove_dev = partial(remove_dev, dev_ids=set(dev_cut_set.ids))
225
+ train_cut_set = cut_set.filter(_remove_dev)
226
+ train_cut_set.to_file(
227
+ output_dir / f"opendialog_cuts_raw_{lang.upper()}-train.jsonl.gz"
228
+ )
229
+
230
+
231
+ def prepare_dataset(
232
+ dataset_path: Pathlike,
233
+ sampling_rate: int,
234
+ num_jobs: int,
235
+ output_dir: Pathlike,
236
+ ):
237
+ for lang in ["en", "zh"]:
238
+ jsonl_path = dataset_path / f"manifest.{lang}.jsonl"
239
+ prepare_subset(
240
+ jsonl_path=jsonl_path,
241
+ lang=lang,
242
+ sampling_rate=sampling_rate,
243
+ num_jobs=num_jobs,
244
+ output_dir=output_dir,
245
+ )
246
+
247
+
248
+ if __name__ == "__main__":
249
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
250
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
251
+
252
+ args = get_args()
253
+ dataset_path = Path(args.dataset_path)
254
+ output_dir = Path(args.output_dir)
255
+ output_dir.mkdir(parents=True, exist_ok=True)
256
+
257
+ prepare_dataset(
258
+ dataset_path=dataset_path,
259
+ sampling_rate=args.sampling_rate,
260
+ num_jobs=args.num_jobs,
261
+ output_dir=output_dir,
262
+ )
egs/zipvoice_dialog/run_custom.sh ADDED
@@ -0,0 +1,145 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of training ZipVoice-Dialog on your custom datasets.
4
+ # Only English and Chinese are supported for now.
5
+
6
+ # Add project root to PYTHONPATH
7
+ export PYTHONPATH=../../:$PYTHONPATH
8
+
9
+ # Make bash strict: exit on error (-e), on undefined variables (-u),
10
+ # and on failures anywhere in a pipeline (-o pipefail).
11
+ set -e
12
+ set -u
13
+ set -o pipefail
14
+
15
+ stage=1
16
+ stop_stage=6
17
+
18
+ # Number of jobs for data preparation
19
+ nj=20
20
+ download_dir=download/
21
+
22
+ # Maximum length (seconds) of a training utterance; longer utterances will be filtered out
23
+ max_len=60
24
+
25
+ # We assume you have two TSV files: "data/raw/custom_train.tsv" and
26
+ # "data/raw/custom_dev.tsv", where "custom" is your dataset name,
27
+ # "train"/"dev" are used for training and validation respectively.
28
+
29
+ # Each line of the TSV files should be in one of the following formats:
30
+ # (1) `{uniq_id}\t{text}\t{wav_path}` if the text corresponds to the full wav,
31
+ # (2) `{uniq_id}\t{text}\t{wav_path}\t{start_time}\t{end_time}` if text corresponds
32
+ # to part of the wav. The start_time and end_time specify the start and end
33
+ # times of the text within the wav, which should be in seconds.
34
+ # > Note: {uniq_id} must be unique for each line.
35
+ # > Note: {text} uses [S1] and [S2] tags to distinguish speakers, and must begin with [S1].
36
+ # > eg: "[S1] Hello. [S2] How are you? [S1] I'm fine. [S2] What's your name?"
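+ # An example line in format (1), with a hypothetical id and wav path:
+ # > "dlg_0001\t[S1] Hello. [S2] Hi, nice to meet you.\twavs/dlg_0001.wav"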
37
+ for subset in train dev;do
38
+ file_path=data/raw/custom_${subset}.tsv
39
+ [ -f "$file_path" ] || { echo "Error: expect $file_path !" >&2; exit 1; }
40
+ done
41
+
42
+
43
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
44
+ echo "Stage 1: Prepare manifests for custom dataset from tsv files"
45
+
46
+ for subset in train dev;do
47
+ python3 -m zipvoice.bin.prepare_dataset \
48
+ --tsv-path data/raw/custom_${subset}.tsv \
49
+ --prefix custom \
50
+ --subset raw_${subset} \
51
+ --num-jobs ${nj} \
52
+ --output-dir data/manifests
53
+ done
54
+ # The output manifest files are "data/manifests/custom_cuts_raw_train.jsonl.gz".
55
+ # and "data/manifests/custom_cuts_raw_dev.jsonl.gz".
56
+ fi
57
+
58
+
59
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
60
+ echo "Stage 2: Add tokens to manifests"
61
+ for subset in train dev;do
62
+ python3 -m zipvoice.bin.prepare_tokens \
63
+ --input-file data/manifests/custom_cuts_raw_${subset}.jsonl.gz \
64
+ --output-file data/manifests/custom_cuts_${subset}.jsonl.gz \
65
+ --tokenizer dialog
66
+ done
67
+ # The output manifest files are "data/manifests/custom_cuts_train.jsonl.gz".
68
+ # and "data/manifests/custom_cuts_dev.jsonl.gz".
69
+ fi
70
+
71
+ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
72
+ echo "Stage 3: Compute Fbank for custom dataset"
73
+ # You can skip this step and use `--on-the-fly-feats 1` in training stage
74
+ for subset in train dev; do
75
+ python3 -m zipvoice.bin.compute_fbank \
76
+ --source-dir data/manifests \
77
+ --dest-dir data/fbank \
78
+ --dataset custom \
79
+ --subset ${subset} \
80
+ --num-jobs ${nj}
81
+ done
82
+ fi
83
+
84
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
85
+ echo "Stage 4: Download tokens file, pretrained models"
86
+ # Uncomment this line to use HF mirror
87
+ # export HF_ENDPOINT=https://hf-mirror.com
88
+
89
+ # The token file is obtained by extending some tokens
90
+ # on the basis of the Emilia token file.
91
+ mkdir -p ${download_dir}
92
+ hf_repo=k2-fsa/ZipVoice
93
+ huggingface-cli download \
94
+ --local-dir ${download_dir} \
95
+ ${hf_repo} \
96
+ zipvoice_dialog/tokens.txt
97
+
98
+ # Pre-trained ZipVoice model is required as
99
+ # the initialization model.
100
+ for file in model.pt tokens.txt model.json; do
101
+ huggingface-cli download \
102
+ --local-dir ${download_dir} \
103
+ ${hf_repo} \
104
+ zipvoice/${file}
105
+ done
106
+ fi
107
+
108
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
109
+ echo "Stage 5: Train the ZipVoice-Dialog model"
110
+ python3 -m zipvoice.bin.train_zipvoice_dialog \
111
+ --world-size 4 \
112
+ --use-fp16 1 \
113
+ --base-lr 0.0001 \
114
+ --num-iters 60000 \
115
+ --max-duration 500 \
116
+ --max-len ${max_len} \
117
+ --checkpoint ${download_dir}/zipvoice/model.pt \
118
+ --model-config ${download_dir}/zipvoice/model.json \
119
+ --token-file ${download_dir}/zipvoice_dialog/tokens.txt \
120
+ --dataset custom \
121
+ --train-manifest data/fbank/custom_cuts_train.jsonl.gz \
122
+ --dev-manifest data/fbank/custom_cuts_dev.jsonl.gz \
123
+ --exp-dir exp/zipvoice_dialog_custom
124
+ fi
125
+
126
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
127
+ echo "Stage 6: Average the checkpoints for ZipVoice-Dialog"
128
+ python3 -m zipvoice.bin.generate_averaged_model \
129
+ --iter 60000 \
130
+ --avg 2 \
131
+ --model-name zipvoice_dialog \
132
+ --exp-dir exp/zipvoice_dialog_custom
133
+ # The generated model is exp/zipvoice_dialog_custom/iter-60000-avg-2.pt
134
+ fi
135
+
136
+
137
+ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
138
+ echo "Stage 7: Inference of the ZipVoice-Dialog model"
139
+ python3 -m zipvoice.bin.infer_zipvoice_dialog \
140
+ --model-name zipvoice_dialog \
141
+ --model-dir exp/zipvoice_dialog_custom \
142
+ --checkpoint-name iter-60000-avg-2.pt \
143
+ --test-list test.tsv \
144
+ --res-dir results/test_dialog_custom
145
+ fi
egs/zipvoice_dialog/run_eval.sh ADDED
@@ -0,0 +1,120 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of evaluating TTS models with the objective metrics reported in the ZipVoice-Dialog paper.
4
+
5
+ # Add project root to PYTHONPATH
6
+ export PYTHONPATH=../../:$PYTHONPATH
7
+
8
+ # Make bash strict: exit on error (-e), on undefined variables (-u),
9
+ # and on failures anywhere in a pipeline (-o pipefail).
10
+ set -e
11
+ set -u
12
+ set -o pipefail
13
+
14
+ stage=1
15
+ stop_stage=6
16
+
17
+ download_dir=download/
18
+
19
+ # Uncomment this line to use HF mirror
20
+ # export HF_ENDPOINT=https://hf-mirror.com
21
+
22
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
23
+ echo "Stage 1: Download test sets (test-dialog)"
24
+ hf_repo=k2-fsa/TTS_eval_datasets
25
+ mkdir -p ${download_dir}/
26
+ file=dialog_testset.tar.gz
27
+ echo "Downloading ${file}..."
28
+ huggingface-cli download \
29
+ --repo-type dataset \
30
+ --local-dir ${download_dir}/ \
31
+ ${hf_repo} \
32
+ ${file}
33
+ echo "Extracting ${file}..."
34
+ tar -xzf ${download_dir}/${file} -C ${download_dir}/
35
+ fi
36
+
37
+
38
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
39
+ echo "Stage 2: Download all required evaluation models"
40
+ mkdir -p ${download_dir}/tts_eval_models
41
+ hf_repo=k2-fsa/TTS_eval_models  # the evaluation models live in a separate repo
42
+ huggingface-cli download \
43
+ --local-dir ${download_dir}/tts_eval_models \
44
+ ${hf_repo}
45
+ fi
46
+
47
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
48
+ echo "Stage 3: Inference with the pre-trained ZipVoice-Dialog model from HuggingFace"
49
+
50
+ for testset in test_dialog_en test_dialog_zh; do
51
+ if [ "$testset" = "test_dialog_en" ]; then
52
+ test_tsv=${download_dir}/dialog_testset/en/test.tsv
53
+ elif [ "$testset" = "test_dialog_zh" ]; then
54
+ test_tsv=${download_dir}/dialog_testset/zh/test.tsv
55
+ else
56
+ echo "Error: unknown testset ${testset}" >&2
57
+ exit 1
58
+ fi
59
+ echo "Inference on testset ${testset}..."
60
+ python3 -m zipvoice.bin.infer_zipvoice_dialog \
61
+ --model-name zipvoice_dialog \
62
+ --test-list ${test_tsv} \
63
+ --res-dir results/${testset}
64
+ done
65
+ fi
66
+
67
+
68
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
69
+ echo "Stage 4: Evaluation on test-dialog-en"
70
+ model_path=${download_dir}/tts_eval_models
71
+ wav_path=results/test_dialog_en
72
+ test_tsv=${download_dir}/dialog_testset/en/test.tsv
73
+
74
+ python3 -m zipvoice.eval.speaker_similarity.cpsim \
75
+ --wav-path ${wav_path} \
76
+ --test-list ${test_tsv} \
77
+ --model-dir ${model_path}
78
+
79
+ python3 -m zipvoice.eval.wer.dialog \
80
+ --wav-path ${wav_path} \
81
+ --test-list ${test_tsv} \
82
+ --model-dir ${model_path} \
83
+ --lang en
84
+
85
+ # cpWER mode: only computes WER and cpWER
86
+ # for speech shorter than 30 s
87
+ python3 -m zipvoice.eval.wer.dialog \
88
+ --wav-path ${wav_path} \
89
+ --test-list ${test_tsv} \
90
+ --model-dir ${model_path} \
91
+ --lang en \
92
+ --cpwer
93
+
94
+ python3 -m zipvoice.eval.mos.utmos \
95
+ --wav-path ${wav_path} \
96
+ --model-dir ${model_path}
97
+ fi
98
+
99
+
100
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
101
+ echo "Stage 5: Evaluation on test-dialog-zh"
102
+ model_path=${download_dir}/tts_eval_models
103
+ wav_path=results/test_dialog_zh
104
+ test_tsv=${download_dir}/dialog_testset/zh/test.tsv
105
+
106
+ python3 -m zipvoice.eval.speaker_similarity.cpsim \
107
+ --wav-path ${wav_path} \
108
+ --test-list ${test_tsv} \
109
+ --model-dir ${model_path}
110
+
111
+ python3 -m zipvoice.eval.wer.dialog \
112
+ --wav-path ${wav_path} \
113
+ --test-list ${test_tsv} \
114
+ --model-dir ${model_path} \
115
+ --lang zh
116
+
117
+ python3 -m zipvoice.eval.mos.utmos \
118
+ --wav-path ${wav_path} \
119
+ --model-dir ${model_path}
120
+ fi
egs/zipvoice_dialog/run_finetune.sh ADDED
@@ -0,0 +1,135 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of fine-tuning the pre-trained ZipVoice-Dialog model on your custom datasets.
4
+ # Only English and Chinese are supported for now.
5
+
6
+ # Add project root to PYTHONPATH
7
+ export PYTHONPATH=../../:$PYTHONPATH
8
+
9
+ # Make bash strict: exit on error (-e), on undefined variables (-u),
10
+ # and on failures anywhere in a pipeline (-o pipefail).
11
+ set -e
12
+ set -u
13
+ set -o pipefail
14
+
15
+ stage=1
16
+ stop_stage=6
17
+
18
+ # Number of jobs for data preparation
19
+ nj=20
20
+ # Maximum length (seconds) of a training utterance; longer utterances will be filtered out
21
+ max_len=60
22
+ download_dir=download/
23
+
24
+ # We assume you have two TSV files: "data/raw/custom_train.tsv" and
25
+ # "data/raw/custom_dev.tsv", where "custom" is your dataset name,
26
+ # "train"/"dev" are used for training and validation respectively.
27
+
28
+ # Each line of the TSV files should be in one of the following formats:
29
+ # (1) `{uniq_id}\t{text}\t{wav_path}` if the text corresponds to the full wav,
30
+ # (2) `{uniq_id}\t{text}\t{wav_path}\t{start_time}\t{end_time}` if text corresponds
31
+ # to part of the wav. The start_time and end_time specify the start and end
32
+ # times of the text within the wav, which should be in seconds.
33
+ # > Note: {uniq_id} must be unique for each line.
34
+ # > Note: {text} uses [S1] and [S2] tags to distinguish speakers, and must begin with [S1].
35
+ # > eg: "[S1] Hello. [S2] How are you? [S1] I'm fine. [S2] What's your name?"
36
+ for subset in train dev;do
37
+ file_path=data/raw/custom_${subset}.tsv
38
+ [ -f "$file_path" ] || { echo "Error: expect $file_path !" >&2; exit 1; }
39
+ done
40
+
41
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
42
+ echo "Stage 1: Prepare manifests for custom dataset from tsv files"
43
+
44
+ for subset in train dev;do
45
+ python3 -m zipvoice.bin.prepare_dataset \
46
+ --tsv-path data/raw/custom_${subset}.tsv \
47
+ --prefix custom-finetune \
48
+ --subset raw_${subset} \
49
+ --num-jobs ${nj} \
50
+ --output-dir data/manifests
51
+ done
52
+ # The output manifest files are "data/manifests/custom-finetune_cuts_raw_train.jsonl.gz".
53
+ # and "data/manifests/custom-finetune_cuts_raw_dev.jsonl.gz".
54
+ fi
55
+
56
+
57
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
58
+ echo "Stage 2: Add tokens to manifests"
59
+ for subset in train dev;do
60
+ python3 -m zipvoice.bin.prepare_tokens \
61
+ --input-file data/manifests/custom-finetune_cuts_raw_${subset}.jsonl.gz \
62
+ --output-file data/manifests/custom-finetune_cuts_${subset}.jsonl.gz \
63
+ --tokenizer dialog
64
+ done
65
+ # The output manifest files are "data/manifests/custom-finetune_cuts_train.jsonl.gz".
66
+ # and "data/manifests/custom-finetune_cuts_dev.jsonl.gz".
67
+ fi
68
+
69
+ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
70
+ echo "Stage 3: Compute Fbank for custom dataset"
71
+ # You can skip this step and use `--on-the-fly-feats 1` in training stage
72
+ for subset in train dev; do
73
+ python3 -m zipvoice.bin.compute_fbank \
74
+ --source-dir data/manifests \
75
+ --dest-dir data/fbank \
76
+ --dataset custom-finetune \
77
+ --subset ${subset} \
78
+ --num-jobs ${nj}
79
+ done
80
+ fi
81
+
82
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
83
+ echo "Stage 4: Download pre-trained model, tokens file, and model config"
84
+ # Uncomment this line to use HF mirror
85
+ # export HF_ENDPOINT=https://hf-mirror.com
86
+
87
+ mkdir -p ${download_dir}
88
+ hf_repo=k2-fsa/ZipVoice
89
+ for file in model.pt tokens.txt model.json; do
90
+ huggingface-cli download \
91
+ --local-dir ${download_dir} \
92
+ ${hf_repo} \
93
+ zipvoice_dialog/${file}
94
+ done
95
+ fi
96
+
97
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
98
+ echo "Stage 5: Fine-tune the ZipVoice-Dialog model"
99
+ python3 -m zipvoice.bin.train_zipvoice_dialog \
100
+ --world-size 4 \
101
+ --use-fp16 1 \
102
+ --finetune 1 \
103
+ --base-lr 0.0001 \
104
+ --num-iters 10000 \
105
+ --save-every-n 1000 \
106
+ --max-duration 500 \
107
+ --max-len ${max_len} \
108
+ --checkpoint ${download_dir}/zipvoice_dialog/model.pt \
109
+ --model-config ${download_dir}/zipvoice_dialog/model.json \
110
+ --token-file ${download_dir}/zipvoice_dialog/tokens.txt \
111
+ --dataset custom \
112
+ --train-manifest data/fbank/custom-finetune_cuts_train.jsonl.gz \
113
+ --dev-manifest data/fbank/custom-finetune_cuts_dev.jsonl.gz \
114
+ --exp-dir exp/zipvoice_dialog_finetune
115
+ fi
116
+
117
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
118
+ echo "Stage 6: Average the checkpoints for ZipVoice-Dialog"
119
+ python3 -m zipvoice.bin.generate_averaged_model \
120
+ --iter 10000 \
121
+ --avg 2 \
122
+ --model-name zipvoice_dialog \
123
+ --exp-dir exp/zipvoice_dialog_finetune
124
+ # The generated model is exp/zipvoice_dialog_finetune/iter-10000-avg-2.pt
125
+ fi
126
+
127
+ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
128
+ echo "Stage 7: Inference of the ZipVoice-Dialog model"
129
+ python3 -m zipvoice.bin.infer_zipvoice_dialog \
130
+ --model-name zipvoice_dialog \
131
+ --model-dir exp/zipvoice_dialog_finetune \
132
+ --checkpoint-name iter-10000-avg-2.pt \
133
+ --test-list test.tsv \
134
+ --res-dir results/test_dialog_finetune
135
+ fi
egs/zipvoice_dialog/run_opendialog.sh ADDED
@@ -0,0 +1,122 @@
1
+ #!/bin/bash
2
+
3
+ # This script is an example of training ZipVoice-Dialog on the OpenDialog dataset.
4
+
5
+ # Add project root to PYTHONPATH
6
+ export PYTHONPATH=../../:$PYTHONPATH
7
+
8
+ # Make bash strict: exit on error (-e), on undefined variables (-u),
9
+ # and on failures anywhere in a pipeline (-o pipefail).
10
+ set -e
11
+ set -u
12
+ set -o pipefail
13
+
14
+ stage=1
15
+ stop_stage=6
16
+
17
+ # Number of jobs for data preparation
18
+ nj=20
19
+
20
+ # We assume that you have downloaded the OpenDialog dataset
21
+ # to download/OpenDialog and untarred the tar files in audio/en
22
+ # and audio/zh so that the mp3 files are placed under these two directories.
23
+
24
+ # Download OpenDialog at https://huggingface.co/datasets/k2-fsa/OpenDialog
25
+ # or https://www.modelscope.cn/datasets/k2-fsa/OpenDialog
26
+ data_dir=download/OpenDialog
27
+ download_dir=download/
28
+
29
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
30
+ echo "Stage 1: Prepare manifests for OpenDialog dataset"
31
+
32
+ python3 local/prepare_opendialog.py \
33
+ --dataset-path ${data_dir} \
34
+ --num-jobs ${nj} \
35
+ --output-dir data/manifests
36
+ fi
37
+
38
+
39
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
40
+ echo "Stage 2: Add tokens to manifests"
41
+ for subset in ZH-dev ZH-train EN-dev EN-train;do
42
+ python3 -m zipvoice.bin.prepare_tokens \
43
+ --input-file data/manifests/opendialog_cuts_raw_${subset}.jsonl.gz \
44
+ --output-file data/manifests/opendialog_cuts_${subset}.jsonl.gz \
45
+ --tokenizer dialog
46
+ done
47
+ fi
48
+
49
+
50
+ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
51
+ echo "Stage 3: Compute Fbank for the OpenDialog dataset"
52
+ # You can skip this step and use `--on-the-fly-feats 1` in training stage
53
+ for subset in ZH-dev ZH-train EN-dev EN-train;do
54
+ python3 -m zipvoice.bin.compute_fbank \
55
+ --source-dir data/manifests \
56
+ --dest-dir data/fbank \
57
+ --dataset opendialog \
58
+ --subset ${subset} \
59
+ --num-jobs ${nj}
60
+ done
61
+ fi
62
+
63
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
64
+ echo "Stage 4: Download tokens file, pretrained models"
65
+ # Uncomment this line to use HF mirror
66
+ # export HF_ENDPOINT=https://hf-mirror.com
67
+
68
+ # The token file is obtained by extending some tokens
69
+ # on the basis of the Emilia token file.
70
+ mkdir -p ${download_dir}
71
+ hf_repo=k2-fsa/ZipVoice
72
+ huggingface-cli download \
73
+ --local-dir ${download_dir} \
74
+ ${hf_repo} \
75
+ zipvoice_dialog/tokens.txt
76
+
77
+ # Pre-trained ZipVoice model is required as
78
+ # the initialization model.
79
+ for file in model.pt tokens.txt model.json; do
80
+ huggingface-cli download \
81
+ --local-dir ${download_dir} \
82
+ ${hf_repo} \
83
+ zipvoice/${file}
84
+ done
85
+ fi
86
+
87
+
88
+ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
89
+ echo "Stage 5: Train the ZipVoice-Dialog model"
90
+ python3 -m zipvoice.bin.train_zipvoice_dialog \
91
+ --world-size 8 \
92
+ --use-fp16 1 \
93
+ --base-lr 0.0001 \
94
+ --max-duration 500 \
95
+ --checkpoint ${download_dir}/zipvoice/model.pt \
96
+ --model-config ${download_dir}/zipvoice/model.json \
97
+ --token-file ${download_dir}/zipvoice_dialog/tokens.txt \
98
+ --dataset opendialog \
99
+ --manifest-dir data/fbank \
100
+ --exp-dir exp/zipvoice_dialog_opendialog
101
+ fi
102
+
103
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
104
+ echo "Stage 6: Average the checkpoints for ZipVoice-Dialog"
105
+ python3 -m zipvoice.bin.generate_averaged_model \
106
+ --iter 60000 \
107
+ --avg 2 \
108
+ --model-name zipvoice_dialog \
109
+ --exp-dir exp/zipvoice_dialog_opendialog
110
+ # The generated model is exp/zipvoice_dialog_opendialog/iter-60000-avg-2.pt
111
+ fi
112
+
113
+ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
114
+ echo "Stage 7: Inference of the ZipVoice-Dialog model"
115
+
116
+ python3 -m zipvoice.bin.infer_zipvoice_dialog \
117
+ --model-name zipvoice_dialog \
118
+ --model-dir exp/zipvoice_dialog_opendialog \
119
+ --checkpoint-name iter-60000-avg-2.pt \
120
+ --test-list test.tsv \
121
+ --res-dir results/test_dialog
122
+ fi
infer.py ADDED
@@ -0,0 +1,578 @@
1
+ from typing import List, Dict, Tuple
2
+ import torch
3
+ from transformers import (
4
+ AutoTokenizer, AutoModelForTokenClassification,
5
+ )
7
+ LABEL_LIST = ["O", "B-EN", "I-EN"]
8
+ LABEL2ID = {l:i for i,l in enumerate(LABEL_LIST)}
9
+ ID2LABEL = {i:l for l,i in LABEL2ID.items()}
10
+
11
+ model_name = "meandyou200175/detect_english"
12
+ model_detect = AutoModelForTokenClassification.from_pretrained(
13
+ model_name, num_labels=len(LABEL_LIST),
14
+ id2label=ID2LABEL, label2id=LABEL2ID
15
+ )
16
+ tokenizer_detect = AutoTokenizer.from_pretrained(model_name, use_fast=True)
17
+
18
+ def tokens_to_pred_spans(offsets: List[Tuple[int,int]], pred_ids: List[int]) -> List[Tuple[int,int]]:
19
+ spans=[]; cur=None
20
+ for (start,end), lid in zip(offsets, pred_ids):
21
+ if start==end: continue
22
+ lab = ID2LABEL.get(lid,"O")
23
+ if lab=="B-EN":
24
+ if cur: spans.append(cur)
25
+ cur=[start,end]
26
+ elif lab=="I-EN":
27
+ if cur: cur[1]=end
28
+ else: cur=[start,end]
29
+ else:
30
+ if cur: spans.append(cur); cur=None
31
+ if cur: spans.append(cur)
32
+ return [tuple(x) for x in spans]
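+ # Illustrative example: offsets [(0, 5), (6, 8), (9, 11)] with predicted
+ # labels [B-EN, I-EN, O] yield the single character span (0, 8).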
33
+
34
+ def merge_close_spans(spans: List[Dict], max_gap: int = 2) -> List[Dict]:
35
+ if not spans:
36
+ return []
37
+ merged = [spans[0]]
38
+ for cur in spans[1:]:
39
+ prev = merged[-1]
40
+ if cur["start"] - prev["end"] <= max_gap:
41
+ # merge with the previous span
42
+ prev["end"] = cur["end"]
43
+ else:
44
+ merged.append(cur)
45
+ return merged
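+ # Illustrative example: [{"start": 0, "end": 5}, {"start": 6, "end": 9}]
+ # with max_gap=2 merges into [{"start": 0, "end": 9}].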
46
+
47
+
48
+ def infer_spans(text: str, tokenizer, model, max_length: int = 256) -> List[Dict]:
49
+ text = text.lower()
50
+ enc = tokenizer(text, return_offsets_mapping=True, truncation=True,
51
+ max_length=max_length, return_tensors="pt")
52
+ offsets = enc["offset_mapping"][0].tolist()
53
+ with torch.no_grad():
54
+ out = model(**{k: v for k, v in enc.items() if k != "offset_mapping"})
55
+ pred_ids = out.logits.argmax(-1)[0].tolist()
56
+ spans = tokens_to_pred_spans(offsets, pred_ids)
57
+ spans = [{"start": s, "end": e} for (s, e) in spans]
58
+ spans = merge_close_spans(spans, max_gap=2)
59
+ # print(spans)
60
+ return spans
61
+
62
+ import unicodedata
63
+
64
+ def is_letter(ch: str) -> bool:
65
+ if not ch:
66
+ return False
67
+ # If the caller accidentally passes a combining sequence (e.g. "e" + a combining accent), normalize to NFC:
68
+ ch = unicodedata.normalize("NFC", ch)
69
+ # Accept exactly one character after normalization
70
+ if len(ch) != 1:
71
+ return False
72
+ # Unicode letter categories 'L*': Lu, Ll, Lt, Lm, Lo
73
+ return unicodedata.category(ch).startswith('L')
74
+
75
+ import re
76
+ from itertools import chain
77
+ from typing import List, Dict, Optional
78
+ import logging
79
+ from functools import reduce
80
+ from piper_phonemize import phonemize_espeak
81
+
82
+ class EspeakTokenizer():
83
+ """A tokenizer with an Espeak g2p function, supporting English + Vietnamese."""
84
+
85
+ def __init__(self, token_file: Optional[str] = None, lang: str = "vi",
86
+ tokenizer=None, model=None):
87
+ self.has_tokens = False
88
+ self.lang = lang
89
+ self.detector_tokenizer = tokenizer
90
+ self.detector_model = model
91
+
92
+ if token_file is None:
93
+ logging.debug("Initialize Tokenizer without tokens file, "
94
+ "will fail when map to ids.")
95
+ return
96
+
97
+ self.token2id: Dict[str, int] = {}
98
+ with open(token_file, "r", encoding="utf-8") as f:
99
+ for line in f.readlines():
100
+ info = line.rstrip().split("\t")
101
+ token, id = info[0], int(info[1])
102
+ assert token not in self.token2id, token
103
+ self.token2id[token] = id
104
+ self.pad_id = self.token2id["_"]
105
+ self.vocab_size = len(self.token2id)
106
+ self.has_tokens = True
107
+
108
+ @staticmethod
109
+ def _flatten(phs):
110
+ """Flatten a list-of-lists (or return the list unchanged if it is already flat)."""
111
+ if not phs:
112
+ return []
113
+ if isinstance(phs[0], (list, tuple)):
114
+ return list(chain.from_iterable(phs))
115
+ return list(phs)
116
+
117
+ def g2p_chunk(self, text: str, lang: str):
118
+ tokens = []
119
+ start = 0
120
+ for t in text:
121
+ if is_letter(t):
122
+ break
123
+ start = start + 1
124
+
125
+ # Keep as-is: whitespace (\s+), words (\w+), other characters ([^\w\s])
126
+ if start > 0 :
127
+ tokens.extend(self._flatten(text[0:start]))
128
+ phs = phonemize_espeak(text[start:], lang) # may return a list-of-lists
129
+ tokens.extend(self._flatten(phs))
130
+ return tokens
131
+
132
+ def g2p(self, text: str) -> List[str]:
133
+ """Split text into EN/VI spans and phonemize each accordingly, preserving whitespace/punctuation."""
134
+ try:
135
+ # Fallback: no detector => phonemize the whole string with self.lang,
136
+ # but via g2p_chunk so whitespace/punctuation are not lost.
137
+ if self.detector_tokenizer is None or self.detector_model is None:
138
+ return self.g2p_chunk(text, self.lang)
139
+
140
+ spans = infer_spans(text, self.detector_tokenizer, self.detector_model)
141
+ spans = sorted(spans, key=lambda x: x["start"])
142
+
143
+ tokens_all = []
144
+ last = 0
145
+ for sp in spans:
146
+ s, e = sp["start"], sp["end"]
147
+ # the part before the EN span -> VI
148
+ if s > last:
149
+ vi_chunk = text[last:s]
150
+ if vi_chunk:
151
+ tokens_all.extend(self.g2p_chunk(vi_chunk, "vi"))
152
+ # the EN span
153
+ en_chunk = text[s:e]
154
+ if en_chunk:
155
+ tokens_all.extend([" "])
156
+ tokens_all.extend(self.g2p_chunk(en_chunk, "en"))
157
+ last = e
158
+
159
+ # the remaining part after the last EN span -> VI
160
+ if last < len(text):
161
+ vi_chunk = text[last:]
162
+ if vi_chunk:
163
+ tokens_all.extend(self.g2p_chunk(vi_chunk, "vi"))
164
+
165
+ return tokens_all
166
+
167
+ except Exception as ex:
168
+ logging.warning(f"Tokenization of mixed {self.lang} texts failed: {ex}")
169
+ return []
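+ # Illustrative example: for "tΓ΄i thΓ­ch machine learning", the detector is
+ # expected to tag "machine learning" as EN; the Vietnamese parts are
+ # phonemized with espeak "vi", the EN span with espeak "en", joined by a
+ # space token.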
170
+ def texts_to_token_ids(
171
+ self,
172
+ texts: List[str],
173
+ ) -> List[List[int]]:
174
+ return self.tokens_to_token_ids(self.texts_to_tokens(texts))
175
+
176
+ def texts_to_tokens(
177
+ self,
178
+ texts: List[str],
179
+ ) -> List[List[str]]:
180
+ tokens_list = [self.g2p(texts[i]) for i in range(len(texts))]
181
+ return tokens_list
182
+
183
+ def tokens_to_token_ids(
184
+ self,
185
+ tokens_list: List[List[str]],
186
+ ) -> List[List[int]]:
187
+ assert self.has_tokens, "Please initialize Tokenizer with a tokens file."
188
+
189
+ token_ids_list = []
190
+
191
+ for tokens in tokens_list:
192
+ token_ids = []
193
+ for t in tokens:
194
+ if t not in self.token2id:
195
+ logging.debug(f"Skip OOV {t}")
196
+ continue
197
+ token_ids.append(self.token2id[t])
198
+
199
+ token_ids_list.append(token_ids)
200
+
201
+ return token_ids_list
202
+ import re # <-- added
203
+ import random
204
+ import datetime as dt
205
+ import json
206
+ import logging
207
+ import os
208
+ from pathlib import Path
209
+ from typing import Optional
210
+
211
+ import numpy as np
212
+ import safetensors.torch
213
+ import torch
214
+ import torchaudio
215
+ from huggingface_hub import hf_hub_download
216
+ from lhotse.utils import fix_random_seed
217
+ from vocos import Vocos
218
+
219
+ from zipvoice.models.zipvoice import ZipVoice
220
+ from zipvoice.models.zipvoice_distill import ZipVoiceDistill
221
+ # from zipvoice.tokenizer.tokenizer import EmiliaTokenizer, EspeakTokenizer, LibriTTSTokenizer, SimpleTokenizer, SimpleTokenizer2
222
+ from zipvoice.utils.checkpoint import load_checkpoint
223
+ from zipvoice.utils.common import AttributeDict
224
+ from zipvoice.utils.feature import VocosFbank
225
+ def load_vocab(file_path):
226
+ """Read a vocab file with lines "char<TAB>id" -> return a dict {id: char}."""
227
+ id2char = {}
228
+ with open(file_path, "r", encoding="utf-8") as f:
229
+ for line in f:
230
+ if not line.strip():
231
+ continue
232
+ # strip the trailing \n but keep leading spaces
233
+ line = line.rstrip("\n")
234
+ parts = line.split("\t")
235
+ if len(parts) != 2:
236
+ continue # skip malformed lines
237
+ char, idx = parts
238
+ id2char[int(idx)] = char
239
+ return id2char
240
+
241
+
242
+ def tokens_to_text(tokens, id2char):
243
+ """Convert a list of token ids back to a string."""
244
+ return "".join(id2char.get(t, "<unk>") for t in tokens)
245
+
246
+ def get_vocoder(vocos_local_path: Optional[str] = None):
247
+ if vocos_local_path:
248
+ vocoder = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
249
+ state_dict = torch.load(
250
+ f"{vocos_local_path}/pytorch_model.bin",
251
+ weights_only=True,
252
+ map_location="cpu",
253
+ )
254
+ vocoder.load_state_dict(state_dict)
255
+ else:
256
+ vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz")
257
+ return vocoder
258
+
259
+
260
+ HUGGINGFACE_REPO = "k2-fsa/ZipVoice"
261
+ MODEL_DIR = {
262
+ "zipvoice": "zipvoice",
263
+ "zipvoice_distill": "zipvoice_distill",
264
+ }
265
+
266
+ model_dir="zipvoice_finetune/"
267
+ checkpoint_name="iter-525000-avg-2.pt"
268
+ # checkpoint_name="model.pt"
269
+ model_dir = Path(model_dir)
270
+ model_ckpt = model_dir / checkpoint_name
271
+ model_config_path = model_dir / "model.json"
272
+ token_file = model_dir / "tokens.txt"
273
+
274
+
275
+ tokenizer = EspeakTokenizer(token_file=token_file, tokenizer=tokenizer_detect, model=model_detect)
276
+
277
+
278
+ tokenizer_config = {"vocab_size": tokenizer.vocab_size, "pad_id": tokenizer.pad_id}
279
+
280
+ with open(model_config_path, "r") as f:
281
+ model_config = json.load(f)
282
+
283
+ # --- Init model ---
284
+
285
+ model = ZipVoice(**model_config["model"], **tokenizer_config)
286
+
287
+ if str(model_ckpt).endswith(".safetensors"):
288
+ safetensors.torch.load_model(model, model_ckpt)
289
+ else:
290
+ load_checkpoint(filename=model_ckpt, model=model, strict=True)
291
+
292
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
293
+ model = model.to(device).eval()
294
+
295
+ # --- Vocoder & features ---
296
+ vocoder = get_vocoder(None).to(device).eval()
297
+ feature_extractor = VocosFbank()
298
+ sampling_rate = model_config["feature"]["sampling_rate"]
304
+ def score_tokens(A):
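+ # Undocumented heuristic, reconstructed from the code (the intent of B is an
+ # assumption): B appears to be a fixed set of "special" token ids; the
+ # sequence is split into segments at separator id 3, and each segment scores
+ # 1 for its first B -> non-B transition plus 0.5 for every additional one
+ # (ending a segment on a B token also counts as a transition).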
305
+ B = [9, 14, 18, 21, 27, 33, 37, 39, 42, 45, 50, 51, 52, 54, 58, 59, 61, 62, 63, 69, 73, 74, 79, 85, 99, 100, 102, 105, 119, 120, 121, 122, 123, 124, 141, 143, 144, 145, 146, 157, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 349, 350, 353, 356, 357, 358, 359]
306
+
307
+ total_score = 0
308
+ # Add separator token 3 at the start and end
309
+ tokens = [3] + A + [3]
310
+
311
+ # Split the sequence at token 3
312
+ segment = []
313
+ for t in tokens:
314
+ if t == 3:
315
+ if segment: # process one segment
316
+ count = 0
317
+ for i in range(len(segment) - 1):
318
+ if (segment[i] in B and segment[i+1] not in B):
319
+ # print(f"{segment[i]} in B and {segment[i+1]} not in B)")
320
+ count += 1
321
+ if segment[-1] in B:
322
+ # print(f"{segment[-1]} in B")
323
+ count += 1
324
+ if count > 0:
325
+ total_score += 1 + (count - 1) * 0.5
326
+ segment = []
327
+ else:
328
+ segment.append(t)
329
+
330
+ return total_score
331
+
332
+
333
+ def trim_leading_silence_torch(
334
+ wav: torch.Tensor,
335
+ sample_rate: int,
336
+ silence_thresh: float = 0.05,
337
+ chunk_ms: int = 10,
338
+ extend_ms: int = 20,
339
+ ratio: float = 0.95, # fraction of samples that must be below the threshold to count as silence
340
+ ):
341
+ wav_np = wav.squeeze(0).cpu().numpy().astype(np.float32)
342
+ norm_wav = wav_np / (np.max(np.abs(wav_np)) + 1e-8)
343
+
344
+ chunk_size = int(sample_rate * chunk_ms / 1000)
345
+ total_chunks = int(len(norm_wav) / chunk_size)
346
+
347
+ start_idx = 0
348
+ for i in range(total_chunks):
349
+ chunk = norm_wav[i * chunk_size : (i + 1) * chunk_size]
350
+ # Fraction of samples below the threshold
351
+ silent_ratio = np.mean(np.abs(chunk) < silence_thresh)
352
+ if silent_ratio < ratio: # fewer than 95% of samples are silent -> treat as speech
353
+ start_idx = max(0, i * chunk_size - int(sample_rate * extend_ms / 1000))
354
+ break
355
+
356
+ return wav[:, start_idx:]
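+ # Illustrative example: with the defaults, audio is trimmed up to the first
+ # 10 ms chunk in which fewer than 95% of samples fall below the amplitude
+ # threshold, and the cut point is then moved 20 ms earlier as a safety margin.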
357
+
358
+
359
+
360
+
361
+ @torch.inference_mode()
362
+ def run_zipvoice(
363
+ model_name="zipvoice",
364
+ model_dir="zipvoice_finetune",
365
+ checkpoint_name="model.pt",
366
+ vocoder_path=None,
367
+ tokenizer_name="emilia",
368
+ lang="en-us",
369
+ test_list=None, # path to tsv file
370
+ prompt_wav=None,
371
+ prompt_text=None,
372
+ text=None,
373
+ res_dir="results",
374
+ res_wav_path="result.wav",
375
+ guidance_scale=None,
376
+ num_step=None,
377
+ feat_scale=0.1,
378
+ speed=1.0,
379
+ t_shift=0.5,
380
+ target_rms=0.1,
381
+ seed=666,
382
+ ):
383
+ text = text.lower() if text is not None else text # text is None when test_list is used
384
+ # --- Default settings per model ---
385
+ model_defaults = {
386
+ "zipvoice": {"num_step": 16, "guidance_scale": 1.0},
387
+ "zipvoice_distill": {"num_step": 8, "guidance_scale": 3.0},
388
+ }
389
+ # assign defaults explicitly (no longer via locals())
390
+ if guidance_scale is None:
391
+ guidance_scale = model_defaults.get(model_name, {}).get("guidance_scale", 1.0)
392
+ if num_step is None:
393
+ num_step = model_defaults.get(model_name, {}).get("num_step", 16)
394
+
395
+ # --- Check inputs ---
396
+ assert (test_list is not None) ^ (prompt_wav is not None and prompt_text is not None and text is not None), \
397
+ "Need either test_list or (prompt_wav + prompt_text + text)"
398
+
399
+ fix_random_seed(seed)
400
+
401
+ # --- Load tokenizer, model, vocoder, features ... (kept unchanged) ---
402
+ # [the tokenizer/model/vocoder/feature_extractor/sampling_rate loading above is reused as-is]
403
+
404
+ # ---------------------------
405
+ # NEW: helper that splits the input text into chunks
406
+ # ---------------------------
407
+ def split_text_into_chunks(s: str, min_chars: int = 15, max_chars: int = 30):
408
+ """
409
+ Split on ',' or '.', then merge/split so each chunk's length lies in [min_chars, max_chars].
410
+ Never cuts in the middle of a word.
411
+ """
412
+ # normalize whitespace
413
+ s = re.sub(r"\s+", " ", (s or "").strip())
414
+ if not s:
415
+ return []
416
+
417
+ # split on ',' or '.'
418
+ raw_segs = [seg.strip() for seg in re.split(r"\s*[.,]\s*", s) if seg.strip()]
419
+
420
+ chunks = []
421
+ i = 0
422
+ while i < len(raw_segs):
423
+ cur = raw_segs[i]
424
+ i += 1
425
+
426
+ # keep merging following segments while cur is too short
427
+ while len(cur) < min_chars and i < len(raw_segs):
428
+ cur = (cur + ", " + raw_segs[i]).strip()
429
+ i += 1
430
+
431
+ # if cur is too long, split at word boundaries to stay <= max_chars
432
+ if len(cur) > max_chars:
433
+ words = cur.split()
434
+ buf = []
435
+ cur_len = 0
436
+ for w in words:
437
+ # +1 for the separating space if needed
438
+ add_len = len(w) if cur_len == 0 else len(w) + 1
439
+ if cur_len + add_len <= max_chars:
440
+ buf.append(w)
441
+ cur_len += add_len
442
+ else:
443
+ # close the current chunk
444
+ part = ", ".join(buf).strip()
445
+ if part:
446
+ chunks.append(part)
447
+ # start a new chunk
448
+ buf = [w]
449
+ cur_len = len(w)
450
+ # the remainder
451
+ last = " ".join(buf).strip()
452
+ if last:
453
+ # if the tail is still < min_chars and can be merged with the previous chunk
454
+ if len(last) < min_chars and chunks:
455
+ merged = (chunks[-1] + " " + last).strip()
456
+ if len(merged) <= max_chars:
457
+ chunks[-1] = merged
458
+ else:
459
+ chunks.append(last) # accept it as-is (rare in practice)
460
+ else:
461
+ chunks.append(last)
462
+ else:
463
+ chunks.append(cur)
464
+
465
+ # final pass: if the last chunk is too short, merge it into the previous one
466
+ if len(chunks) >= 2 and len(chunks[-1]) < min_chars:
467
+ merged = (chunks[-2] + ", " + chunks[-1]).strip()
468
+ if len(merged) <= max_chars:
469
+ chunks[-2] = merged
470
+ chunks.pop()
471
+ # print(chunks)
472
+ final_chunk = []
473
+ for chunk in chunks:
474
+ chunk = ", " + chunk + ","
475
+ final_chunk.append(chunk)
476
+ return final_chunk
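+ # Note: each returned chunk is wrapped as ", <chunk>," above; the surrounding
+ # commas presumably act as pause cues for the synthesizer.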
477
+
478
+ # ---------------------------
479
+ # MODIFIED: generate_sentence synthesizes chunk by chunk, then concatenates
480
+ # ---------------------------
481
+ def generate_sentence(save_path, prompt_text, prompt_wav, text):
482
+ # normalize & split the text into chunks
483
+ segments = split_text_into_chunks(text, min_chars=50, max_chars=200)
484
+ if not segments:
485
+ # nothing to say: write a 0.2 s silent file
486
+ silence = torch.zeros((1, int(0.2 * sampling_rate)))
487
+ torchaudio.save(save_path, silence, sample_rate=sampling_rate)
488
+ return
489
+
490
+ # prepare the prompt (done once)
491
+ prompt_tokens = tokenizer.texts_to_token_ids([prompt_text])
492
+ prompt_wav_tensor, sr = torchaudio.load(prompt_wav)
493
+ if sr != sampling_rate:
494
+ prompt_wav_tensor = torchaudio.transforms.Resample(sr, sampling_rate)(prompt_wav_tensor)
495
+ prompt_rms_val = torch.sqrt(torch.mean(prompt_wav_tensor**2))
496
+ if prompt_rms_val < target_rms:
497
+ prompt_wav_tensor *= target_rms / prompt_rms_val
498
+
499
+ prompt_features = feature_extractor.extract(
500
+ prompt_wav_tensor, sampling_rate=sampling_rate
501
+ ).to(device)
502
+ prompt_features = prompt_features.unsqueeze(0) * feat_scale
503
+ prompt_features_lens = torch.tensor([prompt_features.size(1)], device=device)
504
+ # print(prompt_features_lens)
505
+
506
+ num_space_prompt = prompt_text.count(" ")
507
+
508
+ # silence gap between chunks (~0.2 s)
509
+
510
+
511
+ gap_duration = random.uniform(0.17, 0.2) # random value between 0.17 and 0.2 s
512
+ gap = torch.zeros((1, int(gap_duration * sampling_rate)))
513
+
514
+ wav_parts = []
515
+ print("segments",segments)
516
+ for idx, seg in enumerate(segments):
517
+ # print(seg)
518
+ num_space_text = seg.count(" ")
519
+ tokens = tokenizer.texts_to_token_ids([seg])
520
+ # print(tokens)
521
+ score = score_tokens(tokens[0])
522
+ # print(score)
523
+ # print(prompt_tokens)
524
+ score_prompt = score_tokens(prompt_tokens[0])
525
+ # print(score_prompt)
526
+ vocab_file = "zipvoice_finetune/tokens.txt" # txt vocab file in the char<TAB>id format read by load_vocab
527
+
528
+ id2char = load_vocab(vocab_file)
529
+ decoded_text = tokens_to_text(tokens[0], id2char)
530
+
531
+ print(decoded_text)
532
+
533
+ pred_features, _, _, _ = model.sample(
534
+ num_space_text=[num_space_text],
535
+ num_space_prompt=[num_space_prompt],
536
+ tokens=tokens,
537
+ prompt_tokens=prompt_tokens,
538
+ prompt_features=prompt_features,
539
+ prompt_features_lens=prompt_features_lens,
540
+ speed= speed,
541
+ t_shift= t_shift,
542
+ duration="predict",
543
+ num_step= num_step,
544
+ guidance_scale= guidance_scale,
545
+ )
546
+ pred_features = pred_features.permute(0, 2, 1) / feat_scale
547
+ wav = vocoder.decode(pred_features).squeeze(1).clamp(-1, 1)
548
+
549
+ # restore the loudness level relative to the prompt
550
+ if prompt_rms_val < target_rms:
551
+ wav *= prompt_rms_val / target_rms
552
+ wav = trim_leading_silence_torch(
553
+ wav, sample_rate=sampling_rate, silence_thresh=0.086, chunk_ms=10, extend_ms=20
554
+ )
555
+ wav_parts.append(wav.cpu())
556
+ if idx < len(segments) - 1:
557
+ wav_parts.append(gap) # insert the silence gap
558
+
559
+ final_wav = torch.cat(wav_parts, dim=-1) # [1, T_total]
560
+ torchaudio.save(save_path, final_wav, sample_rate=sampling_rate)
561
+
562
+ # --- generate_list unchanged: it calls generate_sentence, so chunking applies automatically ---
563
+ def generate_list(res_dir, test_list):
564
+ os.makedirs(res_dir, exist_ok=True)
565
+ with open(test_list, "r", encoding="utf-8") as fr:
566
+ for i, line in enumerate(fr):
567
+ wav_name, prompt_text, prompt_wav, text = line.strip().split("\t")
568
+ save_path = f"{res_dir}/{wav_name}.wav"
569
+ generate_sentence(save_path, prompt_text, prompt_wav, text)
570
+
571
+ # --- Run ---
572
+ if test_list:
573
+ generate_list(res_dir, test_list)
574
+ else:
575
+ generate_sentence(res_wav_path, prompt_text, prompt_wav, text)
576
+
577
+ print("βœ… Done!")
578
+ return text
proccess_wav.py ADDED
@@ -0,0 +1,364 @@
1
+ from typing import List, Tuple
2
+ import numpy as np
3
+ from pydub import AudioSegment
4
+ import os
5
+ from chunkformer import ChunkFormerModel
6
+ from clearvoice import ClearVoice
7
+ # ======================= ASR + CLEARVOICE + AUDIO PROCESSING =======================
8
+
9
+ ASR_MODEL = None
10
+ CLEARVOICE_MODEL = None
11
+ REF_AUDIO_CACHE = {} # cache: input path -> processed output path
12
+
13
+
14
+ def get_asr_model() -> ChunkFormerModel:
15
+ """Lazy-load ChunkFormer (ASR, runs on CPU)."""
16
+ global ASR_MODEL
17
+ if ASR_MODEL is None:
18
+ ASR_MODEL = ChunkFormerModel.from_pretrained("khanhld/chunkformer-ctc-large-vie")
19
+ return ASR_MODEL
20
+
21
+
22
+ def get_clearvoice_model() -> ClearVoice:
23
+ """Lazy-load ClearVoice for denoising the reference audio."""
24
+ global CLEARVOICE_MODEL
25
+ if CLEARVOICE_MODEL is None:
26
+ CLEARVOICE_MODEL = ClearVoice(
27
+ task="speech_enhancement",
28
+ model_names=["MossFormer2_SE_48K"],
29
+ )
30
+ return CLEARVOICE_MODEL
31
+
32
+
33
+ def find_silent_regions(
34
+ audio: AudioSegment,
35
+ silence_thresh: float = 0.05, # amplitude after normalization to [-1, 1]
36
+ chunk_ms: int = 10,
37
+ min_silence_len: int = 200,
38
+ ) -> List[Tuple[int, int]]:
39
+ """
40
+ Find silent regions (start_ms, end_ms) in an AudioSegment based on amplitude.
41
+ """
42
+ samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
43
+ if audio.channels > 1:
44
+ samples = samples.reshape((-1, audio.channels)).mean(axis=1)
45
+
46
+ norm = samples / (2 ** (audio.sample_width * 8 - 1))
47
+ sr = audio.frame_rate
48
+
49
+ chunk_size = max(1, int(sr * chunk_ms / 1000))
50
+ total_chunks = len(norm) // chunk_size
51
+
52
+ silent_regions: List[Tuple[int, int]] = []
53
+ start = None
54
+ for i in range(total_chunks):
55
+ chunk = norm[i * chunk_size: (i + 1) * chunk_size]
56
+ if chunk.size == 0:
57
+ continue
58
+
59
+ if np.all((chunk > -silence_thresh) & (chunk < silence_thresh)):
60
+ if start is None:
61
+ start = i
62
+ else:
63
+ if start is not None:
64
+ dur = (i - start) * chunk_ms
65
+ if dur >= min_silence_len:
66
+ silent_regions.append((start * chunk_ms, i * chunk_ms))
67
+ start = None
68
+
69
+ if start is not None:
70
+ dur = (total_chunks - start) * chunk_ms
71
+ if dur >= min_silence_len:
72
+ silent_regions.append((start * chunk_ms, total_chunks * chunk_ms))
73
+
74
+ return silent_regions
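+ # Illustrative example: with chunk_ms=10 and min_silence_len=200, a region
+ # only counts as silence once at least 20 consecutive 10 ms chunks stay
+ # below the threshold.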
75
+
76
+
77
+ def trim_leading_trailing_silence(
78
+ audio: AudioSegment,
79
+ silence_thresh: float = 0.05,
80
+ chunk_ms: int = 10,
81
+ min_silence_len: int = 200,
82
+ ) -> AudioSegment:
83
+ """
84
+ Bỏ khoαΊ£ng lαΊ·ng Δ‘αΊ§u/cuα»‘i file.
85
+ """
86
+ duration = len(audio)
87
+ silent_regions = find_silent_regions(
88
+ audio,
89
+ silence_thresh=silence_thresh,
90
+ chunk_ms=chunk_ms,
91
+ min_silence_len=min_silence_len,
92
+ )
93
+
94
+ if not silent_regions:
95
+ return audio
96
+
97
+ start_trim = 0
98
+ end_trim = duration
99
+
100
+ # silence at the start of the file
101
+ first_start, first_end = silent_regions[0]
102
+ if first_start <= 0:
103
+ start_trim = max(start_trim, first_end)
104
+
105
+ # silence at the end of the file
106
+ last_start, last_end = silent_regions[-1]
107
+ if last_end >= duration:
108
+ end_trim = min(end_trim, last_start)
109
+
110
+ return audio[start_trim:end_trim]
111
+
112
+
113
+ def compress_internal_silence(
114
+ audio: AudioSegment,
115
+ max_silence_ms: int = 300,
116
+ silence_thresh: float = 0.05,
117
+ chunk_ms: int = 10,
118
+ min_silence_len: int = 50,
119
+ ) -> AudioSegment:
120
+ """
121
+ Shorten internal silences:
122
+ - silences <= max_silence_ms: kept as-is
123
+ - silences > max_silence_ms: truncated to max_silence_ms
124
+ """
125
+ duration = len(audio)
126
+ silent_regions = find_silent_regions(
127
+ audio,
128
+ silence_thresh=silence_thresh,
129
+ chunk_ms=chunk_ms,
130
+ min_silence_len=min_silence_len,
131
+ )
132
+ if not silent_regions:
133
+ return audio
134
+
135
+ new_audio = AudioSegment.silent(duration=0, frame_rate=audio.frame_rate)
136
+ cursor = 0
137
+
138
+ for s_start, s_end in silent_regions:
139
+ # the speech part before the silence
140
+ if s_start > cursor:
141
+ new_audio += audio[cursor:s_start]
142
+
143
+ silence_len = s_end - s_start
144
+ if silence_len <= max_silence_ms:
145
+ new_audio += audio[s_start:s_end]
146
+ else:
147
+ new_audio += audio[s_start: s_start + max_silence_ms]
148
+
149
+ cursor = s_end
150
+
151
+ # the remaining part after the last silence
152
+ if cursor < duration:
153
+ new_audio += audio[cursor:]
154
+
155
+ return new_audio
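+ # Illustrative example: with max_silence_ms=300, a 1.2 s silent gap is cut
+ # down to 0.3 s, while gaps of 0.3 s or shorter are left untouched.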
156
+
157
+
158
+ def select_subsegment_by_silence(
159
+ audio: AudioSegment,
160
+ min_len_ms: int = 5000,
161
+ max_len_ms: int = 10000,
162
+ silence_thresh: float = 0.05,
163
+ chunk_ms: int = 10,
164
+ min_silence_len: int = 200,
165
+ ) -> AudioSegment:
166
+ """
167
+ If the audio is longer than max_len_ms, pick one segment whose length lies in [min_len_ms, max_len_ms],
168
+ cutting at points inside silent regions to avoid cutting through speech.
169
+ """
170
+ duration = len(audio)
171
+ if duration <= max_len_ms:
172
+ return audio
173
+
174
+ silent_regions = find_silent_regions(
175
+ audio,
176
+ silence_thresh=silence_thresh,
177
+ chunk_ms=chunk_ms,
178
+ min_silence_len=min_silence_len,
179
+ )
180
+
181
+ if not silent_regions:
182
+ # no silent regions found -> take the middle segment
183
+ target_len = min(max_len_ms, duration)
184
+ start = max(0, (duration - target_len) // 2)
185
+ end = start + target_len
186
+ return audio[start:end]
187
+
188
+ # each boundary is the midpoint of a silent region (guaranteed to lie inside silence)
189
+ boundaries = [0]
190
+ for s_start, s_end in silent_regions:
191
+ mid = (s_start + s_end) // 2
192
+ if 0 < mid < duration:
193
+ boundaries.append(mid)
194
+ boundaries.append(duration)
195
+ boundaries = sorted(set(boundaries))
196
+
197
+ # prefer the first segment that satisfies the 5-10 s window
198
+ for i in range(len(boundaries)):
199
+ for j in range(i + 1, len(boundaries)):
200
+ seg_len = boundaries[j] - boundaries[i]
201
+ if min_len_ms <= seg_len <= max_len_ms:
202
+ return audio[boundaries[i]:boundaries[j]]
203
+
204
+ # if no segment fits entirely within [min, max], choose the one closest to max_len
205
+ best_i, best_j, best_diff = 0, None, None
206
+ for i in range(len(boundaries)):
207
+ for j in range(i + 1, len(boundaries)):
208
+ seg_len = boundaries[j] - boundaries[i]
209
+ if seg_len >= min_len_ms:
210
+ diff = abs(seg_len - max_len_ms)
211
+ if best_diff is None or diff < best_diff:
212
+ best_diff = diff
213
+ best_i, best_j = i, j
214
+
215
+ if best_j is not None:
216
+ return audio[boundaries[best_i]:boundaries[best_j]]
217
+
218
+ # final fallback
219
+ target_len = min(max_len_ms, duration)
220
+ start = max(0, (duration - target_len) // 2)
221
+ end = start + target_len
222
+ return audio[start:end]
223
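+ # Sketch of the selection above (hypothetical values): for a 25 s clip with
+ # silence midpoints at 4 s, 9 s and 18 s, boundaries = [0, 4000, 9000,
+ # 18000, 25000]; the first pair whose gap lies in [5000, 10000] is
+ # (0, 9000), so audio[0:9000] is returned.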
+
224
+
225
+ def enhance_ref_audio(input_path: str) -> str:
226
+ """
227
+ WAV preprocessing pipeline for TTS:
228
+ - ClearVoice denoising
229
+ - trim leading/trailing silence
230
+ - shorten internal silences longer than 0.3 s to 0.3 s
231
+ - if the audio is longer than 10 s: pick one 5-10 s segment, cut at silences
232
+ Returns the path of the processed wav file.
233
+ """
234
+ if not input_path:
235
+ raise ValueError("No input audio path for enhancement.")
236
+
237
+ # cache so the same file is not processed more than once
238
+ if input_path in REF_AUDIO_CACHE:
239
+ return REF_AUDIO_CACHE[input_path]
240
+
241
+ cv = get_clearvoice_model()
242
+
243
+ base = os.path.basename(input_path)
244
+ name, ext = os.path.splitext(base)
245
+ if not ext:
246
+ ext = ".wav"
247
+ # 1) denoise (name/ext computed above so the fallback path keeps them defined)
248
+ try:
249
+ cv_out = cv(input_path=input_path, online_write=False)
250
+ denoised_path = os.path.join(os.path.dirname(input_path), f"{name}_denoised{ext}")
251
+ cv.write(cv_out, output_path=denoised_path)
252
+ except Exception as e:
253
+ print(f"[ClearVoice] Error during denoising, falling back to the original: {e}")
254
+ denoised_path = input_path
255
+
256
+ # 2) pydub handles silence trimming and length selection
257
+ audio = AudioSegment.from_file(denoised_path)
258
+
259
+ # trim leading/trailing silence
260
+ audio = trim_leading_trailing_silence(audio)
261
+
262
+ # shorten internal silences
263
+ audio = compress_internal_silence(audio, max_silence_ms=300)
264
+
265
+ # if longer than 10 s, pick a segment in the 5-10 s range
266
+ audio = select_subsegment_by_silence(audio, min_len_ms=5000, max_len_ms=10000)
267
+
268
+ # 3) write out a new file
269
+ enhanced_path = os.path.join(os.path.dirname(denoised_path), f"{name}_enhanced.wav")
270
+ audio.export(enhanced_path, format="wav")
271
+
272
+ REF_AUDIO_CACHE[input_path] = enhanced_path
273
+ return enhanced_path
274
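+ # End-to-end sketch (assumes REF_AUDIO_CACHE and get_clearvoice_model() are
+ # defined earlier in this module):
+ #
+ #   ref = enhance_ref_audio("speaker.wav")   # hypothetical input path
+ #   ref2 = enhance_ref_audio("speaker.wav")  # same path: served from the cache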
+
275
+ def split_audio_by_silence(
276
+ audio: AudioSegment,
277
+ silence_thresh: float = 0.05,
278
+ chunk_ms: int = 10,
279
+ min_silence_len: int = 200,
280
+ min_segment_len: int = 200,
281
+ ) -> List[Tuple[int, int]]:
282
+ """
283
+ Tα»« AudioSegment, trαΊ£ về cΓ‘c Δ‘oαΊ‘n cΓ³ tiαΊΏng nΓ³i (non-silent)
284
+ được tΓ‘ch bαΊ±ng khoαΊ£ng lαΊ·ng.
285
+ """
286
+ duration = len(audio)
287
+ silent_regions = find_silent_regions(
288
+ audio,
289
+ silence_thresh=silence_thresh,
290
+ chunk_ms=chunk_ms,
291
+ min_silence_len=min_silence_len,
292
+ )
293
+
294
+ segments: List[Tuple[int, int]] = []
295
+ cur_start = 0
296
+
297
+ for s_start, s_end in silent_regions:
298
+ if cur_start < s_start:
299
+ if s_start - cur_start >= min_segment_len:
300
+ segments.append((cur_start, s_start))
301
+ cur_start = s_end
302
+
303
+ if cur_start < duration and duration - cur_start >= min_segment_len:
304
+ segments.append((cur_start, duration))
305
+
306
+ # if no segments were found, use the whole file
307
+ if not segments:
308
+ segments.append((0, duration))
309
+
310
+ return segments
311
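+ # Illustrative: with silent regions [(1000, 1400), (3000, 3300)] in a 5 s
+ # clip, the speech segments are [(0, 1000), (1400, 3000), (3300, 5000)],
+ # keeping only those at least min_segment_len long.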
+
312
+
313
+ def transcribe_ref_audio(audio_path: str) -> str:
314
+ """
315
+ ASR theo yΓͺu cαΊ§u:
316
+ - CαΊ―t Γ’m thanh theo khoαΊ£ng lαΊ·ng
317
+ - ASR tα»«ng Δ‘oαΊ‘n
318
+ - Nα»‘i text bαΊ±ng dαΊ₯u phαΊ©y
319
+ """
320
+ if not audio_path:
321
+ raise ValueError("No audio path for ASR.")
322
+
323
+ model = get_asr_model()
324
+ audio = AudioSegment.from_file(audio_path)
325
+ segments = split_audio_by_silence(audio)
326
+
327
+ texts = []
328
+ base, _ = os.path.splitext(audio_path)
329
+
330
+ for idx, (start_ms, end_ms) in enumerate(segments):
331
+ seg_audio = audio[start_ms:end_ms]
332
+ seg_path = f"{base}_seg_{idx}.wav"
333
+ seg_audio.export(seg_path, format="wav")
334
+
335
+ try:
336
+ transcription = model.endless_decode(
337
+ audio_path=seg_path,
338
+ chunk_size=32,
339
+ left_context_size=0,
340
+ right_context_size=0,
341
+ total_batch_duration=400,
342
+ return_timestamps=False,
343
+ )
344
+ except TypeError:
345
+ transcription = model.endless_decode(
346
+ audio_path=seg_path,
347
+ chunk_size=32,
348
+ left_context_size=0,
349
+ right_context_size=0,
350
+ total_batch_duration=400,
351
+ )
352
+
353
+ if isinstance(transcription, str):
354
+ text = transcription
355
+ else:
356
+ text = str(transcription)
357
+
358
+ text = text.strip()
359
+ if text:
360
+ texts.append(text)
361
+
362
+ return ", ".join(texts)
363
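+ # Usage sketch (assumes get_asr_model() returns a model exposing the
+ # endless_decode method used above): each speech segment is exported to
+ # "<base>_seg_<idx>.wav", transcribed, and the pieces joined with commas:
+ #
+ #   text = transcribe_ref_audio("speaker.wav")  # e.g. "hello there, how are you"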
+
364
+
pyproject.toml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [tool.isort]
2
+ profile = "black"
3
+
4
+ [tool.black]
5
+ line-length = 88
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --find-links https://k2-fsa.github.io/icefall/piper_phonemize.html
2
+ transformers==4.57.1
3
+ torch
4
+ torchaudio
5
+ torchcodec
6
+ numpy
7
+ lhotse
8
+ huggingface_hub
9
+ safetensors
10
+ tensorboard
11
+ vocos
12
+
13
+ # Normalization
14
+ cn2an
15
+ inflect
16
+
17
+ # Tokenization
18
+ jieba
19
+ piper_phonemize
20
+ pypinyin
21
+ setuptools<81
22
+ chunkformer
23
+ clearvoice
requirements_eval.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+
4
+ # Audio processing
5
+ librosa
6
+ soundfile
7
+
8
+ # Model
9
+ s3prl
10
+ pyannote.audio
11
+ funasr
12
+ transformers
13
+
14
+ # WER
15
+ jiwer==3.1.0
16
+
17
+ # Normalization
18
+ zhconv
19
+ zhon
setup.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import requests
4
+ from dotenv import load_dotenv
5
+
6
+ def run_cmd(cmd):
7
+ print(f"🔹 Running command: {cmd}")
8
+ result = subprocess.run(cmd, shell=True)
9
+ if result.returncode != 0:
10
+ raise RuntimeError(f"Command failed: {cmd}")
11
+
12
+ def download_with_token(url, dest_path, token):
13
+ headers = {"Authorization": f"Bearer {token}"}
14
+ with requests.get(url, headers=headers, stream=True) as r:
15
+ r.raise_for_status()
16
+ with open(dest_path, "wb") as f:
17
+ for chunk in r.iter_content(chunk_size=8192):
18
+ f.write(chunk)
19
+ print(f"✅ Downloaded: {dest_path}")
20
+
21
+ def main():
22
+ # Load environment variables from .env
23
+ load_dotenv()
24
+ token = os.getenv("HF_TOKEN")
25
+
26
+ if not token:
27
+ raise EnvironmentError("❌ Missing HF_TOKEN environment variable. Create a .env file with the line:\nHF_TOKEN=hf_your_token_here")
28
+
29
+ # Log in to the Hugging Face CLI
30
+ run_cmd(f"huggingface-cli login --token {token}")
31
+
32
+ # Create the directory that holds the model
33
+ os.makedirs("zipvoice_finetune", exist_ok=True)
34
+
35
+ # Files to download
36
+ files = {
37
+ "iter-525000-avg-2.pt": "https://huggingface.co/datasets/meandyou200175/temp_file/resolve/main/zip/epoch-46-all-speak-600h-en-norm.pt",
38
+ "model.json": "https://huggingface.co/datasets/meandyou200175/temp_file/resolve/main/zip/model.json",
39
+ "tokens.txt": "https://huggingface.co/datasets/meandyou200175/temp_file/resolve/main/zip/tokens.txt",
40
+ }
41
+
42
+ for filename, url in files.items():
43
+ dest = os.path.join("zipvoice_finetune", filename)
44
+ download_with_token(url, dest, token)
45
+
46
+ # Install requirements
47
+ if os.path.exists("requirements.txt"):
48
+ run_cmd("pip install -r requirements.txt")
49
+ else:
50
+ print("⚠️ KhΓ΄ng tΓ¬m thαΊ₯y requirements.txt")
51
+
52
+ print("\nπŸŽ‰ Setup hoΓ n tαΊ₯t!")
53
+
54
+ if __name__ == "__main__":
55
+ main()
zipvoice/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ warnings.filterwarnings(
4
+ "ignore",
5
+ category=UserWarning,
6
+ message="pkg_resources is deprecated as an API.*",
7
+ )
zipvoice/bin/compute_fbank.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024-2025 Xiaomi Corp. (authors: Wei Kang
3
+ # Han Zhu)
4
+ #
5
+ # See ../../../../LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ Usage:
20
+ python3 -m zipvoice.bin.compute_fbank \
21
+ --source-dir data/manifests \
22
+ --dest-dir data/fbank \
23
+ --dataset libritts \
24
+ --subset dev-other \
25
+ --sampling-rate 24000 \
26
+ --num-jobs 20
27
+
28
+ The input would be data/manifests/libritts_cuts_dev-other.jsonl.gz or
29
+ (libritts_supervisions_dev-other.jsonl.gz and libritts_recordings_dev-other.jsonl.gz)
30
+
31
+ The output would be data/fbank/libritts_cuts_dev-other.jsonl.gz
32
+ """
33
+
34
+
35
+ import argparse
36
+ import logging
37
+ from concurrent.futures import ProcessPoolExecutor as Pool
38
+ from pathlib import Path
39
+
40
+ import lhotse
41
+ import torch
42
+ from lhotse import CutSet, LilcomChunkyWriter, load_manifest_lazy
43
+
44
+ from zipvoice.utils.common import str2bool
45
+ from zipvoice.utils.feature import VocosFbank
46
+
47
+ # Torch's multithreaded behavior needs to be disabled or
48
+ # it wastes a lot of CPU and slows things down.
49
+ # Do this outside of main() in case it needs to take effect
50
+ # even when we are not invoking the main (e.g. when spawning subprocesses).
51
+ torch.set_num_threads(1)
52
+ torch.set_num_interop_threads(1)
53
+
54
+ lhotse.set_audio_duration_mismatch_tolerance(0.1)
55
+
56
+
57
+ def get_args():
58
+ parser = argparse.ArgumentParser()
59
+
60
+ parser.add_argument(
61
+ "--sampling-rate",
62
+ type=int,
63
+ default=24000,
64
+ help="The target sampling rate, the audio will be resampled to it.",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "--type",
69
+ type=str,
70
+ default="vocos",
71
+ help="fbank type",
72
+ )
73
+
74
+ parser.add_argument(
75
+ "--dataset",
76
+ type=str,
77
+ help="Dataset name.",
78
+ )
79
+
80
+ parser.add_argument(
81
+ "--subset",
82
+ type=str,
83
+ help="The subset of the dataset.",
84
+ )
85
+
86
+ parser.add_argument(
87
+ "--source-dir",
88
+ type=str,
89
+ default="data/manifests",
90
+ help="The source directory of manifest files.",
91
+ )
92
+
93
+ parser.add_argument(
94
+ "--dest-dir",
95
+ type=str,
96
+ default="data/fbank",
97
+ help="The destination directory of manifest files.",
98
+ )
99
+
100
+ parser.add_argument(
101
+ "--split-cuts",
102
+ type=str2bool,
103
+ default=False,
104
+ help="Whether to use splited cuts.",
105
+ )
106
+
107
+ parser.add_argument(
108
+ "--split-begin",
109
+ type=int,
110
+ help="Start idx of splited cuts.",
111
+ )
112
+
113
+ parser.add_argument(
114
+ "--split-end",
115
+ type=int,
116
+ help="End idx of splited cuts.",
117
+ )
118
+
119
+ parser.add_argument(
120
+ "--batch-duration",
121
+ type=int,
122
+ default=1000,
123
+ help="The batch duration when computing the features.",
124
+ )
125
+
126
+ parser.add_argument(
127
+ "--num-jobs",
128
+ type=int,
129
+ default=20,
130
+ help="The number of extractor workers.",
131
+ )
132
+
133
+ return parser.parse_args()
134
+
135
+
136
+ def compute_fbank_split_single(params, idx):
137
+ logging.info(
138
+ f"Computing features for {idx}-th split of "
139
+ f"{params.dataset} dataset {params.subset} subset"
140
+ )
141
+ lhotse.set_audio_duration_mismatch_tolerance(0.1) # for emilia
142
+ src_dir = Path(params.source_dir)
143
+ output_dir = Path(params.dest_dir)
144
+
145
+ if not src_dir.exists():
146
+ logging.error(f"{src_dir} not exists")
147
+ return
148
+
149
+ if not output_dir.exists():
150
+ output_dir.mkdir(parents=True, exist_ok=True)
151
+
152
+ num_digits = 8
153
+ if params.type == "vocos":
154
+ extractor = VocosFbank()
155
+ else:
156
+ raise NotImplementedError(f"{params.type} is not supported")
157
+
158
+ prefix = params.dataset
159
+ subset = params.subset
160
+ suffix = "jsonl.gz"
161
+
162
+ idx = f"{idx}".zfill(num_digits)
163
+ cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}"
164
+
165
+ if (src_dir / cuts_filename).is_file():
166
+ logging.info(f"Loading manifests {src_dir / cuts_filename}")
167
+ cut_set = load_manifest_lazy(src_dir / cuts_filename)
168
+ else:
169
+ logging.warning(f"Raw {cuts_filename} not exists, skipping")
170
+ return
171
+
172
+ cut_set = cut_set.resample(params.sampling_rate)
173
+
174
+ if (output_dir / cuts_filename).is_file():
175
+ logging.info(f"{cuts_filename} already exists - skipping.")
176
+ return
177
+
178
+ logging.info(f"Processing {subset}.{idx} of {prefix}")
179
+
180
+ cut_set = cut_set.compute_and_store_features_batch(
181
+ extractor=extractor,
182
+ storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}",
183
+ num_workers=4,
184
+ batch_duration=params.batch_duration,
185
+ storage_type=LilcomChunkyWriter,
186
+ overwrite=True,
187
+ )
188
+ logging.info(f"Saving file to {output_dir / cuts_filename}")
189
+ cut_set.to_file(output_dir / cuts_filename)
190
+
191
+
192
+ def compute_fbank_split(params):
193
+ if params.split_end < params.split_begin:
194
+ logging.warning(
195
+ f"Split begin should be smaller than split end, given "
196
+ f"{params.split_begin} -> {params.split_end}."
197
+ )
198
+
199
+ with Pool(max_workers=params.num_jobs) as pool:
200
+ futures = [
201
+ pool.submit(compute_fbank_split_single, params, i)
202
+ for i in range(params.split_begin, params.split_end)
203
+ ]
204
+ for f in futures:
205
+ f.result()
206
+ f.done()
207
+
208
+
209
+ def compute_fbank(params):
210
+ logging.info(
211
+ f"Computing features for {params.dataset} dataset {params.subset} subset"
212
+ )
213
+ src_dir = Path(params.source_dir)
214
+ output_dir = Path(params.dest_dir)
215
+ num_jobs = params.num_jobs
216
+ if not output_dir.exists():
217
+ output_dir.mkdir(parents=True, exist_ok=True)
218
+
219
+ prefix = params.dataset
220
+ subset = params.subset
221
+ suffix = "jsonl.gz"
222
+
223
+ cut_set_name = f"{prefix}_cuts_{subset}.{suffix}"
224
+
225
+ if (src_dir / cut_set_name).is_file():
226
+ logging.info(f"Loading manifests {src_dir / cut_set_name}")
227
+ cut_set = load_manifest_lazy(src_dir / cut_set_name)
228
+ else:
229
+ recordings = load_manifest_lazy(
230
+ src_dir / f"{prefix}_recordings_{subset}.{suffix}"
231
+ )
232
+ supervisions = load_manifest_lazy(
233
+ src_dir / f"{prefix}_supervisions_{subset}.{suffix}"
234
+ )
235
+ cut_set = CutSet.from_manifests(
236
+ recordings=recordings,
237
+ supervisions=supervisions,
238
+ )
239
+
240
+ cut_set = cut_set.resample(params.sampling_rate)
241
+ if params.type == "vocos":
242
+ extractor = VocosFbank()
243
+ else:
244
+ raise NotImplementedError(f"{params.type} is not supported")
245
+
246
+ cuts_filename = f"{prefix}_cuts_{subset}.{suffix}"
247
+ if (output_dir / cuts_filename).is_file():
248
+ logging.info(f"{prefix} {subset} already exists - skipping.")
249
+ return
250
+ logging.info(f"Processing {subset} of {prefix}")
251
+
252
+ cut_set = cut_set.compute_and_store_features(
253
+ extractor=extractor,
254
+ storage_path=f"{output_dir}/{prefix}_feats_{subset}",
255
+ num_jobs=num_jobs,
256
+ storage_type=LilcomChunkyWriter,
257
+ )
258
+ logging.info(f"Saving file to {output_dir / cuts_filename}")
259
+ cut_set.to_file(output_dir / cuts_filename)
260
+
261
+
262
+ if __name__ == "__main__":
263
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
264
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
265
+
266
+ args = get_args()
267
+ logging.info(vars(args))
268
+ if args.split_cuts:
269
+ compute_fbank_split(params=args)
270
+ else:
271
+ compute_fbank(params=args)
272
+ logging.info("Done!")
zipvoice/bin/generate_averaged_model.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright 2021-2022 Xiaomi Corporation
4
+ #
5
+ # See ../../../../LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ Usage:
20
+ This script loads checkpoints and averages them.
21
+
22
+ python3 -m zipvoice.bin.generate_averaged_model \
23
+ --epoch 11 \
24
+ --avg 4 \
25
+ --model-name zipvoice \
26
+ --exp-dir exp/zipvoice
27
+
28
+ It will generate a file `epoch-11-avg-4.pt` in the given `exp_dir`.
29
+ You can later load it by `torch.load("epoch-11-avg-4.pt")`.
30
+ """
31
+
32
+ import argparse
33
+ import json
34
+ import logging
35
+ from pathlib import Path
36
+
37
+ import torch
38
+
39
+ from zipvoice.models.zipvoice import ZipVoice
40
+ from zipvoice.models.zipvoice_dialog import ZipVoiceDialog, ZipVoiceDialogStereo
41
+ from zipvoice.models.zipvoice_distill import ZipVoiceDistill
42
+ from zipvoice.tokenizer.tokenizer import SimpleTokenizer
43
+ from zipvoice.utils.checkpoint import (
44
+ average_checkpoints_with_averaged_model,
45
+ find_checkpoints,
46
+ )
47
+ from zipvoice.utils.common import AttributeDict
48
+
49
+
50
+ def get_parser():
51
+ parser = argparse.ArgumentParser(
52
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
53
+ )
54
+
55
+ parser.add_argument(
56
+ "--epoch",
57
+ type=int,
58
+ default=11,
59
+ help="""It specifies the checkpoint to use for decoding.
60
+ Note: Epoch counts from 1.
61
+ You can specify --avg to use more checkpoints for model averaging.""",
62
+ )
63
+
64
+ parser.add_argument(
65
+ "--iter",
66
+ type=int,
67
+ default=0,
68
+ help="""If positive, --epoch is ignored and it
69
+ will use the checkpoint exp_dir/checkpoint-iter.pt.
70
+ You can specify --avg to use more checkpoints for model averaging.
71
+ """,
72
+ )
73
+
74
+ parser.add_argument(
75
+ "--avg",
76
+ type=int,
77
+ default=4,
78
+ help="Number of checkpoints to average. Automatically select "
79
+ "consecutive checkpoints before the checkpoint specified by "
80
+ "'--epoch' or --iter",
81
+ )
82
+
83
+ parser.add_argument(
84
+ "--exp-dir",
85
+ type=str,
86
+ default="exp/zipvoice",
87
+ help="The experiment dir",
88
+ )
89
+
90
+ parser.add_argument(
91
+ "--model-name",
92
+ type=str,
93
+ default="zipvoice",
94
+ choices=[
95
+ "zipvoice",
96
+ "zipvoice_distill",
97
+ "zipvoice_dialog",
98
+ "zipvoice_dialog_stereo",
99
+ ],
100
+ help="The model type to be averaged. ",
101
+ )
102
+
103
+ return parser
104
+
105
+
106
+ @torch.no_grad()
107
+ def main():
108
+ parser = get_parser()
109
+ args = parser.parse_args()
110
+ params = AttributeDict()
111
+ params.update(vars(args))
112
+ params.exp_dir = Path(params.exp_dir)
113
+
114
+ with open(params.exp_dir / "model.json", "r") as f:
115
+ model_config = json.load(f)
116
+
117
+ # Any tokenizer can be used here.
118
+ # Use SimpleTokenizer for simplicity.
119
+ tokenizer = SimpleTokenizer(token_file=params.exp_dir / "tokens.txt")
120
+ if params.model_name in ["zipvoice", "zipvoice_distill"]:
121
+ tokenizer_config = {
122
+ "vocab_size": tokenizer.vocab_size,
123
+ "pad_id": tokenizer.pad_id,
124
+ }
125
+ elif params.model_name in ["zipvoice_dialog", "zipvoice_dialog_stereo"]:
126
+ tokenizer_config = {
127
+ "vocab_size": tokenizer.vocab_size,
128
+ "pad_id": tokenizer.pad_id,
129
+ "spk_a_id": tokenizer.spk_a_id,
130
+ "spk_b_id": tokenizer.spk_b_id,
131
+ }
132
+
133
+ params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
134
+
135
+ logging.info("Script started")
136
+
137
+ params.device = torch.device("cpu")
138
+ logging.info(f"Device: {params.device}")
139
+
140
+ logging.info("About to create model")
141
+ if params.model_name == "zipvoice":
142
+ model = ZipVoice(
143
+ **model_config["model"],
144
+ **tokenizer_config,
145
+ )
146
+ elif params.model_name == "zipvoice_distill":
147
+ model = ZipVoiceDistill(
148
+ **model_config["model"],
149
+ **tokenizer_config,
150
+ )
151
+ elif params.model_name == "zipvoice_dialog":
152
+ model = ZipVoiceDialog(
153
+ **model_config["model"],
154
+ **tokenizer_config,
155
+ )
156
+ elif params.model_name == "zipvoice_dialog_stereo":
157
+ model = ZipVoiceDialogStereo(
158
+ **model_config["model"],
159
+ **tokenizer_config,
160
+ )
161
+ else:
162
+ raise ValueError(f"Unknown model name: {params.model_name}")
163
+
164
+ if params.iter > 0:
165
+ filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
166
+ : params.avg + 1
167
+ ]
168
+ if len(filenames) == 0:
169
+ raise ValueError(
170
+ f"No checkpoints found for" f" --iter {params.iter}, --avg {params.avg}"
171
+ )
172
+ elif len(filenames) < params.avg + 1:
173
+ raise ValueError(
174
+ f"Not enough checkpoints ({len(filenames)}) found for"
175
+ f" --iter {params.iter}, --avg {params.avg}"
176
+ )
177
+ filename_start = filenames[-1]
178
+ filename_end = filenames[0]
179
+ logging.info(
180
+ "Calculating the averaged model over iteration checkpoints"
181
+ f" from {filename_start} (excluded) to {filename_end}"
182
+ )
183
+ model.to(params.device)
184
+ model.load_state_dict(
185
+ average_checkpoints_with_averaged_model(
186
+ filename_start=filename_start,
187
+ filename_end=filename_end,
188
+ device=params.device,
189
+ ),
190
+ strict=True,
191
+ )
192
+ else:
193
+ assert params.avg > 0, params.avg
194
+ start = params.epoch - params.avg
195
+ assert start >= 1, start
196
+ filename_start = f"{params.exp_dir}/epoch-{start}.pt"
197
+ filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
198
+ logging.info(
199
+ f"Calculating the averaged model over epoch range from "
200
+ f"{start} (excluded) to {params.epoch}"
201
+ )
202
+ model.to(params.device)
203
+ model.load_state_dict(
204
+ average_checkpoints_with_averaged_model(
205
+ filename_start=filename_start,
206
+ filename_end=filename_end,
207
+ device=params.device,
208
+ ),
209
+ strict=True,
210
+ )
211
+ if params.iter > 0:
212
+ filename = params.exp_dir / f"iter-{params.iter}-avg-{params.avg}.pt"
213
+ else:
214
+ filename = params.exp_dir / f"epoch-{params.epoch}-avg-{params.avg}.pt"
215
+
216
+ logging.info(f"Saving the averaged checkpoint to {filename}")
217
+ torch.save({"model": model.state_dict()}, filename)
218
+
219
+ num_param = sum([p.numel() for p in model.parameters()])
220
+ logging.info(f"Number of model parameters: {num_param}")
221
+
222
+ logging.info("Done!")
223
+
224
+
225
+ if __name__ == "__main__":
226
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
227
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
228
+
229
+ main()
zipvoice/bin/infer_zipvoice.py ADDED
@@ -0,0 +1,614 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2025 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ This script generates speech with our pre-trained ZipVoice or
20
+ ZipVoice-Distill models. If no local model is specified,
21
+ required files will be automatically downloaded from HuggingFace.
22
+
23
+ Usage:
24
+
25
+ Note: If you are having trouble connecting to HuggingFace,
26
+ try switching endpoint to mirror site:
27
+ export HF_ENDPOINT=https://hf-mirror.com
28
+
29
+ (1) Inference of a single sentence:
30
+
31
+ python3 -m zipvoice.bin.infer_zipvoice \
32
+ --model-name zipvoice \
33
+ --prompt-wav prompt.wav \
34
+ --prompt-text "I am a prompt." \
35
+ --text "I am a sentence." \
36
+ --res-wav-path result.wav
37
+
38
+ (2) Inference of a list of sentences:
39
+
40
+ python3 -m zipvoice.bin.infer_zipvoice \
41
+ --model-name zipvoice \
42
+ --test-list test.tsv \
43
+ --res-dir results
44
+
45
+ `--model-name` can be `zipvoice` or `zipvoice_distill`,
46
+ which are the models before and after distillation, respectively.
47
+
48
+ Each line of `test.tsv` is in the format of
49
+ `{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}`.
50
+ """
51
+
52
+ import argparse
53
+ import datetime as dt
54
+ import json
55
+ import logging
56
+ import os
57
+ from pathlib import Path
58
+ from typing import Optional
59
+
60
+ import numpy as np
61
+ import safetensors.torch
62
+ import torch
63
+ import torchaudio
64
+ from huggingface_hub import hf_hub_download
65
+ from lhotse.utils import fix_random_seed
66
+ from vocos import Vocos
67
+
68
+ from zipvoice.models.zipvoice import ZipVoice
69
+ from zipvoice.models.zipvoice_distill import ZipVoiceDistill
70
+ from zipvoice.tokenizer.tokenizer import (
71
+ EmiliaTokenizer,
72
+ EspeakTokenizer,
73
+ LibriTTSTokenizer,
74
+ SimpleTokenizer,
75
+ )
76
+ from zipvoice.utils.checkpoint import load_checkpoint
77
+ from zipvoice.utils.common import AttributeDict
78
+ from zipvoice.utils.feature import VocosFbank
79
+
80
+ HUGGINGFACE_REPO = "k2-fsa/ZipVoice"
81
+ MODEL_DIR = {
82
+ "zipvoice": "zipvoice",
83
+ "zipvoice_distill": "zipvoice_distill",
84
+ }
85
+
86
+
87
+ def get_parser():
88
+ parser = argparse.ArgumentParser(
89
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
90
+ )
91
+
92
+ parser.add_argument(
93
+ "--model-name",
94
+ type=str,
95
+ default="zipvoice",
96
+ choices=["zipvoice", "zipvoice_distill"],
97
+ help="The model used for inference",
98
+ )
99
+
100
+ parser.add_argument(
101
+ "--model-dir",
102
+ type=str,
103
+ default=None,
104
+ help="The model directory that contains model checkpoint, configuration "
105
+ "file model.json, and tokens file tokens.txt. Will download pre-trained "
106
+ "checkpoint from huggingface if not specified.",
107
+ )
108
+
109
+ parser.add_argument(
110
+ "--checkpoint-name",
111
+ type=str,
112
+ default="model.pt",
113
+ help="The name of model checkpoint.",
114
+ )
115
+
116
+ parser.add_argument(
117
+ "--vocoder-path",
118
+ type=str,
119
+ default=None,
120
+ help="The vocoder checkpoint. "
121
+ "Will download pre-trained vocoder from huggingface if not specified.",
122
+ )
123
+
124
+ parser.add_argument(
125
+ "--tokenizer",
126
+ type=str,
127
+ default="emilia",
128
+ choices=["emilia", "libritts", "espeak", "simple"],
129
+ help="Tokenizer type.",
130
+ )
131
+
132
+ parser.add_argument(
133
+ "--lang",
134
+ type=str,
135
+ default="en-us",
136
+ help="Language identifier, used when tokenizer type is espeak. see"
137
+ "https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md",
138
+ )
139
+
140
+ parser.add_argument(
141
+ "--test-list",
142
+ type=str,
143
+ default=None,
144
+ help="The list of prompt speech, prompt_transcription, "
145
+ "and text to synthesizein the format of "
146
+ "'{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}'.",
147
+ )
148
+
149
+ parser.add_argument(
150
+ "--prompt-wav",
151
+ type=str,
152
+ default=None,
153
+ help="The prompt wav to mimic",
154
+ )
155
+
156
+ parser.add_argument(
157
+ "--prompt-text",
158
+ type=str,
159
+ default=None,
160
+ help="The transcription of the prompt wav",
161
+ )
162
+
163
+ parser.add_argument(
164
+ "--text",
165
+ type=str,
166
+ default=None,
167
+ help="The text to synthesize",
168
+ )
169
+
170
+ parser.add_argument(
171
+ "--res-dir",
172
+ type=str,
173
+ default="results",
174
+ help="""
175
+ Path name of the generated wavs dir,
176
+ used when test-list is not None
177
+ """,
178
+ )
179
+
180
+ parser.add_argument(
181
+ "--res-wav-path",
182
+ type=str,
183
+ default="result.wav",
184
+ help="""
185
+ Path name of the generated wav path,
186
+ used when test-list is None
187
+ """,
188
+ )
189
+
190
+ parser.add_argument(
191
+ "--guidance-scale",
192
+ type=float,
193
+ default=None,
194
+ help="The scale of classifier-free guidance during inference.",
195
+ )
196
+
197
+ parser.add_argument(
198
+ "--num-step",
199
+ type=int,
200
+ default=None,
201
+ help="The number of sampling steps.",
202
+ )
203
+
204
+ parser.add_argument(
205
+ "--feat-scale",
206
+ type=float,
207
+ default=0.1,
208
+ help="The scale factor of fbank feature",
209
+ )
210
+
211
+ parser.add_argument(
212
+ "--speed",
213
+ type=float,
214
+ default=1.0,
215
+ help="Control speech speed, 1.0 means normal, >1.0 means speed up",
216
+ )
217
+
218
+ parser.add_argument(
219
+ "--t-shift",
220
+ type=float,
221
+ default=0.5,
222
+ help="Shift t to smaller ones if t_shift < 1.0",
223
+ )
224
+
225
+ parser.add_argument(
226
+ "--target-rms",
227
+ type=float,
228
+ default=0.1,
229
+ help="Target speech normalization rms value, set to 0 to disable normalization",
230
+ )
231
+
232
+ parser.add_argument(
233
+ "--seed",
234
+ type=int,
235
+ default=666,
236
+ help="Random seed",
237
+ )
238
+
239
+ return parser
240
+
241
+
242
+ def get_vocoder(vocos_local_path: Optional[str] = None):
243
+ if vocos_local_path:
244
+ vocoder = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
245
+ state_dict = torch.load(
246
+ f"{vocos_local_path}/pytorch_model.bin",
247
+ weights_only=True,
248
+ map_location="cpu",
249
+ )
250
+ vocoder.load_state_dict(state_dict)
251
+ else:
252
+ vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz")
253
+ return vocoder
254
+
255
+
256
+ def generate_sentence(
257
+ save_path: str,
258
+ prompt_text: str,
259
+ prompt_wav: str,
260
+ text: str,
261
+ model: torch.nn.Module,
262
+ vocoder: torch.nn.Module,
263
+ tokenizer: EmiliaTokenizer,
264
+ feature_extractor: VocosFbank,
265
+ device: torch.device,
266
+ num_step: int = 16,
267
+ guidance_scale: float = 1.0,
268
+ speed: float = 1.0,
269
+ t_shift: float = 0.5,
270
+ target_rms: float = 0.1,
271
+ feat_scale: float = 0.1,
272
+ sampling_rate: int = 24000,
273
+ ):
274
+ """
275
+ Generate waveform of a text based on a given prompt
276
+ waveform and its transcription.
277
+
278
+ Args:
279
+ save_path (str): Path to save the generated wav.
280
+ prompt_text (str): Transcription of the prompt wav.
281
+ prompt_wav (str): Path to the prompt wav file.
282
+ text (str): Text to be synthesized into a waveform.
283
+ model (torch.nn.Module): The model used for generation.
284
+ vocoder (torch.nn.Module): The vocoder used to convert features to waveforms.
285
+ tokenizer (EmiliaTokenizer): The tokenizer used to convert text to tokens.
286
+ feature_extractor (VocosFbank): The feature extractor used to
287
+ extract acoustic features.
288
+ device (torch.device): The device on which computations are performed.
289
+ num_step (int, optional): Number of steps for decoding. Defaults to 16.
290
+ guidance_scale (float, optional): Scale for classifier-free guidance.
291
+ Defaults to 1.0.
292
+ speed (float, optional): Speed control. Defaults to 1.0.
293
+ t_shift (float, optional): Time shift. Defaults to 0.5.
294
+ target_rms (float, optional): Target RMS for waveform normalization.
295
+ Defaults to 0.1.
296
+ feat_scale (float, optional): Scale for features.
297
+ Defaults to 0.1.
298
+ sampling_rate (int, optional): Sampling rate for the waveform.
299
+ Defaults to 24000.
300
+ Returns:
301
+ metrics (dict): Dictionary containing time and real-time
302
+ factor metrics for processing.
303
+ """
304
+ # Convert text to tokens
305
+ tokens = tokenizer.texts_to_token_ids([text])
306
+ prompt_tokens = tokenizer.texts_to_token_ids([prompt_text])
307
+
308
+ # Load and preprocess prompt wav
309
+ prompt_wav, prompt_sampling_rate = torchaudio.load(prompt_wav)
310
+
311
+ if prompt_sampling_rate != sampling_rate:
312
+ resampler = torchaudio.transforms.Resample(
313
+ orig_freq=prompt_sampling_rate, new_freq=sampling_rate
314
+ )
315
+ prompt_wav = resampler(prompt_wav)
316
+
317
+ prompt_rms = torch.sqrt(torch.mean(torch.square(prompt_wav)))
318
+ if prompt_rms < target_rms:
319
+ prompt_wav = prompt_wav * target_rms / prompt_rms
320
+
321
+ # Extract features from prompt wav
322
+ prompt_features = feature_extractor.extract(
323
+ prompt_wav, sampling_rate=sampling_rate
324
+ ).to(device)
325
+
326
+ prompt_features = prompt_features.unsqueeze(0) * feat_scale
327
+ prompt_features_lens = torch.tensor([prompt_features.size(1)], device=device)
328
+
329
+ # Start timing
330
+ start_t = dt.datetime.now()
331
+
332
+ # Generate features
333
+ (
334
+ pred_features,
335
+ pred_features_lens,
336
+ pred_prompt_features,
337
+ pred_prompt_features_lens,
338
+ ) = model.sample(
339
+ tokens=tokens,
340
+ prompt_tokens=prompt_tokens,
341
+ prompt_features=prompt_features,
342
+ prompt_features_lens=prompt_features_lens,
343
+ speed=speed,
344
+ t_shift=t_shift,
345
+ duration="predict",
346
+ num_step=num_step,
347
+ guidance_scale=guidance_scale,
348
+ )
349
+
350
+ # Postprocess predicted features
351
+ pred_features = pred_features.permute(0, 2, 1) / feat_scale # (B, C, T)
352
+
353
+ # Start vocoder processing
354
+ start_vocoder_t = dt.datetime.now()
355
+ wav = vocoder.decode(pred_features).squeeze(1).clamp(-1, 1)
356
+
357
+ # Calculate processing times and real-time factors
358
+ t = (dt.datetime.now() - start_t).total_seconds()
359
+ t_no_vocoder = (start_vocoder_t - start_t).total_seconds()
360
+ t_vocoder = (dt.datetime.now() - start_vocoder_t).total_seconds()
361
+ wav_seconds = wav.shape[-1] / sampling_rate
362
+ rtf = t / wav_seconds
363
+ rtf_no_vocoder = t_no_vocoder / wav_seconds
364
+ rtf_vocoder = t_vocoder / wav_seconds
365
+ metrics = {
366
+ "t": t,
367
+ "t_no_vocoder": t_no_vocoder,
368
+ "t_vocoder": t_vocoder,
369
+ "wav_seconds": wav_seconds,
370
+ "rtf": rtf,
371
+ "rtf_no_vocoder": rtf_no_vocoder,
372
+ "rtf_vocoder": rtf_vocoder,
373
+ }
374
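+ # For reference: rtf < 1.0 means synthesis runs faster than real time,
+ # e.g. 2 s of compute for 10 s of audio gives rtf = 2 / 10 = 0.2.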
+
375
+ # Adjust wav volume if necessary
376
+ if prompt_rms < target_rms:
377
+ wav = wav * prompt_rms / target_rms
378
+ torchaudio.save(save_path, wav.cpu(), sample_rate=sampling_rate)
379
+
380
+ return metrics
381
+
382
+
383
+ def generate_list(
384
+ res_dir: str,
385
+ test_list: str,
386
+ model: torch.nn.Module,
387
+ vocoder: torch.nn.Module,
388
+ tokenizer: EmiliaTokenizer,
389
+ feature_extractor: VocosFbank,
390
+ device: torch.device,
391
+ num_step: int = 16,
392
+ guidance_scale: float = 1.0,
393
+ speed: float = 1.0,
394
+ t_shift: float = 0.5,
395
+ target_rms: float = 0.1,
396
+ feat_scale: float = 0.1,
397
+ sampling_rate: int = 24000,
398
+ ):
399
+ total_t = []
400
+ total_t_no_vocoder = []
401
+ total_t_vocoder = []
402
+ total_wav_seconds = []
403
+
404
+ with open(test_list, "r") as fr:
405
+ lines = fr.readlines()
406
+
407
+ for i, line in enumerate(lines):
408
+ wav_name, prompt_text, prompt_wav, text = line.strip().split("\t")
409
+ save_path = f"{res_dir}/{wav_name}.wav"
410
+ metrics = generate_sentence(
411
+ save_path=save_path,
412
+ prompt_text=prompt_text,
413
+ prompt_wav=prompt_wav,
414
+ text=text,
415
+ model=model,
416
+ vocoder=vocoder,
417
+ tokenizer=tokenizer,
418
+ feature_extractor=feature_extractor,
419
+ device=device,
420
+ num_step=num_step,
421
+ guidance_scale=guidance_scale,
422
+ speed=speed,
423
+ t_shift=t_shift,
424
+ target_rms=target_rms,
425
+ feat_scale=feat_scale,
426
+ sampling_rate=sampling_rate,
427
+ )
428
+ logging.info(f"[Sentence: {i}] RTF: {metrics['rtf']:.4f}")
429
+ total_t.append(metrics["t"])
430
+ total_t_no_vocoder.append(metrics["t_no_vocoder"])
431
+ total_t_vocoder.append(metrics["t_vocoder"])
432
+ total_wav_seconds.append(metrics["wav_seconds"])
433
+
434
+ logging.info(f"Average RTF: {np.sum(total_t) / np.sum(total_wav_seconds):.4f}")
435
+ logging.info(
436
+ f"Average RTF w/o vocoder: "
437
+ f"{np.sum(total_t_no_vocoder) / np.sum(total_wav_seconds):.4f}"
438
+ )
439
+ logging.info(
440
+ f"Average RTF vocoder: "
441
+ f"{np.sum(total_t_vocoder) / np.sum(total_wav_seconds):.4f}"
442
+ )
443
+
444
+
445
+ @torch.inference_mode()
446
+ def main():
447
+ parser = get_parser()
448
+ args = parser.parse_args()
449
+
450
+ params = AttributeDict()
451
+ params.update(vars(args))
452
+ fix_random_seed(params.seed)
453
+
454
+ model_defaults = {
455
+ "zipvoice": {
456
+ "num_step": 16,
457
+ "guidance_scale": 1.0,
458
+ },
459
+ "zipvoice_distill": {
460
+ "num_step": 8,
461
+ "guidance_scale": 3.0,
462
+ },
463
+ }
464
+
465
+ model_specific_defaults = model_defaults.get(params.model_name, {})
466
+
467
+ for param, value in model_specific_defaults.items():
468
+ if getattr(params, param) is None:
469
+ setattr(params, param, value)
470
+ logging.info(f"Setting {param} to default value: {value}")
471
+
472
+ assert (params.test_list is not None) ^ (
473
+ bool(params.prompt_wav and params.prompt_text and params.text)
474
+ ), (
475
+ "For inference, please provide prompts and text with either '--test-list'"
476
+ " or '--prompt-wav, --prompt-text and --text'."
477
+ )
478
+
479
+ if params.model_dir is not None:
480
+ params.model_dir = Path(params.model_dir)
481
+ if not params.model_dir.is_dir():
482
+ raise FileNotFoundError(f"{params.model_dir} does not exist")
483
+ for filename in [params.checkpoint_name, "model.json", "tokens.txt"]:
484
+ if not (params.model_dir / filename).is_file():
485
+ raise FileNotFoundError(f"{params.model_dir / filename} does not exist")
486
+ model_ckpt = params.model_dir / params.checkpoint_name
487
+ model_config = params.model_dir / "model.json"
488
+ token_file = params.model_dir / "tokens.txt"
489
+ logging.info(
490
+ f"Using local model dir {params.model_dir}, "
491
+ f"checkpoint {params.checkpoint_name}"
492
+ )
493
+ else:
494
+ logging.info("Using pretrained model from the huggingface")
495
+ logging.info("Downloading the requires files from HuggingFace")
496
+ model_ckpt = hf_hub_download(
497
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/model.pt"
498
+ )
499
+ model_config = hf_hub_download(
500
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/model.json"
501
+ )
502
+
503
+ token_file = hf_hub_download(
504
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/tokens.txt"
505
+ )
506
+
507
+ logging.info("Loading model...")
508
+
509
+ if params.tokenizer == "emilia":
510
+ tokenizer = EmiliaTokenizer(token_file=token_file)
511
+ elif params.tokenizer == "libritts":
512
+ tokenizer = LibriTTSTokenizer(token_file=token_file)
513
+ elif params.tokenizer == "espeak":
514
+ tokenizer = EspeakTokenizer(token_file=token_file, lang=params.lang)
515
+ else:
516
+ assert params.tokenizer == "simple"
517
+ tokenizer = SimpleTokenizer(token_file=token_file)
518
+
519
+ tokenizer_config = {"vocab_size": tokenizer.vocab_size, "pad_id": tokenizer.pad_id}
520
+
521
+ with open(model_config, "r") as f:
522
+ model_config = json.load(f)
523
+
524
+ if params.model_name == "zipvoice":
525
+ model = ZipVoice(
526
+ **model_config["model"],
527
+ **tokenizer_config,
528
+ )
529
+ else:
530
+ assert params.model_name == "zipvoice_distill"
531
+ model = ZipVoiceDistill(
532
+ **model_config["model"],
533
+ **tokenizer_config,
534
+ )
535
+
536
+ if str(model_ckpt).endswith(".safetensors"):
537
+ safetensors.torch.load_model(model, model_ckpt)
538
+ elif str(model_ckpt).endswith(".pt"):
539
+ load_checkpoint(filename=model_ckpt, model=model, strict=True)
540
+ else:
541
+ raise NotImplementedError(f"Unsupported model checkpoint format: {model_ckpt}")
542
+
543
+ if torch.cuda.is_available():
544
+ params.device = torch.device("cuda", 0)
545
+ elif torch.backends.mps.is_available():
546
+ params.device = torch.device("mps")
547
+ else:
548
+ params.device = torch.device("cpu")
549
+ logging.info(f"Device: {params.device}")
550
+
551
+ model = model.to(params.device)
552
+ model.eval()
553
+
554
+ vocoder = get_vocoder(params.vocoder_path)
555
+ vocoder = vocoder.to(params.device)
556
+ vocoder.eval()
557
+
558
+ if model_config["feature"]["type"] == "vocos":
559
+ feature_extractor = VocosFbank()
560
+ else:
561
+ raise NotImplementedError(
562
+ f"Unsupported feature type: {model_config['feature']['type']}"
563
+ )
564
+ params.sampling_rate = model_config["feature"]["sampling_rate"]
565
+
566
+ logging.info("Start generating...")
567
+ if params.test_list:
568
+ os.makedirs(params.res_dir, exist_ok=True)
569
+ generate_list(
570
+ res_dir=params.res_dir,
571
+ test_list=params.test_list,
572
+ model=model,
573
+ vocoder=vocoder,
574
+ tokenizer=tokenizer,
575
+ feature_extractor=feature_extractor,
576
+ device=params.device,
577
+ num_step=params.num_step,
578
+ guidance_scale=params.guidance_scale,
579
+ speed=params.speed,
580
+ t_shift=params.t_shift,
581
+ target_rms=params.target_rms,
582
+ feat_scale=params.feat_scale,
583
+ sampling_rate=params.sampling_rate,
584
+ )
585
+ else:
586
+ generate_sentence(
587
+ save_path=params.res_wav_path,
588
+ prompt_text=params.prompt_text,
589
+ prompt_wav=params.prompt_wav,
590
+ text=params.text,
591
+ model=model,
592
+ vocoder=vocoder,
593
+ tokenizer=tokenizer,
594
+ feature_extractor=feature_extractor,
595
+ device=params.device,
596
+ num_step=params.num_step,
597
+ guidance_scale=params.guidance_scale,
598
+ speed=params.speed,
599
+ t_shift=params.t_shift,
600
+ target_rms=params.target_rms,
601
+ feat_scale=params.feat_scale,
602
+ sampling_rate=params.sampling_rate,
603
+ )
604
+ logging.info("Done")
605
+
606
+
607
+ if __name__ == "__main__":
608
+ torch.set_num_threads(1)
609
+ torch.set_num_interop_threads(1)
610
+
611
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
612
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
613
+
614
+ main()
zipvoice/bin/infer_zipvoice_dialog.py ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2025 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ This script generates speech with our pre-trained ZipVoice-Dialog or
20
+ ZipVoice-Dialog-Stereo models. If no local model is specified,
21
+ required files will be automatically downloaded from HuggingFace.
22
+
23
+ Usage:
24
+
25
+ Note: If you are having trouble connecting to HuggingFace,
26
+ try switching endpoint to mirror site:
27
+ export HF_ENDPOINT=https://hf-mirror.com
28
+
29
+ python3 -m zipvoice.bin.infer_zipvoice_dialog \
30
+ --model-name zipvoice_dialog \
31
+ --test-list test.tsv \
32
+ --res-dir results
33
+
34
+ `--model-name` can be `zipvoice_dialog` or `zipvoice_dialog_stereo`,
35
+ which generate mono and stereo dialogues, respectively.
36
+
37
+ Each line of `test.tsv` is in the format of a merged conversation:
38
+ '{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}'
39
+ or a split conversation:
40
+ '{wav_name}\t{spk1_prompt_transcription}\t{spk2_prompt_transcription}
41
+ \t{spk1_prompt_wav}\t{spk2_prompt_wav}\t{text}'
42
+ """
43
+
44
+ import argparse
45
+ import datetime as dt
46
+ import json
47
+ import logging
48
+ import os
49
+ from pathlib import Path
50
+ from typing import List, Optional, Union
51
+
52
+ import numpy as np
53
+ import safetensors.torch
54
+ import torch
55
+ import torchaudio
56
+ from huggingface_hub import hf_hub_download
57
+ from lhotse.utils import fix_random_seed
58
+ from vocos import Vocos
59
+
60
+ from zipvoice.models.zipvoice_dialog import ZipVoiceDialog, ZipVoiceDialogStereo
61
+ from zipvoice.tokenizer.tokenizer import DialogTokenizer
62
+ from zipvoice.utils.checkpoint import load_checkpoint
63
+ from zipvoice.utils.common import AttributeDict
64
+ from zipvoice.utils.feature import VocosFbank
65
+
66
+ HUGGINGFACE_REPO = "k2-fsa/ZipVoice"
67
+ MODEL_DIR = {
68
+ "zipvoice_dialog": "zipvoice_dialog",
69
+ "zipvoice_dialog_stereo": "zipvoice_dialog_stereo",
70
+ }
71
+
72
+
73
+ def get_parser():
74
+ parser = argparse.ArgumentParser(
75
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
76
+ )
77
+
78
+ parser.add_argument(
79
+ "--model-name",
80
+ type=str,
81
+ default="zipvoice_dialog",
82
+ choices=["zipvoice_dialog", "zipvoice_dialog_stereo"],
83
+ help="The model used for inference",
84
+ )
85
+
86
+ parser.add_argument(
87
+ "--model-dir",
88
+ type=str,
89
+ default=None,
90
+ help="The model directory that contains model checkpoint, configuration "
91
+ "file model.json, and tokens file tokens.txt. Will download pre-trained "
92
+ "checkpoint from huggingface if not specified.",
93
+ )
94
+
95
+ parser.add_argument(
96
+ "--checkpoint-name",
97
+ type=str,
98
+ default="model.pt",
99
+ help="The name of model checkpoint.",
100
+ )
101
+
102
+ parser.add_argument(
103
+ "--vocoder-path",
104
+ type=str,
105
+ default=None,
106
+ help="The vocoder checkpoint. "
107
+ "Will download pre-trained vocoder from huggingface if not specified.",
108
+ )
109
+
110
+ parser.add_argument(
111
+ "--test-list",
112
+ type=str,
113
+ default=None,
114
+ help="The list of prompt speech, prompt_transcription, "
115
+ "and text to synthesizein the format of merged conversation: "
116
+ "'{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}' "
117
+ "or splited conversation: "
118
+ "'{wav_name}\t{spk1_prompt_transcription}\t{spk2_prompt_transcription}"
119
+ "\t{spk1_prompt_wav}\t{spk2_prompt_wav}\t{text}'.",
120
+ )
121
+
122
+ parser.add_argument(
123
+ "--res-dir",
124
+ type=str,
125
+ default="results",
126
+ help="""
127
+ Path name of the generated wavs dir,
128
+ used when test-list is not None
129
+ """,
130
+ )
131
+
132
+ parser.add_argument(
133
+ "--guidance-scale",
134
+ type=float,
135
+ default=1.5,
136
+ help="The scale of classifier-free guidance during inference.",
137
+ )
138
+
139
+ parser.add_argument(
140
+ "--num-step",
141
+ type=int,
142
+ default=16,
143
+ help="The number of sampling steps.",
144
+ )
145
+
146
+ parser.add_argument(
147
+ "--feat-scale",
148
+ type=float,
149
+ default=0.1,
150
+ help="The scale factor of fbank feature",
151
+ )
152
+
153
+ parser.add_argument(
154
+ "--speed",
155
+ type=float,
156
+ default=1.0,
157
+ help="Control speech speed, 1.0 means normal, >1.0 means speed up",
158
+ )
159
+
160
+ parser.add_argument(
161
+ "--t-shift",
162
+ type=float,
163
+ default=0.5,
164
+ help="Shift t to smaller ones if t_shift < 1.0",
165
+ )
166
+
167
+ parser.add_argument(
168
+ "--target-rms",
169
+ type=float,
170
+ default=0.1,
171
+ help="Target speech normalization rms value, set to 0 to disable normalization",
172
+ )
173
+
174
+ parser.add_argument(
175
+ "--seed",
176
+ type=int,
177
+ default=666,
178
+ help="Random seed",
179
+ )
180
+
181
+ parser.add_argument(
182
+ "--silence-wav",
183
+ type=str,
184
+ default="assets/silence.wav",
185
+ help="Path of the silence wav file, used in two-channel generation "
186
+ "with single-channel prompts",
187
+ )
188
+
189
+ return parser
190
+
191
+
192
+ def get_vocoder(vocos_local_path: Optional[str] = None):
193
+ if vocos_local_path:
194
+ vocoder = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
195
+ state_dict = torch.load(
196
+ f"{vocos_local_path}/pytorch_model.bin",
197
+ weights_only=True,
198
+ map_location="cpu",
199
+ )
200
+ vocoder.load_state_dict(state_dict)
201
+ else:
202
+ vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz")
203
+ return vocoder
204
+
205
+
206
+ def generate_sentence(
207
+ save_path: str,
208
+ prompt_text: str,
209
+ prompt_wav: Union[str, List[str]],
210
+ text: str,
211
+ model: torch.nn.Module,
212
+ vocoder: torch.nn.Module,
213
+ tokenizer: DialogTokenizer,
214
+ feature_extractor: VocosFbank,
215
+ device: torch.device,
216
+ num_step: int = 16,
217
+ guidance_scale: float = 1.0,
218
+ speed: float = 1.0,
219
+ t_shift: float = 0.5,
220
+ target_rms: float = 0.1,
221
+ feat_scale: float = 0.1,
222
+ sampling_rate: int = 24000,
223
+ ):
224
+ """
225
+ Generate waveform of a text based on a given prompt
226
+ waveform and its transcription.
227
+
228
+ Args:
229
+ save_path (str): Path to save the generated wav.
230
+ prompt_text (str): Transcription of the prompt wav.
231
+ prompt_wav (Union[str, List[str]]): Path to the prompt wav; can be
232
+ one or two wav files, corresponding to a merged conversational
233
+ recording or two separate speakers' recordings.
234
+ text (str): Text to be synthesized into a waveform.
235
+ model (torch.nn.Module): The model used for generation.
236
+ vocoder (torch.nn.Module): The vocoder used to convert features to waveforms.
237
+ tokenizer (DialogTokenizer): The tokenizer used to convert text to tokens.
238
+ feature_extractor (VocosFbank): The feature extractor used to
239
+ extract acoustic features.
240
+ device (torch.device): The device on which computations are performed.
241
+ num_step (int, optional): Number of steps for decoding. Defaults to 16.
242
+ guidance_scale (float, optional): Scale for classifier-free guidance.
243
+ Defaults to 1.0.
244
+ speed (float, optional): Speed control. Defaults to 1.0.
245
+ t_shift (float, optional): Time shift. Defaults to 0.5.
246
+ target_rms (float, optional): Target RMS for waveform normalization.
247
+ Defaults to 0.1.
248
+ feat_scale (float, optional): Scale for features.
249
+ Defaults to 0.1.
250
+ sampling_rate (int, optional): Sampling rate for the waveform.
251
+ Defaults to 24000.
252
+ Returns:
253
+ metrics (dict): Dictionary containing time and real-time
254
+ factor metrics for processing.
255
+ """
256
+ # Convert text to tokens
257
+ tokens = tokenizer.texts_to_token_ids([text])
258
+ prompt_tokens = tokenizer.texts_to_token_ids([prompt_text])
259
+
260
+ # Load and preprocess prompt wav
261
+ if isinstance(prompt_wav, str):
262
+ prompt_wav = [
263
+ prompt_wav,
264
+ ]
265
+ else:
266
+ assert len(prompt_wav) == 2 and isinstance(prompt_wav[0], str)
267
+
268
+ loaded_prompt_wavs = list(prompt_wav)  # copy so the input list is not aliased
269
+ for i in range(len(prompt_wav)):
270
+ loaded_prompt_wavs[i], prompt_sampling_rate = torchaudio.load(prompt_wav[i])
271
+ if prompt_sampling_rate != sampling_rate:
272
+ resampler = torchaudio.transforms.Resample(
273
+ orig_freq=prompt_sampling_rate, new_freq=sampling_rate
274
+ )
275
+ loaded_prompt_wavs[i] = resampler(loaded_prompt_wavs[i])
276
+ if loaded_prompt_wavs[i].size(0) != 1:
277
+ loaded_prompt_wavs[i] = loaded_prompt_wavs[i].mean(0, keepdim=True)
278
+
279
+ if len(loaded_prompt_wavs) == 1:
280
+ prompt_wav = loaded_prompt_wavs[0]
281
+ else:
282
+ prompt_wav = torch.cat(loaded_prompt_wavs, dim=1)
283
+
284
+ prompt_rms = torch.sqrt(torch.mean(torch.square(prompt_wav)))
285
+ if prompt_rms < target_rms:
286
+ prompt_wav = prompt_wav * target_rms / prompt_rms
287
+
288
+ # Extract features from prompt wav
289
+ prompt_features = feature_extractor.extract(
290
+ prompt_wav, sampling_rate=sampling_rate
291
+ ).to(device)
292
+
293
+ prompt_features = prompt_features.unsqueeze(0) * feat_scale
294
+ prompt_features_lens = torch.tensor([prompt_features.size(1)], device=device)
295
+
296
+ # Start timing
297
+ start_t = dt.datetime.now()
298
+
299
+ # Generate features
300
+ (
301
+ pred_features,
302
+ pred_features_lens,
303
+ pred_prompt_features,
304
+ pred_prompt_features_lens,
305
+ ) = model.sample(
306
+ tokens=tokens,
307
+ prompt_tokens=prompt_tokens,
308
+ prompt_features=prompt_features,
309
+ prompt_features_lens=prompt_features_lens,
310
+ speed=speed,
311
+ t_shift=t_shift,
312
+ duration="predict",
313
+ num_step=num_step,
314
+ guidance_scale=guidance_scale,
315
+ )
316
+
317
+ # Postprocess predicted features
318
+ pred_features = pred_features.permute(0, 2, 1) / feat_scale # (B, C, T)
319
+
320
+ # Start vocoder processing
321
+ start_vocoder_t = dt.datetime.now()
322
+ wav = vocoder.decode(pred_features).squeeze(1).clamp(-1, 1)
323
+
324
+ # Calculate processing times and real-time factors
325
+ t = (dt.datetime.now() - start_t).total_seconds()
326
+ t_no_vocoder = (start_vocoder_t - start_t).total_seconds()
327
+ t_vocoder = (dt.datetime.now() - start_vocoder_t).total_seconds()
328
+ wav_seconds = wav.shape[-1] / sampling_rate
329
+ rtf = t / wav_seconds
330
+ rtf_no_vocoder = t_no_vocoder / wav_seconds
331
+ rtf_vocoder = t_vocoder / wav_seconds
332
+ metrics = {
333
+ "t": t,
334
+ "t_no_vocoder": t_no_vocoder,
335
+ "t_vocoder": t_vocoder,
336
+ "wav_seconds": wav_seconds,
337
+ "rtf": rtf,
338
+ "rtf_no_vocoder": rtf_no_vocoder,
339
+ "rtf_vocoder": rtf_vocoder,
340
+ }
341
+
342
+ # Adjust wav volume if necessary
343
+ if prompt_rms < target_rms:
344
+ wav = wav * prompt_rms / target_rms
345
+ torchaudio.save(save_path, wav.cpu(), sample_rate=sampling_rate)
346
+
347
+ return metrics
348
+
349
+
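+ # A minimal usage sketch for generate_sentence (assuming a loaded `model`,
+ # `vocoder`, `tokenizer`, and `feature_extractor`; paths and texts below are
+ # hypothetical):
+ #
+ #     metrics = generate_sentence(
+ #         save_path="out.wav",
+ #         prompt_text="[S1]Hi there.[S2]Hello!",
+ #         prompt_wav=["spk1.wav", "spk2.wav"],
+ #         text="[S1]How are you?[S2]Great, thanks.",
+ #         model=model,
+ #         vocoder=vocoder,
+ #         tokenizer=tokenizer,
+ #         feature_extractor=feature_extractor,
+ #         device=torch.device("cpu"),
+ #     )
+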
350
+ def generate_sentence_stereo(
351
+ save_path: str,
352
+ prompt_text: str,
353
+ prompt_wav: Union[str, List[str]],
354
+ text: str,
355
+ model: torch.nn.Module,
356
+ vocoder: torch.nn.Module,
357
+ tokenizer: DialogTokenizer,
358
+ feature_extractor: VocosFbank,
359
+ device: torch.device,
360
+ num_step: int = 16,
361
+ guidance_scale: float = 1.0,
362
+ speed: float = 1.0,
363
+ t_shift: float = 0.5,
364
+ target_rms: float = 0.1,
365
+ feat_scale: float = 0.1,
366
+ sampling_rate: int = 24000,
367
+ silence_wav: Optional[str] = None,
368
+ ):
369
+ """
370
+ Generate waveform of a text based on a given prompt
371
+ waveform and its transcription.
372
+
373
+ Args:
374
+ save_path (str): Path to save the generated wav.
375
+ prompt_text (str): Transcription of the prompt wav.
376
+ prompt_wav (Union[str, List[str]]): Path to the prompt wav file; can be
377
+ one or two wav files, corresponding to a merged conversational
378
+ recording or two separate speakers' recordings.
379
+ text (str): Text to be synthesized into a waveform.
380
+ model (torch.nn.Module): The model used for generation.
381
+ vocoder (torch.nn.Module): The vocoder used to convert features to waveforms.
382
+ tokenizer (DialogTokenizer): The tokenizer used to convert text to tokens.
383
+ feature_extractor (VocosFbank): The feature extractor used to
384
+ extract acoustic features.
385
+ device (torch.device): The device on which computations are performed.
386
+ num_step (int, optional): Number of steps for decoding. Defaults to 16.
387
+ guidance_scale (float, optional): Scale for classifier-free guidance.
388
+ Defaults to 1.0.
389
+ speed (float, optional): Speed control. Defaults to 1.0.
390
+ t_shift (float, optional): Time shift. Defaults to 0.5.
391
+ target_rms (float, optional): Target RMS for waveform normalization.
392
+ Defaults to 0.1.
393
+ feat_scale (float, optional): Scale for features.
394
+ Defaults to 0.1.
395
+ sampling_rate (int, optional): Sampling rate for the waveform.
396
+ Defaults to 24000.
397
+ silence_wav (str, optional): Path of the silence wav file, used in two-channel
398
+ generation with single-channel prompts.
399
+ Returns:
400
+ metrics (dict): Dictionary containing time and real-time
401
+ factor metrics for processing.
402
+ """
403
+ # Convert text to tokens
404
+ tokens = tokenizer.texts_to_token_ids([text])
405
+ prompt_tokens = tokenizer.texts_to_token_ids([prompt_text])
406
+
407
+ # Load and preprocess prompt wav
408
+ if isinstance(prompt_wav, str):
409
+ prompt_wav = [
410
+ prompt_wav,
411
+ ]
412
+ else:
413
+ assert len(prompt_wav) == 2 and isinstance(prompt_wav[0], str)
414
+
415
+ loaded_prompt_wavs = list(prompt_wav)  # copy so the caller's list is not mutated
416
+ for i in range(len(prompt_wav)):
417
+ loaded_prompt_wavs[i], prompt_sampling_rate = torchaudio.load(prompt_wav[i])
418
+ if prompt_sampling_rate != sampling_rate:
419
+ resampler = torchaudio.transforms.Resample(
420
+ orig_freq=prompt_sampling_rate, new_freq=sampling_rate
421
+ )
422
+ loaded_prompt_wavs[i] = resampler(loaded_prompt_wavs[i])
423
+
424
+ if len(loaded_prompt_wavs) == 1:
425
+ assert (
426
+ loaded_prompt_wavs[0].size(0) == 2
427
+ ), "Merged prompt wav must be stereo for stereo dialogue generation"
428
+ prompt_wav = loaded_prompt_wavs[0]
429
+
430
+ else:
431
+ assert len(loaded_prompt_wavs) == 2
432
+ if loaded_prompt_wavs[0].size(0) == 2:
433
+ prompt_wav = torch.cat(loaded_prompt_wavs, dim=1)
434
+ else:
435
+ assert loaded_prompt_wavs[0].size(0) == 1
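+ # Stitch two mono prompts into one stereo prompt on top of a silence bed:
+ # speaker 1 fills the left channel first, speaker 2 fills the right channel
+ # afterwards, so the two speakers never overlap in time.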
436
+ silence_wav, silence_sampling_rate = torchaudio.load(silence_wav)
437
+ assert silence_sampling_rate == sampling_rate
438
+ prompt_wav = silence_wav[
439
+ :, : loaded_prompt_wavs[0].size(1) + loaded_prompt_wavs[1].size(1)
440
+ ]
441
+ prompt_wav[0, : loaded_prompt_wavs[0].size(1)] = loaded_prompt_wavs[0]
442
+ prompt_wav[1, loaded_prompt_wavs[0].size(1) :] = loaded_prompt_wavs[1]
443
+
444
+ prompt_rms = torch.sqrt(torch.mean(torch.square(prompt_wav)))
445
+ if prompt_rms < target_rms:
446
+ prompt_wav = prompt_wav * target_rms / prompt_rms
447
+
448
+ # Extract features from prompt wav
449
+ prompt_features = feature_extractor.extract(
450
+ prompt_wav, sampling_rate=sampling_rate
451
+ ).to(device)
452
+
453
+ prompt_features = prompt_features.unsqueeze(0) * feat_scale
454
+ prompt_features_lens = torch.tensor([prompt_features.size(1)], device=device)
455
+
456
+ # Start timing
457
+ start_t = dt.datetime.now()
458
+
459
+ # Generate features
460
+ (
461
+ pred_features,
462
+ pred_features_lens,
463
+ pred_prompt_features,
464
+ pred_prompt_features_lens,
465
+ ) = model.sample(
466
+ tokens=tokens,
467
+ prompt_tokens=prompt_tokens,
468
+ prompt_features=prompt_features,
469
+ prompt_features_lens=prompt_features_lens,
470
+ speed=speed,
471
+ t_shift=t_shift,
472
+ duration="predict",
473
+ num_step=num_step,
474
+ guidance_scale=guidance_scale,
475
+ )
476
+
477
+ # Postprocess predicted features
478
+ pred_features = pred_features.permute(0, 2, 1) / feat_scale # (B, C, T)
479
+
480
+ # Start vocoder processing
481
+ start_vocoder_t = dt.datetime.now()
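+ # Stereo features are stacked along the feature axis; decode each half with
+ # the mono vocoder and stack the outputs as the left/right channels.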
482
+ feat_dim = pred_features.size(1) // 2
483
+ wav_left = vocoder.decode(pred_features[:, :feat_dim]).squeeze(1).clamp(-1, 1)
484
+ wav_right = (
485
+ vocoder.decode(pred_features[:, feat_dim : feat_dim * 2])
486
+ .squeeze(1)
487
+ .clamp(-1, 1)
488
+ )
489
+
490
+ wav = torch.cat([wav_left, wav_right], dim=0)
491
+
492
+ # Calculate processing times and real-time factors
493
+ t = (dt.datetime.now() - start_t).total_seconds()
494
+ t_no_vocoder = (start_vocoder_t - start_t).total_seconds()
495
+ t_vocoder = (dt.datetime.now() - start_vocoder_t).total_seconds()
496
+ wav_seconds = wav.shape[-1] / sampling_rate
497
+ rtf = t / wav_seconds
498
+ rtf_no_vocoder = t_no_vocoder / wav_seconds
499
+ rtf_vocoder = t_vocoder / wav_seconds
500
+ metrics = {
501
+ "t": t,
502
+ "t_no_vocoder": t_no_vocoder,
503
+ "t_vocoder": t_vocoder,
504
+ "wav_seconds": wav_seconds,
505
+ "rtf": rtf,
506
+ "rtf_no_vocoder": rtf_no_vocoder,
507
+ "rtf_vocoder": rtf_vocoder,
508
+ }
509
+
510
+ # Adjust wav volume if necessary
511
+ if prompt_rms < target_rms:
512
+ wav = wav * prompt_rms / target_rms
513
+ torchaudio.save(save_path, wav.cpu(), sample_rate=sampling_rate)
514
+
515
+ return metrics
516
+
517
+
518
+ def generate_list(
519
+ model_name: str,
520
+ res_dir: str,
521
+ test_list: str,
522
+ model: torch.nn.Module,
523
+ vocoder: torch.nn.Module,
524
+ tokenizer: DialogTokenizer,
525
+ feature_extractor: VocosFbank,
526
+ device: torch.device,
527
+ num_step: int = 16,
528
+ guidance_scale: float = 1.5,
529
+ speed: float = 1.0,
530
+ t_shift: float = 0.5,
531
+ target_rms: float = 0.1,
532
+ feat_scale: float = 0.1,
533
+ sampling_rate: int = 24000,
534
+ silence_wav: Optional[str] = None,
535
+ ):
536
+ total_t = []
537
+ total_t_no_vocoder = []
538
+ total_t_vocoder = []
539
+ total_wav_seconds = []
540
+
541
+ with open(test_list, "r") as fr:
542
+ lines = fr.readlines()
543
+
544
+ for i, line in enumerate(lines):
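+ # Each line carries either 6 fields (two separate mono prompts):
+ #   {wav_name}\t{prompt_text_1}\t{prompt_text_2}\t{prompt_wav_1}\t{prompt_wav_2}\t{text}
+ # or 4 fields (one merged conversational prompt):
+ #   {wav_name}\t{prompt_text}\t{prompt_wav}\t{text}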
545
+ items = line.strip().split("\t")
546
+ if len(items) == 6:
547
+ (
548
+ wav_name,
549
+ prompt_text_1,
550
+ prompt_text_2,
551
+ prompt_wav_1,
552
+ prompt_wav_2,
553
+ text,
554
+ ) = items
555
+ prompt_text = f"[S1]{prompt_text_1}[S2]{prompt_text_2}"
556
+ prompt_wav = [prompt_wav_1, prompt_wav_2]
557
+ elif len(items) == 4:
558
+ wav_name, prompt_text, prompt_wav, text = items
559
+ else:
560
+ raise ValueError(f"Invalid line: {line}")
561
+ assert text.startswith("[S1]")
562
+
563
+ save_path = f"{res_dir}/{wav_name}.wav"
564
+
565
+ if model_name == "zipvoice_dialog":
566
+
567
+ metrics = generate_sentence(
568
+ save_path=save_path,
569
+ prompt_text=prompt_text,
570
+ prompt_wav=prompt_wav,
571
+ text=text,
572
+ model=model,
573
+ vocoder=vocoder,
574
+ tokenizer=tokenizer,
575
+ feature_extractor=feature_extractor,
576
+ device=device,
577
+ num_step=num_step,
578
+ guidance_scale=guidance_scale,
579
+ speed=speed,
580
+ t_shift=t_shift,
581
+ target_rms=target_rms,
582
+ feat_scale=feat_scale,
583
+ sampling_rate=sampling_rate,
584
+ )
585
+ else:
586
+ assert model_name == "zipvoice_dialog_stereo"
587
+ metrics = generate_sentence_stereo(
588
+ save_path=save_path,
589
+ prompt_text=prompt_text,
590
+ prompt_wav=prompt_wav,
591
+ text=text,
592
+ model=model,
593
+ vocoder=vocoder,
594
+ tokenizer=tokenizer,
595
+ feature_extractor=feature_extractor,
596
+ device=device,
597
+ num_step=num_step,
598
+ guidance_scale=guidance_scale,
599
+ speed=speed,
600
+ t_shift=t_shift,
601
+ target_rms=target_rms,
602
+ feat_scale=feat_scale,
603
+ sampling_rate=sampling_rate,
604
+ silence_wav=silence_wav,
605
+ )
606
+
607
+ logging.info(f"[Sentence: {i}] RTF: {metrics['rtf']:.4f}")
608
+ total_t.append(metrics["t"])
609
+ total_t_no_vocoder.append(metrics["t_no_vocoder"])
610
+ total_t_vocoder.append(metrics["t_vocoder"])
611
+ total_wav_seconds.append(metrics["wav_seconds"])
612
+
613
+ logging.info(f"Average RTF: {np.sum(total_t) / np.sum(total_wav_seconds):.4f}")
614
+ logging.info(
615
+ f"Average RTF w/o vocoder: "
616
+ f"{np.sum(total_t_no_vocoder) / np.sum(total_wav_seconds):.4f}"
617
+ )
618
+ logging.info(
619
+ f"Average RTF vocoder: "
620
+ f"{np.sum(total_t_vocoder) / np.sum(total_wav_seconds):.4f}"
621
+ )
622
+
623
+
624
+ @torch.inference_mode()
625
+ def main():
626
+ parser = get_parser()
627
+ args = parser.parse_args()
628
+
629
+ params = AttributeDict()
630
+ params.update(vars(args))
631
+ fix_random_seed(params.seed)
632
+
633
+ assert (
634
+ params.test_list is not None
635
+ ), "For inference, please provide prompts and text with '--test-list'"
636
+
637
+ if params.model_dir is not None:
638
+ params.model_dir = Path(params.model_dir)
639
+ if not params.model_dir.is_dir():
640
+ raise FileNotFoundError(f"{params.model_dir} does not exist")
641
+ for filename in [params.checkpoint_name, "model.json", "tokens.txt"]:
642
+ if not (params.model_dir / filename).is_file():
643
+ raise FileNotFoundError(f"{params.model_dir / filename} does not exist")
644
+ model_ckpt = params.model_dir / params.checkpoint_name
645
+ model_config = params.model_dir / "model.json"
646
+ token_file = params.model_dir / "tokens.txt"
647
+ logging.info(
648
+ f"Using local model dir {params.model_dir}, "
649
+ f"checkpoint {params.checkpoint_name}"
650
+ )
651
+ else:
652
+ logging.info("Using pretrained model from the huggingface")
653
+ logging.info("Downloading the requires files from HuggingFace")
654
+ model_ckpt = hf_hub_download(
655
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/model.pt"
656
+ )
657
+ model_config = hf_hub_download(
658
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/model.json"
659
+ )
660
+
661
+ token_file = hf_hub_download(
662
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/tokens.txt"
663
+ )
664
+
665
+ logging.info("Loading model...")
666
+
667
+ tokenizer = DialogTokenizer(token_file=token_file)
668
+
669
+ tokenizer_config = {
670
+ "vocab_size": tokenizer.vocab_size,
671
+ "pad_id": tokenizer.pad_id,
672
+ "spk_a_id": tokenizer.spk_a_id,
673
+ "spk_b_id": tokenizer.spk_b_id,
674
+ }
675
+
676
+ with open(model_config, "r") as f:
677
+ model_config = json.load(f)
678
+
679
+ if params.model_name == "zipvoice_dialog":
680
+ model = ZipVoiceDialog(
681
+ **model_config["model"],
682
+ **tokenizer_config,
683
+ )
684
+ else:
685
+ assert params.model_name == "zipvoice_dialog_stereo"
686
+ model = ZipVoiceDialogStereo(
687
+ **model_config["model"],
688
+ **tokenizer_config,
689
+ )
690
+
691
+ if str(model_ckpt).endswith(".safetensors"):
692
+ safetensors.torch.load_model(model, model_ckpt)
693
+ elif str(model_ckpt).endswith(".pt"):
694
+ load_checkpoint(filename=model_ckpt, model=model, strict=True)
695
+ else:
696
+ raise NotImplementedError(f"Unsupported model checkpoint format: {model_ckpt}")
697
+
698
+ if torch.cuda.is_available():
699
+ params.device = torch.device("cuda", 0)
700
+ elif torch.backends.mps.is_available():
701
+ params.device = torch.device("mps")
702
+ else:
703
+ params.device = torch.device("cpu")
704
+ logging.info(f"Device: {params.device}")
705
+
706
+ model = model.to(params.device)
707
+ model.eval()
708
+
709
+ vocoder = get_vocoder(params.vocoder_path)
710
+ vocoder = vocoder.to(params.device)
711
+ vocoder.eval()
712
+
713
+ if model_config["feature"]["type"] == "vocos":
714
+ if params.model_name == "zipvoice_dialog":
715
+ num_channels = 1
716
+ else:
717
+ assert params.model_name == "zipvoice_dialog_stereo"
718
+ num_channels = 2
719
+ feature_extractor = VocosFbank(num_channels=num_channels)
720
+ else:
721
+ raise NotImplementedError(
722
+ f"Unsupported feature type: {model_config['feature']['type']}"
723
+ )
724
+ params.sampling_rate = model_config["feature"]["sampling_rate"]
725
+
726
+ logging.info("Start generating...")
727
+ os.makedirs(params.res_dir, exist_ok=True)
728
+ generate_list(
729
+ model_name=params.model_name,
730
+ res_dir=params.res_dir,
731
+ test_list=params.test_list,
732
+ model=model,
733
+ vocoder=vocoder,
734
+ tokenizer=tokenizer,
735
+ feature_extractor=feature_extractor,
736
+ device=params.device,
737
+ num_step=params.num_step,
738
+ guidance_scale=params.guidance_scale,
739
+ speed=params.speed,
740
+ t_shift=params.t_shift,
741
+ target_rms=params.target_rms,
742
+ feat_scale=params.feat_scale,
743
+ sampling_rate=params.sampling_rate,
744
+ silence_wav=params.silence_wav,
745
+ )
746
+ logging.info("Done")
747
+
748
+
749
+ if __name__ == "__main__":
750
+ torch.set_num_threads(1)
751
+ torch.set_num_interop_threads(1)
752
+
753
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
754
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
755
+
756
+ main()
zipvoice/bin/infer_zipvoice_onnx.py ADDED
@@ -0,0 +1,712 @@
1
+ # Copyright 2025 Xiaomi Corp. (authors: Han Zhu,
2
+ # Zengwei Yao)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ This script generates speech with our pre-trained ZipVoice or ZipVoice-Distill
20
+ ONNX models. If no local model is specified, the
21
+ required files will be automatically downloaded from HuggingFace.
22
+
23
+ Usage:
24
+
25
+ Note: If you have trouble connecting to HuggingFace,
26
+ try switching the endpoint to the mirror site:
27
+ export HF_ENDPOINT=https://hf-mirror.com
28
+
29
+ (1) Inference of a single sentence:
30
+
31
+ python3 -m zipvoice.bin.infer_zipvoice_onnx \
32
+ --onnx-int8 False \
33
+ --model-name zipvoice \
34
+ --prompt-wav prompt.wav \
35
+ --prompt-text "I am a prompt." \
36
+ --text "I am a sentence." \
37
+ --res-wav-path result.wav
38
+
39
+ (2) Inference of a list of sentences:
40
+ python3 -m zipvoice.bin.infer_zipvoice_onnx \
41
+ --onnx-int8 False \
42
+ --model-name zipvoice \
43
+ --test-list test.tsv \
44
+ --res-dir results
45
+
46
+ `--model-name` can be `zipvoice` or `zipvoice_distill`,
47
+ which are the models before and after distillation, respectively.
48
+
49
+ Each line of `test.tsv` is in the format of
50
+ `{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}`.
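+
+ An illustrative line (names and paths are hypothetical):
+ `utt_001\tHello there.\tprompts/alice.wav\tThis is the sentence to synthesize.`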
51
+
52
+ Set `--onnx-int8 True` to use the int8-quantized ONNX model.
53
+ """
54
+
55
+ import argparse
56
+ import datetime as dt
57
+ import json
58
+ import logging
59
+ import os
60
+ from pathlib import Path
61
+ from typing import List
62
+
63
+ import numpy as np
64
+ import onnxruntime as ort
65
+ import torch
66
+ import torchaudio
67
+ from huggingface_hub import hf_hub_download
68
+ from lhotse.utils import fix_random_seed
69
+ from torch import Tensor, nn
70
+
71
+ from zipvoice.bin.infer_zipvoice import get_vocoder
72
+ from zipvoice.models.modules.solver import get_time_steps
73
+ from zipvoice.tokenizer.tokenizer import (
74
+ EmiliaTokenizer,
75
+ EspeakTokenizer,
76
+ LibriTTSTokenizer,
77
+ SimpleTokenizer,
78
+ )
79
+ from zipvoice.utils.common import AttributeDict, str2bool
80
+ from zipvoice.utils.feature import VocosFbank
81
+
82
+ HUGGINGFACE_REPO = "k2-fsa/ZipVoice"
83
+ MODEL_DIR = {
84
+ "zipvoice": "zipvoice",
85
+ "zipvoice_distill": "zipvoice_distill",
86
+ }
87
+
88
+
89
+ def get_parser():
90
+ parser = argparse.ArgumentParser(
91
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
92
+ )
93
+
94
+ parser.add_argument(
95
+ "--onnx-int8",
96
+ type=str2bool,
97
+ default=False,
98
+ help="Whether to use the int8 model",
99
+ )
100
+
101
+ parser.add_argument(
102
+ "--model-name",
103
+ type=str,
104
+ default="zipvoice",
105
+ choices=["zipvoice", "zipvoice_distill"],
106
+ help="The model used for inference",
107
+ )
108
+
109
+ parser.add_argument(
110
+ "--model-dir",
111
+ type=str,
112
+ default=None,
113
+ help="The path to the local onnx model. "
114
+ "Will download pre-trained checkpoint from huggingface if not specified.",
115
+ )
116
+
117
+ parser.add_argument(
118
+ "--vocoder-path",
119
+ type=str,
120
+ default=None,
121
+ help="The vocoder checkpoint. "
122
+ "Will download pre-trained vocoder from huggingface if not specified.",
123
+ )
124
+
125
+ parser.add_argument(
126
+ "--tokenizer",
127
+ type=str,
128
+ default="emilia",
129
+ choices=["emilia", "libritts", "espeak", "simple"],
130
+ help="Tokenizer type.",
131
+ )
132
+
133
+ parser.add_argument(
134
+ "--lang",
135
+ type=str,
136
+ default="en-us",
137
+ help="Language identifier, used when tokenizer type is espeak. see"
138
+ "https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md",
139
+ )
140
+
141
+ parser.add_argument(
142
+ "--test-list",
143
+ type=str,
144
+ default=None,
145
+ help="The list of prompt speech, prompt_transcription, "
146
+ "and text to synthesizein the format of "
147
+ "'{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}'.",
148
+ )
149
+
150
+ parser.add_argument(
151
+ "--prompt-wav",
152
+ type=str,
153
+ default=None,
154
+ help="The prompt wav to mimic",
155
+ )
156
+
157
+ parser.add_argument(
158
+ "--prompt-text",
159
+ type=str,
160
+ default=None,
161
+ help="The transcription of the prompt wav",
162
+ )
163
+
164
+ parser.add_argument(
165
+ "--text",
166
+ type=str,
167
+ default=None,
168
+ help="The text to synthesize",
169
+ )
170
+
171
+ parser.add_argument(
172
+ "--res-dir",
173
+ type=str,
174
+ default="results",
175
+ help="""
176
+ Path name of the generated wavs dir,
177
+ used when test-list is not None
178
+ """,
179
+ )
180
+
181
+ parser.add_argument(
182
+ "--res-wav-path",
183
+ type=str,
184
+ default="result.wav",
185
+ help="""
186
+ Path name of the generated wav path,
187
+ used when test-list is None
188
+ """,
189
+ )
190
+
191
+ parser.add_argument(
192
+ "--guidance-scale",
193
+ type=float,
194
+ default=None,
195
+ help="The scale of classifier-free guidance during inference.",
196
+ )
197
+
198
+ parser.add_argument(
199
+ "--num-step",
200
+ type=int,
201
+ default=None,
202
+ help="The number of sampling steps.",
203
+ )
204
+
205
+ parser.add_argument(
206
+ "--feat-scale",
207
+ type=float,
208
+ default=0.1,
209
+ help="The scale factor of fbank feature",
210
+ )
211
+
212
+ parser.add_argument(
213
+ "--speed",
214
+ type=float,
215
+ default=1.0,
216
+ help="Control speech speed, 1.0 means normal, >1.0 means speed up",
217
+ )
218
+
219
+ parser.add_argument(
220
+ "--t-shift",
221
+ type=float,
222
+ default=0.5,
223
+ help="Shift t to smaller ones if t_shift < 1.0",
224
+ )
225
+
226
+ parser.add_argument(
227
+ "--target-rms",
228
+ type=float,
229
+ default=0.1,
230
+ help="Target speech normalization rms value, set to 0 to disable normalization",
231
+ )
232
+
233
+ parser.add_argument(
234
+ "--seed",
235
+ type=int,
236
+ default=666,
237
+ help="Random seed",
238
+ )
239
+
240
+ return parser
241
+
242
+
243
+ class OnnxModel:
244
+ def __init__(
245
+ self,
246
+ text_encoder_path: str,
247
+ fm_decoder_path: str,
248
+ ):
249
+ session_opts = ort.SessionOptions()
250
+ session_opts.inter_op_num_threads = 1
251
+ session_opts.intra_op_num_threads = 1
252
+
253
+ self.session_opts = session_opts
254
+
255
+ self.init_text_encoder(text_encoder_path)
256
+ self.init_fm_decoder(fm_decoder_path)
257
+
258
+ def init_text_encoder(self, model_path: str):
259
+ self.text_encoder = ort.InferenceSession(
260
+ model_path,
261
+ sess_options=self.session_opts,
262
+ providers=["CPUExecutionProvider"],
263
+ )
264
+
265
+ def init_fm_decoder(self, model_path: str):
266
+ self.fm_decoder = ort.InferenceSession(
267
+ model_path,
268
+ sess_options=self.session_opts,
269
+ providers=["CPUExecutionProvider"],
270
+ )
271
+ meta = self.fm_decoder.get_modelmeta().custom_metadata_map
272
+ self.feat_dim = int(meta["feat_dim"])
273
+
274
+ def run_text_encoder(
275
+ self,
276
+ tokens: Tensor,
277
+ prompt_tokens: Tensor,
278
+ prompt_features_len: Tensor,
279
+ speed: Tensor,
280
+ ) -> Tensor:
281
+ out = self.text_encoder.run(
282
+ [
283
+ self.text_encoder.get_outputs()[0].name,
284
+ ],
285
+ {
286
+ self.text_encoder.get_inputs()[0].name: tokens.numpy(),
287
+ self.text_encoder.get_inputs()[1].name: prompt_tokens.numpy(),
288
+ self.text_encoder.get_inputs()[2].name: prompt_features_len.numpy(),
289
+ self.text_encoder.get_inputs()[3].name: speed.numpy(),
290
+ },
291
+ )
292
+ return torch.from_numpy(out[0])
293
+
294
+ def run_fm_decoder(
295
+ self,
296
+ t: Tensor,
297
+ x: Tensor,
298
+ text_condition: Tensor,
299
+ speech_condition: torch.Tensor,
300
+ guidance_scale: Tensor,
301
+ ) -> Tensor:
302
+ out = self.fm_decoder.run(
303
+ [
304
+ self.fm_decoder.get_outputs()[0].name,
305
+ ],
306
+ {
307
+ self.fm_decoder.get_inputs()[0].name: t.numpy(),
308
+ self.fm_decoder.get_inputs()[1].name: x.numpy(),
309
+ self.fm_decoder.get_inputs()[2].name: text_condition.numpy(),
310
+ self.fm_decoder.get_inputs()[3].name: speech_condition.numpy(),
311
+ self.fm_decoder.get_inputs()[4].name: guidance_scale.numpy(),
312
+ },
313
+ )
314
+ return torch.from_numpy(out[0])
315
+
316
+
317
+ def sample(
318
+ model: OnnxModel,
319
+ tokens: List[List[int]],
320
+ prompt_tokens: List[List[int]],
321
+ prompt_features: Tensor,
322
+ speed: float = 1.0,
323
+ t_shift: float = 0.5,
324
+ guidance_scale: float = 1.0,
325
+ num_step: int = 16,
326
+ ) -> torch.Tensor:
327
+ """
328
+ Generate acoustic features, given text tokens, prompts feature and prompt
329
+ transcription's text tokens.
330
+
331
+ Args:
332
+ tokens: a list of list of text tokens.
333
+ prompt_tokens: a list of list of prompt tokens.
334
+ prompt_features: the prompt feature with the shape
335
+ (batch_size, seq_len, feat_dim).
336
+ speed : speed control.
337
+ t_shift: time shift.
338
+ guidance_scale: the guidance scale for classifier-free guidance.
339
+ num_step: the number of steps to use in the ODE solver.
340
+ """
341
+ # Run text encoder
342
+ assert len(tokens) == len(prompt_tokens) == 1
343
+ tokens = torch.tensor(tokens, dtype=torch.int64)
344
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.int64)
345
+ prompt_features_len = torch.tensor(prompt_features.size(1), dtype=torch.int64)
346
+ speed = torch.tensor(speed, dtype=torch.float32)
347
+
348
+ text_condition = model.run_text_encoder(
349
+ tokens, prompt_tokens, prompt_features_len, speed
350
+ )
351
+
352
+ batch_size, num_frames, _ = text_condition.shape
353
+ assert batch_size == 1
354
+ feat_dim = model.feat_dim
355
+
356
+ # Run flow matching model
357
+ timesteps = get_time_steps(
358
+ t_start=0.0,
359
+ t_end=1.0,
360
+ num_step=num_step,
361
+ t_shift=t_shift,
362
+ )
363
+ x = torch.randn(batch_size, num_frames, feat_dim)
364
+ speech_condition = torch.nn.functional.pad(
365
+ prompt_features, (0, 0, 0, num_frames - prompt_features.shape[1])
366
+ ) # (B, T, F)
367
+ guidance_scale = torch.tensor(guidance_scale, dtype=torch.float32)
368
+
369
+ for step in range(num_step):
370
+ v = model.run_fm_decoder(
371
+ t=timesteps[step],
372
+ x=x,
373
+ text_condition=text_condition,
374
+ speech_condition=speech_condition,
375
+ guidance_scale=guidance_scale,
376
+ )
377
+ x = x + v * (timesteps[step + 1] - timesteps[step])
378
+
379
+ x = x[:, prompt_features_len.item() :, :]
380
+ return x
381
+
382
+
383
+ # Copied from zipvoice/bin/infer_zipvoice.py, but call an external sample function
384
+ def generate_sentence(
385
+ save_path: str,
386
+ prompt_text: str,
387
+ prompt_wav: str,
388
+ text: str,
389
+ model: OnnxModel,
390
+ vocoder: nn.Module,
391
+ tokenizer: EmiliaTokenizer,
392
+ feature_extractor: VocosFbank,
393
+ num_step: int = 16,
394
+ guidance_scale: float = 1.0,
395
+ speed: float = 1.0,
396
+ t_shift: float = 0.5,
397
+ target_rms: float = 0.1,
398
+ feat_scale: float = 0.1,
399
+ sampling_rate: int = 24000,
400
+ ):
401
+ """
402
+ Generate waveform of a text based on a given prompt
403
+ waveform and its transcription.
404
+
405
+ Args:
406
+ save_path (str): Path to save the generated wav.
407
+ prompt_text (str): Transcription of the prompt wav.
408
+ prompt_wav (str): Path to the prompt wav file.
409
+ text (str): Text to be synthesized into a waveform.
410
+ model (torch.nn.Module): The model used for generation.
411
+ vocoder (torch.nn.Module): The vocoder used to convert features to waveforms.
412
+ tokenizer (EmiliaTokenizer): The tokenizer used to convert text to tokens.
413
+ feature_extractor (VocosFbank): The feature extractor used to
414
+ extract acoustic features.
415
+ num_step (int, optional): Number of steps for decoding. Defaults to 16.
416
+ guidance_scale (float, optional): Scale for classifier-free guidance.
417
+ Defaults to 1.0.
418
+ speed (float, optional): Speed control. Defaults to 1.0.
419
+ t_shift (float, optional): Time shift. Defaults to 0.5.
420
+ target_rms (float, optional): Target RMS for waveform normalization.
421
+ Defaults to 0.1.
422
+ feat_scale (float, optional): Scale for features.
423
+ Defaults to 0.1.
424
+ sampling_rate (int, optional): Sampling rate for the waveform.
425
+ Defaults to 24000.
426
+ Returns:
427
+ metrics (dict): Dictionary containing time and real-time
428
+ factor metrics for processing.
429
+ """
430
+ # Convert text to tokens
431
+ tokens = tokenizer.texts_to_token_ids([text])
432
+ prompt_tokens = tokenizer.texts_to_token_ids([prompt_text])
433
+
434
+ # Load and preprocess prompt wav
435
+ prompt_wav, prompt_sampling_rate = torchaudio.load(prompt_wav)
436
+
437
+ if prompt_sampling_rate != sampling_rate:
438
+ resampler = torchaudio.transforms.Resample(
439
+ orig_freq=prompt_sampling_rate, new_freq=sampling_rate
440
+ )
441
+ prompt_wav = resampler(prompt_wav)
442
+
443
+ prompt_rms = torch.sqrt(torch.mean(torch.square(prompt_wav)))
444
+ if prompt_rms < target_rms:
445
+ prompt_wav = prompt_wav * target_rms / prompt_rms
446
+
447
+ # Extract features from prompt wav
448
+ prompt_features = feature_extractor.extract(prompt_wav, sampling_rate=sampling_rate)
449
+
450
+ prompt_features = prompt_features.unsqueeze(0) * feat_scale
451
+
452
+ # Start timing
453
+ start_t = dt.datetime.now()
454
+
455
+ # Generate features
456
+ pred_features = sample(
457
+ model=model,
458
+ tokens=tokens,
459
+ prompt_tokens=prompt_tokens,
460
+ prompt_features=prompt_features,
461
+ speed=speed,
462
+ t_shift=t_shift,
463
+ guidance_scale=guidance_scale,
464
+ num_step=num_step,
465
+ )
466
+
467
+ # Postprocess predicted features
468
+ pred_features = pred_features.permute(0, 2, 1) / feat_scale # (B, C, T)
469
+
470
+ # Start vocoder processing
471
+ start_vocoder_t = dt.datetime.now()
472
+ wav = vocoder.decode(pred_features).squeeze(1).clamp(-1, 1)
473
+
474
+ # Calculate processing times and real-time factors
475
+ t = (dt.datetime.now() - start_t).total_seconds()
476
+ t_no_vocoder = (start_vocoder_t - start_t).total_seconds()
477
+ t_vocoder = (dt.datetime.now() - start_vocoder_t).total_seconds()
478
+ wav_seconds = wav.shape[-1] / sampling_rate
479
+ rtf = t / wav_seconds
480
+ rtf_no_vocoder = t_no_vocoder / wav_seconds
481
+ rtf_vocoder = t_vocoder / wav_seconds
482
+ metrics = {
483
+ "t": t,
484
+ "t_no_vocoder": t_no_vocoder,
485
+ "t_vocoder": t_vocoder,
486
+ "wav_seconds": wav_seconds,
487
+ "rtf": rtf,
488
+ "rtf_no_vocoder": rtf_no_vocoder,
489
+ "rtf_vocoder": rtf_vocoder,
490
+ }
491
+
492
+ # Adjust wav volume if necessary
493
+ if prompt_rms < target_rms:
494
+ wav = wav * prompt_rms / target_rms
495
+ torchaudio.save(save_path, wav.cpu(), sample_rate=sampling_rate)
496
+
497
+ return metrics
498
+
499
+
500
+ def generate_list(
501
+ res_dir: str,
502
+ test_list: str,
503
+ model: OnnxModel,
504
+ vocoder: nn.Module,
505
+ tokenizer: EmiliaTokenizer,
506
+ feature_extractor: VocosFbank,
507
+ num_step: int = 16,
508
+ guidance_scale: float = 1.0,
509
+ speed: float = 1.0,
510
+ t_shift: float = 0.5,
511
+ target_rms: float = 0.1,
512
+ feat_scale: float = 0.1,
513
+ sampling_rate: int = 24000,
514
+ ):
515
+ total_t = []
516
+ total_t_no_vocoder = []
517
+ total_t_vocoder = []
518
+ total_wav_seconds = []
519
+
520
+ with open(test_list, "r") as fr:
521
+ lines = fr.readlines()
522
+
523
+ for i, line in enumerate(lines):
524
+ wav_name, prompt_text, prompt_wav, text = line.strip().split("\t")
525
+ save_path = f"{res_dir}/{wav_name}.wav"
526
+ metrics = generate_sentence(
527
+ save_path=save_path,
528
+ prompt_text=prompt_text,
529
+ prompt_wav=prompt_wav,
530
+ text=text,
531
+ model=model,
532
+ vocoder=vocoder,
533
+ tokenizer=tokenizer,
534
+ feature_extractor=feature_extractor,
535
+ num_step=num_step,
536
+ guidance_scale=guidance_scale,
537
+ speed=speed,
538
+ t_shift=t_shift,
539
+ target_rms=target_rms,
540
+ feat_scale=feat_scale,
541
+ sampling_rate=sampling_rate,
542
+ )
543
+ logging.info(f"[Sentence: {i}] RTF: {metrics['rtf']:.4f}")
544
+ total_t.append(metrics["t"])
545
+ total_t_no_vocoder.append(metrics["t_no_vocoder"])
546
+ total_t_vocoder.append(metrics["t_vocoder"])
547
+ total_wav_seconds.append(metrics["wav_seconds"])
548
+
549
+ logging.info(f"Average RTF: {np.sum(total_t) / np.sum(total_wav_seconds):.4f}")
550
+ logging.info(
551
+ f"Average RTF w/o vocoder: "
552
+ f"{np.sum(total_t_no_vocoder) / np.sum(total_wav_seconds):.4f}"
553
+ )
554
+ logging.info(
555
+ f"Average RTF vocoder: "
556
+ f"{np.sum(total_t_vocoder) / np.sum(total_wav_seconds):.4f}"
557
+ )
558
+
559
+
560
+ @torch.inference_mode()
561
+ def main():
562
+ parser = get_parser()
563
+ args = parser.parse_args()
564
+
565
+ params = AttributeDict()
566
+ params.update(vars(args))
567
+ fix_random_seed(params.seed)
568
+
569
+ model_defaults = {
570
+ "zipvoice": {
571
+ "num_step": 16,
572
+ "guidance_scale": 1.0,
573
+ },
574
+ "zipvoice_distill": {
575
+ "num_step": 8,
576
+ "guidance_scale": 3.0,
577
+ },
578
+ }
579
+
580
+ model_specific_defaults = model_defaults.get(params.model_name, {})
581
+
582
+ for param, value in model_specific_defaults.items():
583
+ if getattr(params, param) is None:
584
+ setattr(params, param, value)
585
+ logging.info(f"Setting {param} to default value: {value}")
586
+
587
+ assert (params.test_list is not None) ^ all(
589
+ [params.prompt_wav, params.prompt_text, params.text]
589
+ ), (
590
+ "For inference, please provide prompts and text with either '--test-list'"
591
+ " or '--prompt-wav, --prompt-text and --text'."
592
+ )
593
+
594
+ if params.onnx_int8:
595
+ text_encoder_name = "text_encoder_int8.onnx"
596
+ fm_decoder_name = "fm_decoder_int8.onnx"
597
+ else:
598
+ text_encoder_name = "text_encoder.onnx"
599
+ fm_decoder_name = "fm_decoder.onnx"
600
+
601
+ if params.model_dir is not None:
602
+ params.model_dir = Path(params.model_dir)
603
+ if not params.model_dir.is_dir():
604
+ raise FileNotFoundError(f"{params.model_dir} does not exist")
605
+
606
+ for filename in [
607
+ text_encoder_name,
608
+ fm_decoder_name,
609
+ "model.json",
610
+ "tokens.txt",
611
+ ]:
612
+ if not (params.model_dir / filename).is_file():
613
+ raise FileNotFoundError(f"{params.model_dir / filename} does not exist")
614
+ text_encoder_path = params.model_dir / text_encoder_name
615
+ fm_decoder_path = params.model_dir / fm_decoder_name
616
+ model_config = params.model_dir / "model.json"
617
+ token_file = params.model_dir / "tokens.txt"
618
+ logging.info(f"Using local model dir {params.model_dir}.")
619
+ else:
620
+ logging.info("Using pretrained model from the huggingface")
621
+ logging.info("Downloading the requires files from HuggingFace")
622
+ text_encoder_path = hf_hub_download(
623
+ HUGGINGFACE_REPO,
624
+ filename=f"{MODEL_DIR[params.model_name]}/{text_encoder_name}",
625
+ )
626
+ fm_decoder_path = hf_hub_download(
627
+ HUGGINGFACE_REPO,
628
+ filename=f"{MODEL_DIR[params.model_name]}/{fm_decoder_name}",
629
+ )
630
+ model_config = hf_hub_download(
631
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/model.json"
632
+ )
633
+
634
+ token_file = hf_hub_download(
635
+ HUGGINGFACE_REPO, filename=f"{MODEL_DIR[params.model_name]}/tokens.txt"
636
+ )
637
+
638
+ logging.info("Loading model...")
639
+
640
+ if params.tokenizer == "emilia":
641
+ tokenizer = EmiliaTokenizer(token_file=token_file)
642
+ elif params.tokenizer == "libritts":
643
+ tokenizer = LibriTTSTokenizer(token_file=token_file)
644
+ elif params.tokenizer == "espeak":
645
+ tokenizer = EspeakTokenizer(token_file=token_file, lang=params.lang)
646
+ else:
647
+ assert params.tokenizer == "simple"
648
+ tokenizer = SimpleTokenizer(token_file=token_file)
649
+
650
+ with open(model_config, "r") as f:
651
+ model_config = json.load(f)
652
+
653
+ model = OnnxModel(text_encoder_path, fm_decoder_path)
654
+
655
+ vocoder = get_vocoder(params.vocoder_path)
656
+ vocoder.eval()
657
+
658
+ if model_config["feature"]["type"] == "vocos":
659
+ feature_extractor = VocosFbank()
660
+ else:
661
+ raise NotImplementedError(
662
+ f"Unsupported feature type: {model_config['feature']['type']}"
663
+ )
664
+ params.sampling_rate = model_config["feature"]["sampling_rate"]
665
+
666
+ logging.info("Start generating...")
667
+ if params.test_list:
668
+ os.makedirs(params.res_dir, exist_ok=True)
669
+ generate_list(
670
+ res_dir=params.res_dir,
671
+ test_list=params.test_list,
672
+ model=model,
673
+ vocoder=vocoder,
674
+ tokenizer=tokenizer,
675
+ feature_extractor=feature_extractor,
676
+ num_step=params.num_step,
677
+ guidance_scale=params.guidance_scale,
678
+ speed=params.speed,
679
+ t_shift=params.t_shift,
680
+ target_rms=params.target_rms,
681
+ feat_scale=params.feat_scale,
682
+ sampling_rate=params.sampling_rate,
683
+ )
684
+ else:
685
+ generate_sentence(
686
+ save_path=params.res_wav_path,
687
+ prompt_text=params.prompt_text,
688
+ prompt_wav=params.prompt_wav,
689
+ text=params.text,
690
+ model=model,
691
+ vocoder=vocoder,
692
+ tokenizer=tokenizer,
693
+ feature_extractor=feature_extractor,
694
+ num_step=params.num_step,
695
+ guidance_scale=params.guidance_scale,
696
+ speed=params.speed,
697
+ t_shift=params.t_shift,
698
+ target_rms=params.target_rms,
699
+ feat_scale=params.feat_scale,
700
+ sampling_rate=params.sampling_rate,
701
+ )
702
+ logging.info("Done")
703
+
704
+
705
+ if __name__ == "__main__":
706
+ torch.set_num_threads(1)
707
+ torch.set_num_interop_threads(1)
708
+
709
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
710
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
711
+
712
+ main()
zipvoice/bin/onnx_export.py ADDED
@@ -0,0 +1,410 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2025 Xiaomi Corp. (authors: Zengwei Yao)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ This script exports a pre-trained ZipVoice or ZipVoice-Distill model from PyTorch to
20
+ ONNX.
21
+
22
+ Usage:
23
+
24
+ python3 -m zipvoice.bin.onnx_export \
25
+ --model-name zipvoice \
26
+ --model-dir exp/zipvoice \
27
+ --checkpoint-name epoch-11-avg-4.pt \
28
+ --onnx-model-dir exp/zipvoice
29
+
30
+ `--model-name` can be `zipvoice` or `zipvoice_distill`,
31
+ which are the models before and after distillation, respectively.
32
+ """
33
+
34
+
35
+ import argparse
36
+ import json
37
+ import logging
38
+ from pathlib import Path
39
+ from typing import Dict
40
+
41
+ import onnx
42
+ import safetensors.torch
43
+ import torch
44
+ from onnxruntime.quantization import QuantType, quantize_dynamic
45
+ from torch import Tensor, nn
46
+
47
+ from zipvoice.models.zipvoice import ZipVoice
48
+ from zipvoice.models.zipvoice_distill import ZipVoiceDistill
49
+ from zipvoice.tokenizer.tokenizer import SimpleTokenizer
50
+ from zipvoice.utils.checkpoint import load_checkpoint
51
+ from zipvoice.utils.common import AttributeDict
52
+ from zipvoice.utils.scaling_converter import convert_scaled_to_non_scaled
53
+
54
+
55
+ def get_parser():
56
+ parser = argparse.ArgumentParser(
57
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
58
+ )
59
+
60
+ parser.add_argument(
61
+ "--onnx-model-dir",
62
+ type=str,
63
+ default="exp",
64
+ help="Dir to the exported models",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "--model-name",
69
+ type=str,
70
+ default="zipvoice",
71
+ choices=["zipvoice", "zipvoice_distill"],
72
+ help="The model used for inference",
73
+ )
74
+
75
+ parser.add_argument(
76
+ "--model-dir",
77
+ type=str,
78
+ default=None,
79
+ help="The model directory that contains model checkpoint, configuration "
80
+ "file model.json, and tokens file tokens.txt. Will download pre-trained "
81
+ "checkpoint from huggingface if not specified.",
82
+ )
83
+
84
+ parser.add_argument(
85
+ "--checkpoint-name",
86
+ type=str,
87
+ default="model.pt",
88
+ help="The name of model checkpoint.",
89
+ )
90
+
91
+ return parser
92
+
93
+
94
+ def add_meta_data(filename: str, meta_data: Dict[str, str]):
95
+ """Add meta data to an ONNX model. It is changed in-place.
96
+
97
+ Args:
98
+ filename:
99
+ Filename of the ONNX model to be changed.
100
+ meta_data:
101
+ Key-value pairs.
102
+ """
103
+ model = onnx.load(filename)
104
+ for key, value in meta_data.items():
105
+ meta = model.metadata_props.add()
106
+ meta.key = key
107
+ meta.value = value
108
+
109
+ onnx.save(model, filename)
110
+
111
+
112
+ class OnnxTextModel(nn.Module):
113
+ def __init__(self, model: nn.Module):
114
+ """A wrapper for ZipVoice text encoder."""
115
+ super().__init__()
116
+ self.embed = model.embed
117
+ self.text_encoder = model.text_encoder
118
+ self.pad_id = model.pad_id
119
+
120
+ def forward(
121
+ self,
122
+ tokens: Tensor,
123
+ prompt_tokens: Tensor,
124
+ prompt_features_len: Tensor,
125
+ speed: Tensor,
126
+ ) -> Tensor:
127
+ cat_tokens = torch.cat([prompt_tokens, tokens], dim=1)
128
+ cat_tokens = nn.functional.pad(cat_tokens, (0, 1), value=self.pad_id)
129
+ tokens_len = cat_tokens.shape[1] - 1
130
+ padding_mask = (torch.arange(tokens_len + 1) == tokens_len).unsqueeze(0)
131
+
132
+ embed = self.embed(cat_tokens)
133
+ embed = self.text_encoder(x=embed, t=None, padding_mask=padding_mask)
134
+
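+ # Estimate the total number of output frames from the prompt's
+ # frames-per-token ratio, scaled by the requested speed.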
135
+ features_len = torch.ceil(
136
+ (prompt_features_len / prompt_tokens.shape[1] * tokens_len / speed)
137
+ ).to(dtype=torch.int64)
138
+
139
+ token_dur = torch.div(features_len, tokens_len, rounding_mode="floor").to(
140
+ dtype=torch.int64
141
+ )
142
+
143
+ text_condition = embed[:, :-1, :].unsqueeze(2).expand(-1, -1, token_dur, -1)
144
+ text_condition = text_condition.reshape(embed.shape[0], -1, embed.shape[2])
145
+
146
+ text_condition = torch.cat(
147
+ [
148
+ text_condition,
149
+ embed[:, -1:, :].expand(-1, features_len - text_condition.shape[1], -1),
150
+ ],
151
+ dim=1,
152
+ )
153
+
154
+ return text_condition
155
+
156
+
157
+ class OnnxFlowMatchingModel(nn.Module):
158
+ def __init__(self, model: nn.Module, distill: bool = False):
159
+ """A wrapper for ZipVoice flow-matching decoder."""
160
+ super().__init__()
161
+ self.distill = distill
162
+ self.fm_decoder = model.fm_decoder
163
+ self.model_func = getattr(model, "forward_fm_decoder")
164
+ self.feat_dim = model.feat_dim
165
+
166
+ def forward(
167
+ self,
168
+ t: Tensor,
169
+ x: Tensor,
170
+ text_condition: Tensor,
171
+ speech_condition: torch.Tensor,
172
+ guidance_scale: Tensor,
173
+ ) -> Tensor:
174
+ if self.distill:
175
+ return self.model_func(
176
+ t=t,
177
+ xt=x,
178
+ text_condition=text_condition,
179
+ speech_condition=speech_condition,
180
+ guidance_scale=guidance_scale,
181
+ )
182
+ else:
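+ # Classifier-free guidance: run an unconditional and a conditional copy
+ # through the decoder in one batch, then extrapolate:
+ # v = (1 + g) * cond - g * uncond.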
183
+ x = x.repeat(2, 1, 1)
184
+ text_condition = torch.cat(
185
+ [torch.zeros_like(text_condition), text_condition], dim=0
186
+ )
187
+ speech_condition = torch.cat(
188
+ [
189
+ torch.where(
190
+ t > 0.5, torch.zeros_like(speech_condition), speech_condition
191
+ ),
192
+ speech_condition,
193
+ ],
194
+ dim=0,
195
+ )
196
+ guidance_scale = torch.where(t > 0.5, guidance_scale, guidance_scale * 2.0)
197
+ data_uncond, data_cond = self.model_func(
198
+ t=t,
199
+ xt=x,
200
+ text_condition=text_condition,
201
+ speech_condition=speech_condition,
202
+ ).chunk(2, dim=0)
203
+ v = (1 + guidance_scale) * data_cond - guidance_scale * data_uncond
204
+ return v
205
+
206
+
207
+ def export_text_encoder(
208
+ model: OnnxTextModel,
209
+ filename: str,
210
+ opset_version: int = 11,
211
+ ) -> None:
212
+ """Export the text encoder model to ONNX format.
213
+
214
+ Args:
215
+ model:
216
+ The input model
217
+ filename:
218
+ The filename to save the exported ONNX model.
219
+ opset_version:
220
+ The opset version to use.
221
+ """
222
+ tokens = torch.tensor([[2, 3, 4, 5]], dtype=torch.int64)
223
+ prompt_tokens = torch.tensor([[0, 1]], dtype=torch.int64)
224
+ prompt_features_len = torch.tensor(10, dtype=torch.int64)
225
+ speed = torch.tensor(1.0, dtype=torch.float32)
226
+
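+ # Trace with the dummy inputs above so data-dependent Python control flow
+ # is fixed in the graph before the ONNX export below.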
227
+ model = torch.jit.trace(model, (tokens, prompt_tokens, prompt_features_len, speed))
228
+
229
+ torch.onnx.export(
230
+ model,
231
+ (tokens, prompt_tokens, prompt_features_len, speed),
232
+ filename,
233
+ verbose=False,
234
+ opset_version=opset_version,
235
+ input_names=["tokens", "prompt_tokens", "prompt_features_len", "speed"],
236
+ output_names=["text_condition"],
237
+ dynamic_axes={
238
+ "tokens": {0: "N", 1: "T"},
239
+ "prompt_tokens": {0: "N", 1: "T"},
240
+ "text_condition": {0: "N", 1: "T"},
241
+ },
242
+ )
243
+
244
+ meta_data = {
245
+ "version": "1",
246
+ "model_author": "k2-fsa",
247
+ "comment": "ZipVoice text encoder",
248
+ }
249
+ logging.info(f"meta_data: {meta_data}")
250
+ add_meta_data(filename=filename, meta_data=meta_data)
251
+
252
+ logging.info(f"Exported to {filename}")
253
+
254
+
255
+ def export_fm_decoder(
256
+ model: OnnxFlowMatchingModel,
257
+ filename: str,
258
+ opset_version: int = 11,
259
+ ) -> None:
260
+ """Export the flow matching decoder model to ONNX format.
261
+
262
+ Args:
263
+ model:
264
+ The input model
265
+ filename:
266
+ The filename to save the exported ONNX model.
267
+ opset_version:
268
+ The opset version to use.
269
+ """
270
+ feat_dim = model.feat_dim
271
+ seq_len = 200
272
+ t = torch.tensor(0.5, dtype=torch.float32)
273
+ x = torch.randn(1, seq_len, feat_dim, dtype=torch.float32)
274
+ text_condition = torch.randn(1, seq_len, feat_dim, dtype=torch.float32)
275
+ speech_condition = torch.randn(1, seq_len, feat_dim, dtype=torch.float32)
276
+ guidance_scale = torch.tensor(1.0, dtype=torch.float32)
277
+
278
+ model = torch.jit.trace(
279
+ model, (t, x, text_condition, speech_condition, guidance_scale)
280
+ )
281
+
282
+ torch.onnx.export(
283
+ model,
284
+ (t, x, text_condition, speech_condition, guidance_scale),
285
+ filename,
286
+ verbose=False,
287
+ opset_version=opset_version,
288
+ input_names=["t", "x", "text_condition", "speech_condition", "guidance_scale"],
289
+ output_names=["v"],
290
+ dynamic_axes={
291
+ "x": {0: "N", 1: "T"},
292
+ "text_condition": {0: "N", 1: "T"},
293
+ "speech_condition": {0: "N", 1: "T"},
294
+ "v": {0: "N", 1: "T"},
295
+ },
296
+ )
297
+
298
+ meta_data = {
299
+ "version": "1",
300
+ "model_author": "k2-fsa",
301
+ "comment": "ZipVoice flow-matching decoder",
302
+ "feat_dim": str(feat_dim),
303
+ }
304
+ logging.info(f"meta_data: {meta_data}")
305
+ add_meta_data(filename=filename, meta_data=meta_data)
306
+
307
+ logging.info(f"Exported to {filename}")
308
+
309
+
310
+ @torch.no_grad()
311
+ def main():
312
+ parser = get_parser()
313
+ args = parser.parse_args()
314
+
315
+ params = AttributeDict()
316
+ params.update(vars(args))
317
+
318
+ params.model_dir = Path(params.model_dir)
319
+ if not params.model_dir.is_dir():
320
+ raise FileNotFoundError(f"{params.model_dir} does not exist")
321
+ for filename in [params.checkpoint_name, "model.json", "tokens.txt"]:
322
+ if not (params.model_dir / filename).is_file():
323
+ raise FileNotFoundError(f"{params.model_dir / filename} does not exist")
324
+ model_ckpt = params.model_dir / params.checkpoint_name
325
+ model_config = params.model_dir / "model.json"
326
+ token_file = params.model_dir / "tokens.txt"
327
+
328
+ logging.info(f"Loading model from {params.model_dir}")
329
+
330
+ tokenizer = SimpleTokenizer(token_file)
331
+ tokenizer_config = {"vocab_size": tokenizer.vocab_size, "pad_id": tokenizer.pad_id}
332
+
333
+ with open(model_config, "r") as f:
334
+ model_config = json.load(f)
335
+
336
+ if params.model_name == "zipvoice":
337
+ model = ZipVoice(
338
+ **model_config["model"],
339
+ **tokenizer_config,
340
+ )
341
+ distill = False
342
+ else:
343
+ assert params.model_name == "zipvoice_distill"
344
+ model = ZipVoiceDistill(
345
+ **model_config["model"],
346
+ **tokenizer_config,
347
+ )
348
+ distill = True
349
+
350
+ if str(model_ckpt).endswith(".safetensors"):
351
+ safetensors.torch.load_model(model, model_ckpt)
352
+ elif str(model_ckpt).endswith(".pt"):
353
+ load_checkpoint(filename=model_ckpt, model=model, strict=True)
354
+ else:
355
+ raise NotImplementedError(f"Unsupported model checkpoint format: {model_ckpt}")
356
+
357
+ device = torch.device("cpu")
358
+ model = model.to(device)
359
+ model.eval()
360
+
361
+ convert_scaled_to_non_scaled(model, inplace=True, is_onnx=True)
362
+
363
+ logging.info("Exporting model")
364
+ onnx_model_dir = Path(params.onnx_model_dir)
365
+ onnx_model_dir.mkdir(parents=True, exist_ok=True)
366
+ opset_version = 11
367
+
368
+ text_encoder = OnnxTextModel(model=model)
369
+ text_encoder_file = onnx_model_dir / "text_encoder.onnx"
370
+ export_text_encoder(
371
+ model=text_encoder,
372
+ filename=text_encoder_file,
373
+ opset_version=opset_version,
374
+ )
375
+
376
+ fm_decoder = OnnxFlowMatchingModel(model=model, distill=distill)
377
+ fm_decoder_file = onnx_model_dir / "fm_decoder.onnx"
378
+ export_fm_decoder(
379
+ model=fm_decoder,
380
+ filename=fm_decoder_file,
381
+ opset_version=opset_version,
382
+ )
383
+
384
+ logging.info("Generate int8 quantization models")
385
+
386
+ text_encoder_int8_file = onnx_model_dir / "text_encoder_int8.onnx"
387
+ quantize_dynamic(
388
+ model_input=text_encoder_file,
389
+ model_output=text_encoder_int8_file,
390
+ op_types_to_quantize=["MatMul"],
391
+ weight_type=QuantType.QInt8,
392
+ )
393
+
394
+ fm_decoder_int8_file = onnx_model_dir / "fm_decoder_int8.onnx"
395
+ quantize_dynamic(
396
+ model_input=fm_decoder_file,
397
+ model_output=fm_decoder_int8_file,
398
+ op_types_to_quantize=["MatMul"],
399
+ weight_type=QuantType.QInt8,
400
+ )
401
+
402
+ logging.info("Done!")
403
+
404
+
405
+ if __name__ == "__main__":
406
+
407
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
408
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
409
+
410
+ main()
zipvoice/bin/prepare_dataset.py ADDED
@@ -0,0 +1,274 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2025 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ This script generates lhotse manifest files from TSV files for custom datasets.
20
+
21
+ Each line of the TSV files should be in one of the following formats:
22
+ 1. "{uniq_id}\t{text}\t{wav_path}" if the text corresponds to the full wav",
23
+ 2. "{uniq_id}\t{text}\t{wav_path}\t{start_time}\t{end_time} if text corresponds
24
+ to part of the wav. The start_time and end_time specify the start and end
25
+ times of the text within the wav, which should be in seconds.
26
+
27
+ Note: {uniq_id} must be unique for each line.
28
+
29
+ Usage:
30
+
31
+ Suppose you have two TSV files: "custom_train.tsv" and "custom_dev.tsv",
32
+ where "custom" is your dataset name, "train"/"dev" are used for training and
33
+ validation respectively.
34
+
35
+ (1) Prepare the training data
36
+
37
+ python3 -m zipvoice.bin.prepare_dataset \
38
+ --tsv-path data/raw/custom_train.tsv \
39
+ --prefix "custom" \
40
+ --subset "train" \
41
+ --num-jobs 20 \
42
+ --output-dir "data/manifests"
43
+
44
+ The output file would be "data/manifests/custom_cuts_train.jsonl.gz".
45
+
46
+ (2) Prepare the validation data
47
+
48
+ python3 -m zipvoice.bin.prepare_dataset \
49
+ --tsv-path data/raw/custom_dev.tsv \
50
+ --prefix "custom" \
51
+ --subset "dev" \
52
+ --num-jobs 1 \
53
+ --output-dir "data/manifests"
54
+
55
+ The output file would be "data/manifests/custom_cuts_dev.jsonl.gz".
56
+
57
+ """
58
+
59
+ import argparse
60
+ import logging
61
+ import re
62
+ from concurrent.futures import ThreadPoolExecutor
63
+ from pathlib import Path
64
+ from typing import List, Optional, Tuple
65
+
66
+ from lhotse import CutSet, validate_recordings_and_supervisions
67
+ from lhotse.audio import Recording, RecordingSet
68
+ from lhotse.qa import fix_manifests
69
+ from lhotse.supervision import SupervisionSegment, SupervisionSet
70
+ from lhotse.utils import Pathlike
71
+ from tqdm.auto import tqdm
72
+
73
+
74
+ def get_args():
75
+ parser = argparse.ArgumentParser()
76
+
77
+ parser.add_argument(
78
+ "--tsv-path",
79
+ type=str,
80
+ help="The path of the tsv file. Each line should be in the format: "
81
+ "{uniq_id}\t{text}\t{wav_path}\t{start_time}\t{end_time} "
82
+ "if text corresponds to part of the wav or {uniq_id}\t{text}\t{wav_path} "
83
+ "if the text corresponds to the full wav",
84
+ )
85
+ parser.add_argument(
86
+ "--prefix",
87
+ type=str,
88
+ default="custom",
89
+ help="Prefix of the output manifest file.",
90
+ )
91
+
92
+ parser.add_argument(
93
+ "--subset",
94
+ type=str,
95
+ default="train",
96
+ help="Subset name manifest file, typically train or dev.",
97
+ )
98
+
99
+ parser.add_argument(
100
+ "--num-jobs",
101
+ type=int,
102
+ default=20,
103
+ help="Number of jobs to processing.",
104
+ )
105
+
106
+ parser.add_argument(
107
+ "--output-dir",
108
+ type=str,
109
+ default="data/manifests",
110
+ help="The destination directory of manifest files.",
111
+ )
112
+ parser.add_argument(
113
+ "--sampling-rate",
114
+ type=int,
115
+ default=24000,
116
+ help="The target sampling rate.",
117
+ )
118
+ return parser.parse_args()
119
+
120
+
121
+ def _parse_recording(
122
+ wav_path: str,
123
+ ) -> Tuple[Recording, str]:
124
+ """
125
+ :param wav_path: Path to the audio file
126
+ :return: a tuple of "recording" and "recording_id"
127
+ """
128
+
129
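+ # e.g., "/data/wavs/a.wav" -> recording_id "_data_wavs_a_wav"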
+ recording_id = wav_path.replace("/", "_").replace(".", "_")
130
+ recording = Recording.from_file(path=wav_path, recording_id=recording_id)
131
+
132
+ return recording, recording_id
133
+
134
+
135
+ def _parse_supervision(
136
+ supervision: List, recording_dict: dict
137
+ ) -> Optional[SupervisionSegment]:
138
+ """
139
+ :param supervision: A tuple (uniq_id, text, wav_path, start, end) parsed from a TSV line
140
+ :param recording_dict: Dictionary mapping recording IDs to Recording objects
141
+ :return: A SupervisionSegment object, or None if parsing fails
142
+ """
143
+
144
+ uniq_id, text, wav_path, start, end = supervision
145
+ try:
146
+ recording_id = wav_path.replace("/", "_").replace(".", "_")
147
+
148
+ recording = recording_dict[recording_id]
149
+ duration = end - start if end is not None else recording.duration
150
+ assert duration <= recording.duration, (
151
+ f"Duration {duration} is greater than recording duration {recording.duration}")
152
+
153
+ text = re.sub("_", " ", text) # "_" is treated as padding symbol
154
+ text = re.sub(r"\s+", " ", text) # remove extra whitespace
155
+
156
+ return SupervisionSegment(
157
+ id=f"{uniq_id}",
158
+ recording_id=recording.id,
159
+ start=start,
160
+ duration=duration,
161
+ channel=recording.channel_ids,
162
+ text=text.strip(),
163
+ )
164
+ except Exception as e:
165
+ logging.warning(f"Error processing line: {e}")
166
+ return None
167
+
168
+
169
+ def prepare_dataset(
170
+ tsv_path: Pathlike,
171
+ prefix: str,
172
+ subset: str,
173
+ sampling_rate: int,
174
+ num_jobs: int,
175
+ output_dir: Pathlike,
176
+ ):
177
+ """
178
+ Builds the Recording and Supervision manifests and writes the resulting CutSet to disk.
179
+
180
+ :param tsv_path: Path to the TSV file
181
+ :param output_dir: Path where to write the manifests
182
+ :param num_jobs: Number of parallel workers (threads) for reading recordings
183
+ :return: None. The CutSet is written to "{output_dir}/{prefix}_cuts_{subset}.jsonl.gz"
184
+ """
185
+ logging.info(f"Preparing {prefix} dataset {subset} subset.")
186
+ output_dir = Path(output_dir)
187
+ output_dir.mkdir(parents=True, exist_ok=True)
188
+ file_name = f"{prefix}_cuts_{subset}.jsonl.gz"
189
+ if (output_dir / file_name).is_file():
190
+ logging.info(f"{file_name} exists, skipping.")
191
+ return
192
+
193
+ # Step 1: Read all unique recording paths
194
+ recordings_path_set = set()
195
+ supervision_list = list()
196
+ with open(tsv_path, "r") as fr:
197
+ for line in fr:
198
+ items = line.strip().split("\t")
199
+ if len(items) == 3:
200
+ uniq_id, text, wav_path = items
201
+ start, end = 0, None
202
+ elif len(items) == 5:
203
+ uniq_id, text, wav_path, start, end = items
204
+ start, end = float(start), float(end)
205
+ else:
206
+ raise ValueError(
207
+ f"Invalid line format: {line},"
208
+ "requries to be 3 columns or 5 columns"
209
+ )
210
+ recordings_path_set.add(wav_path)
211
+ supervision_list.append((uniq_id, text, wav_path, start, end))
212
+
213
+ logging.info("Starting to process recordings...")
214
+ # Step 2: Process recordings
215
+ futures = []
216
+ recording_dict = {}
217
+ with ThreadPoolExecutor(max_workers=num_jobs) as ex:
218
+ for wav_path in tqdm(recordings_path_set, desc="Submitting jobs"):
219
+ futures.append(ex.submit(_parse_recording, wav_path))
220
+
221
+ for future in tqdm(futures, desc="Processing recordings"):
222
+ try:
223
+ recording, recording_id = future.result()
224
+ recording_dict[recording_id] = recording
225
+ except Exception as e:
226
+ logging.warning(
227
+ f"Error processing a recording: {e}"
228
+ )
229
+
230
+ recording_set = RecordingSet.from_recordings(recording_dict.values())
231
+
232
+ logging.info("Starting to process supervisions...")
233
+ # Step 3: Process supervisions
234
+ supervisions = []
235
+ for supervision in tqdm(supervision_list, desc="Processing supervisions"):
236
+ seg = _parse_supervision(supervision, recording_dict)
237
+ if seg is not None:
238
+ supervisions.append(seg)
239
+
240
+ logging.info("Processing Cuts...")
241
+
242
+ # Step 4: Create and validate manifests
243
+ supervision_set = SupervisionSet.from_segments(supervisions)
244
+
245
+ recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
246
+ validate_recordings_and_supervisions(recording_set, supervision_set)
247
+
248
+ cut_set = CutSet.from_manifests(
249
+ recordings=recording_set, supervisions=supervision_set
250
+ )
251
+ cut_set = cut_set.sort_by_recording_id()
252
+ cut_set = cut_set.resample(sampling_rate)
253
+ cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
254
+
255
+ logging.info(f"Saving file to {output_dir / file_name}")
256
+ # Step 5: Write manifests to disk
257
+ cut_set.to_file(output_dir / file_name)
258
+ logging.info("Done!")
259
+
260
+
261
+ if __name__ == "__main__":
262
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
263
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
264
+
265
+ args = get_args()
266
+
267
+ prepare_dataset(
268
+ tsv_path=args.tsv_path,
269
+ prefix=args.prefix,
270
+ subset=args.subset,
271
+ sampling_rate=args.sampling_rate,
272
+ num_jobs=args.num_jobs,
273
+ output_dir=args.output_dir,
274
+ )
zipvoice/bin/prepare_tokens.py ADDED
@@ -0,0 +1,102 @@
1
+ """
2
+ This script reads the texts in the given manifest and saves new cuts with prepared tokens.
3
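+
+ Example usage (file names are illustrative):
+
+ python3 -m zipvoice.bin.prepare_tokens \
+ --input-file data/manifests/custom_cuts_train.jsonl.gz \
+ --output-file data/manifests/custom_cuts_train_tokens.jsonl.gz \
+ --tokenizer emilia \
+ --num-jobs 20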
+ """
4
+
5
+ import argparse
6
+ import logging
7
+ from functools import partial
8
+ from pathlib import Path
9
+
10
+ from lhotse import load_manifest, split_parallelize_combine
11
+
12
+ from zipvoice.tokenizer.tokenizer import add_tokens
13
+
14
+
15
+ def get_args():
16
+ parser = argparse.ArgumentParser()
17
+
18
+ parser.add_argument(
19
+ "--input-file",
20
+ type=str,
21
+ help="Input manifest without tokens",
22
+ )
23
+
24
+ parser.add_argument(
25
+ "--output-file",
26
+ type=str,
27
+ help="Output manifest with tokens.",
28
+ )
29
+
30
+ parser.add_argument(
31
+ "--num-jobs",
32
+ type=int,
33
+ default=20,
34
+ help="Number of jobs to run in parallel.",
35
+ )
36
+
37
+ parser.add_argument(
38
+ "--tokenizer",
39
+ type=str,
40
+ default="emilia",
41
+ help="The destination directory of manifest files.",
42
+ )
43
+
44
+ parser.add_argument(
45
+ "--lang",
46
+ type=str,
47
+ default="en-us",
48
+ help="Language identifier, used when tokenizer type is espeak. see"
49
+ "https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md",
50
+ )
51
+
52
+ return parser.parse_args()
53
+
54
+
55
+ def prepare_tokens(
56
+ input_file: Path,
57
+ output_file: Path,
58
+ num_jobs: int,
59
+ tokenizer: str,
60
+ lang: str = "en-us",
61
+ ):
62
+ logging.info(f"Processing {input_file}")
63
+ if output_file.is_file():
64
+ logging.info(f"{output_file} exists, skipping.")
65
+ return
66
+ logging.info(f"loading manifest from {input_file}")
67
+ cut_set = load_manifest(input_file)
68
+
69
+ _add_tokens = partial(add_tokens, tokenizer=tokenizer, lang=lang)
70
+
71
+ logging.info("Adding tokens")
72
+
73
+ cut_set = split_parallelize_combine(
74
+ num_jobs=num_jobs, manifest=cut_set, fn=_add_tokens
75
+ )
76
+
77
+ logging.info(f"Saving file to {output_file}")
78
+ cut_set.to_file(output_file)
79
+
80
+
81
+ if __name__ == "__main__":
82
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
83
+ logging.basicConfig(format=formatter, level=logging.INFO, force=True)
84
+
85
+ args = get_args()
86
+ input_file = Path(args.input_file)
87
+ output_file = Path(args.output_file)
88
+ num_jobs = args.num_jobs
89
+ tokenizer = args.tokenizer
90
+ lang = args.lang
91
+
92
+ output_file.parent.mkdir(parents=True, exist_ok=True)
93
+
94
+ prepare_tokens(
95
+ input_file=input_file,
96
+ output_file=output_file,
97
+ num_jobs=num_jobs,
98
+ tokenizer=tokenizer,
99
+ lang=lang,
100
+ )
101
+
102
+ logging.info("Done!")
zipvoice/bin/train_zipvoice.py ADDED
@@ -0,0 +1,1136 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024-2025 Xiaomi Corp. (authors: Wei Kang,
3
+ # Han Zhu)
4
+ #
5
+ # See ../../../../LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ """
20
+ This script trains a ZipVoice model with the flow-matching loss.
21
+
22
+ Usage:
23
+
24
+ python3 -m zipvoice.bin.train_zipvoice \
25
+ --world-size 8 \
26
+ --use-fp16 1 \
27
+ --num-epochs 11 \
28
+ --max-duration 500 \
29
+ --lr-hours 30000 \
30
+ --model-config conf/zipvoice_base.json \
31
+ --tokenizer emilia \
32
+ --token-file "data/tokens_emilia.txt" \
33
+ --dataset emilia \
34
+ --manifest-dir data/fbank \
35
+ --exp-dir exp/zipvoice
36
+ """
37
+
38
+ import argparse
39
+ import copy
40
+ import json
41
+ import logging
42
+ import os
43
+ from functools import partial
44
+ from pathlib import Path
45
+ from shutil import copyfile
46
+ from typing import List, Optional, Tuple, Union
47
+
48
+ import torch
49
+ import torch.multiprocessing as mp
50
+ import torch.nn as nn
51
+ from lhotse.cut import Cut, CutSet
52
+ from lhotse.utils import fix_random_seed
53
+ from torch import Tensor
54
+ from torch.amp.grad_scaler import GradScaler
55
+ from torch.nn.parallel import DistributedDataParallel as DDP
56
+ from torch.optim import Optimizer
57
+ from torch.utils.tensorboard import SummaryWriter
58
+
59
+ import zipvoice.utils.diagnostics as diagnostics
60
+ from zipvoice.dataset.datamodule import TtsDataModule
61
+ from zipvoice.models.zipvoice import ZipVoice
62
+ from zipvoice.tokenizer.tokenizer import (
63
+ EmiliaTokenizer,
64
+ EspeakTokenizer,
65
+ LibriTTSTokenizer,
66
+ SimpleTokenizer,
67
+ SimpleTokenizer2,
68
+ )
69
+ from zipvoice.utils.checkpoint import (
70
+ load_checkpoint,
71
+ remove_checkpoints,
72
+ resume_checkpoint,
73
+ save_checkpoint,
74
+ save_checkpoint_with_global_batch_idx,
75
+ update_averaged_model,
76
+ )
77
+ from zipvoice.utils.common import (
78
+ AttributeDict,
79
+ MetricsTracker,
80
+ cleanup_dist,
81
+ create_grad_scaler,
82
+ get_adjusted_batch_count,
83
+ get_env_info,
84
+ get_parameter_groups_with_lrs,
85
+ prepare_input,
86
+ set_batch_count,
87
+ setup_dist,
88
+ setup_logger,
89
+ str2bool,
90
+ torch_autocast,
91
+ )
92
+ from zipvoice.utils.hooks import register_inf_check_hooks
93
+ from zipvoice.utils.lr_scheduler import Eden, FixedLRScheduler, LRScheduler
94
+ from zipvoice.utils.optim import ScaledAdam
95
+
96
+ LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, LRScheduler]
97
+
98
+
99
+ def get_parser():
100
+ parser = argparse.ArgumentParser(
101
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
102
+ )
103
+
104
+ parser.add_argument(
105
+ "--world-size",
106
+ type=int,
107
+ default=1,
108
+ help="Number of GPUs for DDP training.",
109
+ )
110
+
111
+ parser.add_argument(
112
+ "--master-port",
113
+ type=int,
114
+ default=12356,
115
+ help="Master port to use for DDP training.",
116
+ )
117
+
118
+ parser.add_argument(
119
+ "--tensorboard",
120
+ type=str2bool,
121
+ default=True,
122
+ help="Should various information be logged in tensorboard.",
123
+ )
124
+
125
+ parser.add_argument(
126
+ "--num-epochs",
127
+ type=int,
128
+ default=11,
129
+ help="Number of epochs to train.",
130
+ )
131
+
132
+ parser.add_argument(
133
+ "--num-iters",
134
+ type=int,
135
+ default=0,
136
+ help="Number of iter to train, will ignore num_epochs if > 0.",
137
+ )
138
+
139
+ parser.add_argument(
140
+ "--start-epoch",
141
+ type=int,
142
+ default=1,
143
+ help="""Resume training from this epoch. It should be positive.
144
+ If larger than 1, it will load checkpoint from
145
+ exp-dir/epoch-{start_epoch-1}.pt
146
+ """,
147
+ )
148
+
149
+ parser.add_argument(
150
+ "--checkpoint",
151
+ type=str,
152
+ default=None,
153
+ help="""Checkpoints of pre-trained models, will load it if not None
154
+ """,
155
+ )
156
+
157
+ parser.add_argument(
158
+ "--exp-dir",
159
+ type=str,
160
+ default="exp/zipvoice",
161
+ help="""The experiment dir.
162
+ It specifies the directory where all training related
163
+ files, e.g., checkpoints, log, etc, are saved
164
+ """,
165
+ )
166
+
167
+ parser.add_argument(
168
+ "--base-lr", type=float, default=0.02, help="The base learning rate."
169
+ )
170
+
171
+ parser.add_argument(
172
+ "--lr-batches",
173
+ type=float,
174
+ default=7500,
175
+ help="""Number of steps that affects how rapidly the learning rate
176
+ decreases. We suggest not changing this."""
177
+ )
178
+
179
+ parser.add_argument(
180
+ "--lr-epochs",
181
+ type=float,
182
+ default=10,
183
+ help="""Number of epochs that affects how rapidly the learning rate decreases.
184
+ """,
185
+ )
186
+
187
+ parser.add_argument(
188
+ "--lr-hours",
189
+ type=float,
190
+ default=0,
191
+ help="""If positive, --epoch is ignored and it specifies the number of hours
192
+ that affects how rapidly the learning rate decreases.
193
+ """,
194
+ )
195
+
196
+ parser.add_argument(
197
+ "--ref-duration",
198
+ type=float,
199
+ default=50,
200
+ help="""Reference batch duration for purposes of adjusting batch counts for"
201
+ setting various schedules inside the model".
202
+ """,
203
+ )
204
+
205
+ parser.add_argument(
206
+ "--finetune",
207
+ type=str2bool,
208
+ default=False,
209
+ help="Whether to use the fine-tuning mode, will used a fixed learning rate "
210
+ "schedule and skip the large dropout phase.",
211
+ )
212
+
213
+ parser.add_argument(
214
+ "--seed",
215
+ type=int,
216
+ default=42,
217
+ help="The seed for random generators intended for reproducibility",
218
+ )
219
+
220
+ parser.add_argument(
221
+ "--print-diagnostics",
222
+ type=str2bool,
223
+ default=False,
224
+ help="Accumulate stats on activations, print them and exit.",
225
+ )
226
+
227
+ parser.add_argument(
228
+ "--scan-oom",
229
+ type=str2bool,
230
+ default=False,
231
+ help="Scan pessimistic batches to see whether they cause OOMs.",
232
+ )
233
+
234
+ parser.add_argument(
235
+ "--inf-check",
236
+ type=str2bool,
237
+ default=False,
238
+ help="Add hooks to check for infinite module outputs and gradients.",
239
+ )
240
+
241
+ parser.add_argument(
242
+ "--save-every-n",
243
+ type=int,
244
+ default=5000,
245
+ help="""Save checkpoint after processing this number of batches"
246
+ periodically. We save checkpoint to exp-dir/ whenever
247
+ params.batch_idx_train % save_every_n == 0. The checkpoint filename
248
+ has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
249
+ Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
250
+ end of each epoch where `xxx` is the epoch number counting from 1.
251
+ """,
252
+ )
253
+
254
+ parser.add_argument(
255
+ "--valid-by-epoch",
256
+ type=str2bool,
257
+ default=False,
258
+ help="""Whether to validate after each epoch. If False, will validate
259
+ after every save_every_n iterations.
260
+ """,
261
+ )
262
+
263
+ parser.add_argument(
264
+ "--keep-last-k",
265
+ type=int,
266
+ default=30,
267
+ help="""Only keep this number of checkpoints on disk.
268
+ For instance, if it is 3, there are only 3 checkpoints
269
+ in the exp-dir with filenames `checkpoint-xxx.pt`.
270
+ It does not affect checkpoints with name `epoch-xxx.pt`.
271
+ """,
272
+ )
273
+
274
+ parser.add_argument(
275
+ "--average-period",
276
+ type=int,
277
+ default=200,
278
+ help="""Update the averaged model, namely `model_avg`, after processing
279
+ this number of batches. `model_avg` is a separate version of model,
280
+ in which each floating-point parameter is the average of all the
281
+ parameters from the start of training. Each time we take the average,
282
+ we do: `model_avg = model * (average_period / batch_idx_train) +
283
+ model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
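+ For example, at batch_idx_train=1000 with average_period=200, the update is
+ model * 0.2 + model_avg * 0.8.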
284
+ """,
285
+ )
286
+
287
+ parser.add_argument(
288
+ "--use-fp16",
289
+ type=str2bool,
290
+ default=True,
291
+ help="Whether to use half precision training.",
292
+ )
293
+
294
+ parser.add_argument(
295
+ "--feat-scale",
296
+ type=float,
297
+ default=0.1,
298
+ help="The scale factor of fbank feature",
299
+ )
300
+
301
+ parser.add_argument(
302
+ "--condition-drop-ratio",
303
+ type=float,
304
+ default=0.2,
305
+ help="The drop rate of text condition during training.",
306
+ )
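+ # Dropping the text condition during training presumably enables
+ # classifier-free guidance at inference.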
307
+
308
+ parser.add_argument(
309
+ "--dataset",
310
+ type=str,
311
+ default="emilia",
312
+ choices=["emilia", "libritts", "custom"],
313
+ help="The used training dataset",
314
+ )
315
+
316
+ parser.add_argument(
317
+ "--train-manifest",
318
+ type=str,
319
+ help="Path of the training manifest",
320
+ )
321
+
322
+ parser.add_argument(
323
+ "--dev-manifest",
324
+ type=str,
325
+ help="Path of the validation manifest",
326
+ )
327
+
328
+ parser.add_argument(
329
+ "--min-len",
330
+ type=float,
331
+ default=1.0,
332
+ help="The minimum audio length used for training",
333
+ )
334
+
335
+ parser.add_argument(
336
+ "--max-len",
337
+ type=float,
338
+ default=30.0,
339
+ help="The maximum audio length used for training",
340
+ )
341
+
342
+ parser.add_argument(
343
+ "--model-config",
344
+ type=str,
345
+ default="conf/zipvoice_base.json",
346
+ help="The model configuration file.",
347
+ )
348
+
349
+ parser.add_argument(
350
+ "--tokenizer",
351
+ type=str,
352
+ default="emilia",
353
+ help="Tokenizer type.",
354
+ )
355
+
356
+ parser.add_argument(
357
+ "--lang",
358
+ type=str,
359
+ default="en-us",
360
+ help="Language identifier, used when tokenizer type is espeak. see"
361
+ "https://github.com/rhasspy/espeak-ng/blob/master/docs/languages.md",
362
+ )
363
+
364
+ parser.add_argument(
365
+ "--token-file",
366
+ type=str,
367
+ default="data/tokens_emilia.txt",
368
+ help="The file that contains information that maps tokens to ids,"
369
+ "which is a text file with '{token}\t{token_id}' per line.",
370
+ )
371
+
372
+ return parser
373
+
374
+
375
+ def get_params() -> AttributeDict:
376
+ """Return a dict containing training parameters.
377
+
378
+ All training related parameters that are not passed from the commandline
379
+ are saved in the variable `params`.
380
+
381
+ Commandline options are merged into `params` after they are parsed, so
382
+ you can also access them via `params`.
383
+
384
+ Explanation of options saved in `params`:
385
+
386
+ - best_train_loss: Best training loss so far. It is used to select
387
+ the model that has the lowest training loss. It is
388
+ updated during the training.
389
+
390
+ - best_valid_loss: Best validation loss so far. It is used to select
391
+ the model that has the lowest validation loss. It is
392
+ updated during the training.
393
+
394
+ - best_train_epoch: It is the epoch that has the best training loss.
395
+
396
+ - best_valid_epoch: It is the epoch that has the best validation loss.
397
+
398
+ - batch_idx_train: Used to write statistics to tensorboard. It
399
+ contains the number of batches trained so far across
400
+ epochs.
401
+
402
+ - log_interval: Print training loss if batch_idx % log_interval is 0
403
+
404
+ - reset_interval: Reset statistics if batch_idx % reset_interval is 0
405
+
406
+ - env_info: A dict containing information about the environment.
407
+
408
+ """
409
+ params = AttributeDict(
410
+ {
411
+ "best_train_loss": float("inf"),
412
+ "best_valid_loss": float("inf"),
413
+ "best_train_epoch": -1,
414
+ "best_valid_epoch": -1,
415
+ "batch_idx_train": 0,
416
+ "log_interval": 50,
417
+ "reset_interval": 200,
418
+ "env_info": get_env_info(),
419
+ }
420
+ )
421
+
422
+ return params
423
+
424
+
425
+ def compute_fbank_loss(
426
+ params: AttributeDict,
427
+ model: Union[nn.Module, DDP],
428
+ features: Tensor,
429
+ features_lens: Tensor,
430
+ tokens: List[List[int]],
431
+ is_training: bool,
432
+ ) -> Tuple[Tensor, MetricsTracker]:
433
+ """
434
+ Compute loss given the model and its inputs.
435
+
436
+ Args:
437
+ params:
438
+ Parameters for training. See :func:`get_params`.
439
+ model:
440
+ The model for training.
441
+ features:
442
+ The target acoustic feature.
443
+ features_lens:
444
+ The number of frames of each utterance.
445
+ tokens:
446
+ Input tokens representing the transcripts.
447
+ is_training:
448
+ True for training. False for validation. When it is True, this
449
+ function enables autograd during computation; when it is False, it
450
+ disables autograd.
451
+ """
452
+
453
+ device = model.device if isinstance(model, DDP) else next(model.parameters()).device
454
+
455
+ batch_size, num_frames, _ = features.shape
456
+
457
+ features = torch.nn.functional.pad(
458
+ features, (0, 0, 0, num_frames - features.size(1))
459
+ ) # (B, T, F)
460
+ noise = torch.randn_like(features) # (B, T, F)
461
+
462
+ # Sampling t from uniform distribution
463
+ if is_training:
464
+ t = torch.rand(batch_size, 1, 1, device=device)
465
+ else:
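+ # At validation, use fixed evenly spaced t values for a deterministic loss.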
466
+ t = (
467
+ (torch.arange(batch_size, device=device) / batch_size)
468
+ .unsqueeze(1)
469
+ .unsqueeze(2)
470
+ )
471
+ with torch.set_grad_enabled(is_training):
472
+
473
+ loss = model(
474
+ tokens=tokens,
475
+ features=features,
476
+ features_lens=features_lens,
477
+ noise=noise,
478
+ t=t,
479
+ condition_drop_ratio=params.condition_drop_ratio,
480
+ )
481
+
482
+ assert loss.requires_grad == is_training
483
+ info = MetricsTracker()
484
+ num_frames = features_lens.sum().item()
485
+ info["frames"] = num_frames
486
+ info["loss"] = loss.detach().cpu().item() * num_frames
487
+
488
+ return loss, info
489
+
490
+
491
+ def train_one_epoch(
492
+ params: AttributeDict,
493
+ model: Union[nn.Module, DDP],
494
+ optimizer: Optimizer,
495
+ scheduler: LRSchedulerType,
496
+ train_dl: torch.utils.data.DataLoader,
497
+ valid_dl: torch.utils.data.DataLoader,
498
+ scaler: GradScaler,
499
+ model_avg: Optional[nn.Module] = None,
500
+ tb_writer: Optional[SummaryWriter] = None,
501
+ world_size: int = 1,
502
+ rank: int = 0,
503
+ ) -> None:
504
+ """Train the model for one epoch.
505
+
506
+ The training loss from the mean of all frames is saved in
507
+ `params.train_loss`. It runs the validation process every
508
+ `params.valid_interval` batches or once per epoch.
509
+
510
+ Args:
511
+ params:
512
+ It is returned by :func:`get_params`.
513
+ model:
514
+ The model for training.
515
+ optimizer:
516
+ The optimizer.
517
+ scheduler:
518
+ The learning rate scheduler, we call step() every epoch.
519
+ train_dl:
520
+ Dataloader for the training dataset.
521
+ valid_dl:
522
+ Dataloader for the validation dataset.
523
+ scaler:
524
+ The scaler used for mixed precision training.
525
+ tb_writer:
526
+ Writer to write log messages to tensorboard.
527
+ world_size:
528
+ Number of nodes in DDP training. If it is 1, DDP is disabled.
529
+ rank:
530
+ The rank of the node in DDP training. If no DDP is used, it should
531
+ be set to 0.
532
+ """
533
+ model.train()
534
+ device = model.device if isinstance(model, DDP) else next(model.parameters()).device
535
+
536
+ # used to track the stats over iterations in one epoch
537
+ tot_loss = MetricsTracker()
538
+
539
+ saved_bad_model = False
540
+
541
+ def save_bad_model(suffix: str = ""):
542
+ save_checkpoint(
543
+ filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt",
544
+ model=model,
545
+ model_avg=model_avg,
546
+ params=params,
547
+ optimizer=optimizer,
548
+ scheduler=scheduler,
549
+ sampler=train_dl.sampler,
550
+ scaler=scaler,
551
+ rank=0,
552
+ )
553
+
554
+ for batch_idx, batch in enumerate(train_dl):
555
+
556
+ if batch_idx % 10 == 0:
557
+ if params.finetune:
558
+ set_batch_count(model, get_adjusted_batch_count(params) + 100000)
559
+ else:
560
+ set_batch_count(model, get_adjusted_batch_count(params))
561
+
562
+ if (
563
+ params.valid_by_epoch and batch_idx == 0 and not params.print_diagnostics
564
+ ) or (
565
+ not params.valid_by_epoch
566
+ and params.batch_idx_train % params.valid_interval == 0
567
+ and not params.print_diagnostics
568
+ ):
569
+ logging.info("Computing validation loss")
570
+ valid_info = compute_validation_loss(
571
+ params=params,
572
+ model=model,
573
+ valid_dl=valid_dl,
574
+ world_size=world_size,
575
+ )
576
+ model.train()
577
+ logging.info(
578
+ f"Epoch {params.cur_epoch}, global_batch_idx: {params.batch_idx_train},"
579
+ f" validation: {valid_info}"
580
+ )
581
+ logging.info(
582
+ f"Maximum memory allocated so far is "
583
+ f"{torch.cuda.max_memory_allocated() // 1000000}MB"
584
+ )
585
+ if tb_writer is not None:
586
+ valid_info.write_summary(
587
+ tb_writer, "train/valid_", params.batch_idx_train
588
+ )
589
+
590
+ params.batch_idx_train += 1
591
+
592
+ batch_size = len(batch["text"])
593
+
594
+ tokens, features, features_lens = prepare_input(
595
+ params=params,
596
+ batch=batch,
597
+ device=device,
598
+ return_tokens=True,
599
+ return_feature=True,
600
+ )
601
+
602
+ try:
603
+ with torch_autocast(dtype=torch.float16, enabled=params.use_fp16):
604
+ loss, loss_info = compute_fbank_loss(
605
+ params=params,
606
+ model=model,
607
+ features=features,
608
+ features_lens=features_lens,
609
+ tokens=tokens,
610
+ is_training=True,
611
+ )
612
+
613
+ tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
614
+
615
+ scaler.scale(loss).backward()
616
+
617
+ scheduler.step_batch(params.batch_idx_train)
618
+ # Use the number of hours of speech to adjust the learning rate
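+ # e.g., 10000 batches * 500 s of audio per batch * 8 GPUs / 3600 = ~11111 hours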
619
+ if params.lr_hours > 0:
620
+ scheduler.step_epoch(
621
+ params.batch_idx_train
622
+ * params.max_duration
623
+ * params.world_size
624
+ / 3600
625
+ )
626
+ scaler.step(optimizer)
627
+ scaler.update()
628
+ optimizer.zero_grad()
629
+ except Exception as e:
630
+ logging.info(f"Caught exception : {e}.")
631
+ save_bad_model()
632
+ raise
633
+
634
+ if params.print_diagnostics and batch_idx == 5:
635
+ return
636
+
637
+ if (
638
+ rank == 0
639
+ and params.batch_idx_train > 0
640
+ and params.batch_idx_train % params.average_period == 0
641
+ ):
642
+ update_averaged_model(
643
+ params=params,
644
+ model_cur=model,
645
+ model_avg=model_avg,
646
+ )
647
+
648
+ if (
649
+ params.batch_idx_train > 0
650
+ and params.batch_idx_train % params.save_every_n == 0
651
+ ):
652
+ save_checkpoint_with_global_batch_idx(
653
+ out_dir=params.exp_dir,
654
+ global_batch_idx=params.batch_idx_train,
655
+ model=model,
656
+ model_avg=model_avg,
657
+ params=params,
658
+ optimizer=optimizer,
659
+ scheduler=scheduler,
660
+ sampler=train_dl.sampler,
661
+ scaler=scaler,
662
+ rank=rank,
663
+ )
664
+ remove_checkpoints(
665
+ out_dir=params.exp_dir,
666
+ topk=params.keep_last_k,
667
+ rank=rank,
668
+ )
669
+ if params.num_iters > 0 and params.batch_idx_train > params.num_iters:
670
+ break
671
+ if params.batch_idx_train % 100 == 0 and params.use_fp16:
672
+ # If the grad scale was less than 1, try increasing it. The _growth_interval
673
+ # of the grad scaler is configurable, but we can't configure it to have
674
+ # different behavior depending on the current grad scale.
675
+ cur_grad_scale = scaler._scale.item()
676
+
677
+ if cur_grad_scale < 1024.0 or (
678
+ cur_grad_scale < 4096.0 and params.batch_idx_train % 400 == 0
679
+ ):
680
+ scaler.update(cur_grad_scale * 2.0)
681
+ if cur_grad_scale < 0.01:
682
+ if not saved_bad_model:
683
+ save_bad_model(suffix="-first-warning")
684
+ saved_bad_model = True
685
+ logging.warning(f"Grad scale is small: {cur_grad_scale}")
686
+ if cur_grad_scale < 1.0e-05:
687
+ save_bad_model()
688
+ raise RuntimeError(
689
+ f"grad_scale is too small, exiting: {cur_grad_scale}"
690
+ )
691
+
692
+ if params.batch_idx_train % params.log_interval == 0:
693
+ cur_lr = max(scheduler.get_last_lr())
694
+ cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
695
+
696
+ logging.info(
697
+ f"Epoch {params.cur_epoch}, batch {batch_idx}, "
698
+ f"global_batch_idx: {params.batch_idx_train}, "
699
+ f"batch size: {batch_size}, "
700
+ f"loss[{loss_info}], tot_loss[{tot_loss}], "
701
+ f"cur_lr: {cur_lr:.2e}, "
702
+ + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
703
+ )
704
+
705
+ if tb_writer is not None:
706
+ tb_writer.add_scalar(
707
+ "train/learning_rate", cur_lr, params.batch_idx_train
708
+ )
709
+ loss_info.write_summary(
710
+ tb_writer, "train/current_", params.batch_idx_train
711
+ )
712
+ tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
713
+ if params.use_fp16:
714
+ tb_writer.add_scalar(
715
+ "train/grad_scale",
716
+ cur_grad_scale,
717
+ params.batch_idx_train,
718
+ )
719
+
720
+ loss_value = tot_loss["loss"]
721
+ params.train_loss = loss_value
722
+ if params.train_loss < params.best_train_loss:
723
+ params.best_train_epoch = params.cur_epoch
724
+ params.best_train_loss = params.train_loss
725
+
726
+
727
+ def compute_validation_loss(
728
+ params: AttributeDict,
729
+ model: Union[nn.Module, DDP],
730
+ valid_dl: torch.utils.data.DataLoader,
731
+ world_size: int = 1,
732
+ ) -> MetricsTracker:
733
+ """Run the validation process."""
734
+
735
+ model.eval()
736
+ device = model.device if isinstance(model, DDP) else next(model.parameters()).device
737
+
738
+ # used to summary the stats over iterations
739
+ tot_loss = MetricsTracker()
740
+
741
+ for batch_idx, batch in enumerate(valid_dl):
742
+ tokens, features, features_lens = prepare_input(
743
+ params=params,
744
+ batch=batch,
745
+ device=device,
746
+ return_tokens=True,
747
+ return_feature=True,
748
+ )
749
+
750
+ loss, loss_info = compute_fbank_loss(
751
+ params=params,
752
+ model=model,
753
+ features=features,
754
+ features_lens=features_lens,
755
+ tokens=tokens,
756
+ is_training=False,
757
+ )
758
+ assert loss.requires_grad is False
759
+ tot_loss = tot_loss + loss_info
760
+
761
+ if world_size > 1:
762
+ tot_loss.reduce(loss.device)
763
+
764
+ loss_value = tot_loss["loss"]
765
+ if loss_value < params.best_valid_loss:
766
+ params.best_valid_epoch = params.cur_epoch
767
+ params.best_valid_loss = loss_value
768
+
769
+ return tot_loss
770
+
771
+
772
+ def display_and_save_batch(
773
+ batch: dict,
774
+ params: AttributeDict,
775
+ ) -> None:
776
+ """Display the batch statistics and save the batch into disk.
777
+
778
+ Args:
779
+ batch:
780
+ A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
781
+ for the content in it.
782
+ params:
783
+ Parameters for training. See :func:`get_params`.
784
786
+ """
787
+ from lhotse.utils import uuid4
788
+
789
+ filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
790
+ logging.info(f"Saving batch to {filename}")
791
+ torch.save(batch, filename)
792
+
793
+ features = batch["features"]
794
+ tokens = batch["tokens"]
795
+
796
+ logging.info(f"features shape: {features.shape}")
797
+ num_tokens = sum(len(i) for i in tokens)
798
+ logging.info(f"num tokens: {num_tokens}")
799
+
800
+
801
+ def scan_pessimistic_batches_for_oom(
802
+ model: Union[nn.Module, DDP],
803
+ train_dl: torch.utils.data.DataLoader,
804
+ optimizer: torch.optim.Optimizer,
805
+ params: AttributeDict,
806
+ ):
807
+ from lhotse.dataset import find_pessimistic_batches
808
+
809
+ logging.info(
810
+ "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
811
+ )
812
+ device = model.device if isinstance(model, DDP) else next(model.parameters()).device
813
+
814
+ batches, crit_values = find_pessimistic_batches(train_dl.sampler)
815
+ for criterion, cuts in batches.items():
816
+ batch = train_dl.dataset[cuts]
817
+ tokens, features, features_lens = prepare_input(
818
+ params=params,
819
+ batch=batch,
820
+ device=device,
821
+ return_tokens=True,
822
+ return_feature=True,
823
+ )
824
+ try:
825
+ with torch_autocast(dtype=torch.float16, enabled=params.use_fp16):
826
+
827
+ loss, loss_info = compute_fbank_loss(
828
+ params=params,
829
+ model=model,
830
+ features=features,
831
+ features_lens=features_lens,
832
+ tokens=tokens,
833
+ is_training=True,
834
+ )
835
+ loss.backward()
836
+ optimizer.zero_grad()
837
+ except Exception as e:
838
+ if "CUDA out of memory" in str(e):
839
+ logging.error(
840
+ "Your GPU ran out of memory with the current "
841
+ "max_duration setting. We recommend decreasing "
842
+ "max_duration and trying again.\n"
843
+ f"Failing criterion: {criterion} "
844
+ f"(={crit_values[criterion]}) ..."
845
+ )
846
+ display_and_save_batch(batch, params=params)
847
+ raise
848
+ logging.info(
849
+ f"Maximum memory allocated so far is "
850
+ f"{torch.cuda.max_memory_allocated() // 1000000}MB"
851
+ )
852
+
853
+
854
+ def tokenize_text(c: Cut, tokenizer):
855
+ if hasattr(c.supervisions[0], "tokens"):
856
+ tokens = tokenizer.tokens_to_token_ids([c.supervisions[0].tokens])
857
+ else:
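+ # fall back to tokenizing the raw text when precomputed tokens are absent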
858
+ tokens = tokenizer.texts_to_token_ids([c.supervisions[0].text])
859
+ print("ko tΓ¬m được tokens")
860
+ c.supervisions[0].tokens = tokens[0]
861
+ return c
862
+
863
+
864
+ def run(rank, world_size, args):
865
+ """
866
+ Args:
867
+ rank:
868
+ It is a value between 0 and `world_size-1`, which is
869
+ passed automatically by `mp.spawn()` in :func:`main`.
870
+ The node with rank 0 is responsible for saving checkpoint.
871
+ world_size:
872
+ Number of GPUs for DDP training.
873
+ args:
874
+ The return value of get_parser().parse_args()
875
+ """
876
+ params = get_params()
877
+ params.update(vars(args))
878
+ params.valid_interval = params.save_every_n
879
+ # Set epoch to a large number to ignore it.
880
+ if params.num_iters > 0:
881
+ params.num_epochs = 1000000
882
+ with open(params.model_config, "r") as f:
883
+ model_config = json.load(f)
884
+ params.update(model_config["model"])
885
+ params.update(model_config["feature"])
886
+
887
+ fix_random_seed(params.seed)
888
+ if world_size > 1:
889
+ setup_dist(rank, world_size, params.master_port)
890
+
891
+ os.makedirs(f"{params.exp_dir}", exist_ok=True)
892
+ copyfile(src=params.model_config, dst=f"{params.exp_dir}/model.json")
893
+ copyfile(src=params.token_file, dst=f"{params.exp_dir}/tokens.txt")
894
+ setup_logger(f"{params.exp_dir}/log/log-train")
895
+
896
+ if args.tensorboard and rank == 0:
897
+ tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
898
+ else:
899
+ tb_writer = None
900
+
901
+ if torch.cuda.is_available():
902
+ params.device = torch.device("cuda", rank)
903
+ else:
904
+ params.device = torch.device("cpu")
905
+ logging.info(f"Device: {params.device}")
906
+
907
+ if params.tokenizer == "emilia":
908
+ tokenizer = EmiliaTokenizer(token_file=params.token_file)
909
+ elif params.tokenizer == "libritts":
910
+ tokenizer = LibriTTSTokenizer(token_file=params.token_file)
911
+ elif params.tokenizer == "espeak":
912
+ tokenizer = EspeakTokenizer(token_file=params.token_file, lang=params.lang)
913
+ elif params.tokenizer == "simple2":
914
+ tokenizer = SimpleTokenizer2(token_file=params.token_file)
915
+ else:
916
+ assert params.tokenizer == "simple"
917
+ tokenizer = SimpleTokenizer(token_file=params.token_file)
918
+
919
+ tokenizer_config = {"vocab_size": tokenizer.vocab_size, "pad_id": tokenizer.pad_id}
920
+ params.update(tokenizer_config)
921
+
922
+ logging.info(params)
923
+
924
+ logging.info("About to create model")
925
+
926
+ model = ZipVoice(
927
+ **model_config["model"],
928
+ **tokenizer_config,
929
+ )
930
+
931
+ if params.checkpoint is not None:
932
+ logging.info(f"Loading pre-trained model from {params.checkpoint}")
933
+ _ = load_checkpoint(filename=params.checkpoint, model=model, strict=True)
934
+ num_param = sum([p.numel() for p in model.parameters()])
935
+ logging.info(f"Number of parameters : {num_param}")
936
+
937
+ model_avg: Optional[nn.Module] = None
938
+ if rank == 0:
939
+ # model_avg is only used with rank 0
940
+ model_avg = copy.deepcopy(model).to(torch.float64)
941
+
942
+ assert params.start_epoch > 0, params.start_epoch
943
+ if params.start_epoch > 1:
944
+ checkpoints = resume_checkpoint(params=params, model=model, model_avg=model_avg)
945
+
946
+ model = model.to(params.device)
947
+ if world_size > 1:
948
+ logging.info("Using DDP")
949
+ model = DDP(model, device_ids=[rank], find_unused_parameters=True)
950
+
951
+ optimizer = ScaledAdam(
952
+ get_parameter_groups_with_lrs(
953
+ model,
954
+ lr=params.base_lr,
955
+ include_names=True,
956
+ ),
957
+ lr=params.base_lr, # should have no effect
958
+ clipping_scale=2.0,
959
+ )
960
+
961
+ assert params.lr_hours >= 0
962
+
963
+ if params.finetune:
964
+ scheduler = FixedLRScheduler(optimizer)
965
+ elif params.lr_hours > 0:
966
+ scheduler = Eden(optimizer, params.lr_batches, params.lr_hours)
967
+ else:
968
+ scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
969
+
970
+ scaler = create_grad_scaler(enabled=params.use_fp16)
971
+
972
+ if params.start_epoch > 1 and checkpoints is not None:
973
+ # load state_dict for optimizers
974
+ if "optimizer" in checkpoints:
975
+ logging.info("Loading optimizer state dict")
976
+ optimizer.load_state_dict(checkpoints["optimizer"])
977
+
978
+ # load state_dict for schedulers
979
+ if "scheduler" in checkpoints:
980
+ logging.info("Loading scheduler state dict")
981
+ scheduler.load_state_dict(checkpoints["scheduler"])
982
+
983
+ if "grad_scaler" in checkpoints:
984
+ logging.info("Loading grad scaler state dict")
985
+ scaler.load_state_dict(checkpoints["grad_scaler"])
986
+
987
+ if params.print_diagnostics:
988
+ opts = diagnostics.TensorDiagnosticOptions(
989
+ 512
990
+ ) # allow 4 megabytes per sub-module
991
+ diagnostic = diagnostics.attach_diagnostics(model, opts)
992
+
993
+ if params.inf_check:
994
+ register_inf_check_hooks(model)
995
+
996
+ def remove_short_and_long_utt(c: Cut, min_len: float, max_len: float):
997
+ if c.duration < min_len or c.duration > max_len:
998
+ return False
999
+ return True
1000
+
1001
+ _remove_short_and_long_utt = partial(
1002
+ remove_short_and_long_utt, min_len=params.min_len, max_len=params.max_len
1003
+ )
1004
+
1005
+ datamodule = TtsDataModule(args)
1006
+ if params.dataset == "emilia":
1007
+ train_cuts = CutSet.mux(
1008
+ datamodule.train_emilia_EN_cuts(),
1009
+ datamodule.train_emilia_ZH_cuts(),
1010
+ weights=[46000, 49000],
1011
+ )
1012
+ train_cuts = train_cuts.filter(_remove_short_and_long_utt)
1013
+ dev_cuts = CutSet.mux(
1014
+ datamodule.dev_emilia_EN_cuts(),
1015
+ datamodule.dev_emilia_ZH_cuts(),
1016
+ weights=[0.5, 0.5],
1017
+ )
1018
+ elif params.dataset == "libritts":
1019
+ train_cuts = datamodule.train_libritts_cuts()
1020
+ train_cuts = train_cuts.filter(_remove_short_and_long_utt)
1021
+ dev_cuts = datamodule.dev_libritts_cuts()
1022
+ else:
1023
+ assert params.dataset == "custom"
1024
+ train_cuts = datamodule.train_custom_cuts(params.train_manifest)
1025
+ train_cuts = train_cuts.filter(_remove_short_and_long_utt)
1026
+ dev_cuts = datamodule.dev_custom_cuts(params.dev_manifest)
1027
+ # To avoid OOM issues due to too long dev cuts
1028
+ dev_cuts = dev_cuts.filter(_remove_short_and_long_utt)
1029
+
1030
+ if params.tokenizer in ["emilia", "espeak", "dialog"]:
1031
+ if not hasattr(train_cuts[0].supervisions[0], "tokens") or not hasattr(
1032
+ dev_cuts[0].supervisions[0], "tokens"
1033
+ ):
1034
+ logging.warning(
1035
+ f"Using {params.tokenizer} tokenizer but tokens are not prepared,"
1036
+ f"will tokenize on-the-fly, which can slow down training significantly."
1037
+ )
1038
+ _tokenize_text = partial(tokenize_text, tokenizer=tokenizer)
1039
+ train_cuts = train_cuts.map(_tokenize_text)
1040
+ dev_cuts = dev_cuts.map(_tokenize_text)
1041
+
1042
+ train_dl = datamodule.train_dataloaders(train_cuts)
1043
+
1044
+ valid_dl = datamodule.dev_dataloaders(dev_cuts)
1045
+
1046
+ if params.scan_oom:
1047
+ scan_pessimistic_batches_for_oom(
1048
+ model=model,
1049
+ train_dl=train_dl,
1050
+ optimizer=optimizer,
1051
+ params=params,
1052
+ )
1053
+
1054
+ logging.info("Training started")
1055
+
1056
+ for epoch in range(params.start_epoch, params.num_epochs + 1):
1057
+ logging.info(f"Start epoch {epoch}")
1058
+
1059
+ if params.lr_hours == 0:
1060
+ scheduler.step_epoch(epoch - 1)
1061
+ fix_random_seed(params.seed + epoch - 1)
1062
+ train_dl.sampler.set_epoch(epoch - 1)
1063
+
1064
+ params.cur_epoch = epoch
1065
+
1066
+ if tb_writer is not None:
1067
+ tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
1068
+
1069
+ train_one_epoch(
1070
+ params=params,
1071
+ model=model,
1072
+ model_avg=model_avg,
1073
+ optimizer=optimizer,
1074
+ scheduler=scheduler,
1075
+ train_dl=train_dl,
1076
+ valid_dl=valid_dl,
1077
+ scaler=scaler,
1078
+ tb_writer=tb_writer,
1079
+ world_size=world_size,
1080
+ rank=rank,
1081
+ )
1082
+
1083
+ if params.num_iters > 0 and params.batch_idx_train > params.num_iters:
1084
+ break
1085
+
1086
+ if params.print_diagnostics:
1087
+ diagnostic.print_diagnostics()
1088
+ break
1089
+
1090
+ filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
1091
+ save_checkpoint(
1092
+ filename=filename,
1093
+ params=params,
1094
+ model=model,
1095
+ model_avg=model_avg,
1096
+ optimizer=optimizer,
1097
+ scheduler=scheduler,
1098
+ sampler=train_dl.sampler,
1099
+ scaler=scaler,
1100
+ rank=rank,
1101
+ )
1102
+
1103
+ if rank == 0:
1104
+ if params.best_train_epoch == params.cur_epoch:
1105
+ best_train_filename = params.exp_dir / "best-train-loss.pt"
1106
+ copyfile(src=filename, dst=best_train_filename)
1107
+
1108
+ if params.best_valid_epoch == params.cur_epoch:
1109
+ best_valid_filename = params.exp_dir / "best-valid-loss.pt"
1110
+ copyfile(src=filename, dst=best_valid_filename)
1111
+
1112
+ logging.info("Done!")
1113
+
1114
+ if world_size > 1:
1115
+ torch.distributed.barrier()
1116
+ cleanup_dist()
1117
+
1118
+
1119
+ def main():
1120
+ parser = get_parser()
1121
+ TtsDataModule.add_arguments(parser)
1122
+ args = parser.parse_args()
1123
+ args.exp_dir = Path(args.exp_dir)
1124
+
1125
+ world_size = args.world_size
1126
+ assert world_size >= 1
1127
+ if world_size > 1:
1128
+ mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
1129
+ else:
1130
+ run(rank=0, world_size=1, args=args)
1131
+
1132
+
1133
+ if __name__ == "__main__":
1134
+ torch.set_num_threads(1)
1135
+ torch.set_num_interop_threads(1)
1136
+ main()