mie237 committed 22 days ago

Commit

bedfeec

verified ·

1 Parent(s): dfc0841

Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

.gitattributes +1 -0
LICENSE +201 -0
README.md +93 -62
README_zh.md +87 -57
attention.py +187 -0
config.json +63 -0
configuration_dasheng_audiogen.py +99 -0
content_adapter.py +115 -0
dit.py +1153 -0
model.safetensors +2 -2
modeling_dasheng_audiogen.py +549 -0
modules.py +218 -0
scheduler.py +61 -0
special_tokens_map.json +23 -0
spiece.model +3 -0
tokenizer.json +3 -0
tokenizer_config.json +840 -0
utils.py +50 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [2026] [MiLM Plus, Xiaomi Inc.]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,4 +1,5 @@
 ---
 language:
   - en
   - es
@@ -8,40 +9,38 @@ language:
   - ja
   - ko
   - de
-license: apache-2.0
 tags:
   - audio-generation
   - text-to-audio
-  - speech-synthesis
-  - music-generation
   - sound-effects
-  - flow-matching
-  - diffusion-transformer
   - multilingual
 pipeline_tag: text-to-audio
 ---
 # Dasheng-AudioGen-Multilingual
-[**English**](./README.md) | [**中文**](./README_zh.md)
-**Dasheng-AudioGen-Multilingual** is the multilingual variant of [Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen). It replaces the text encoder with `google/mt5-large`, enabling text-to-audio generation from prompts in multiple languages.
-- GitHub: [https://github.com/xiaomi-research/dasheng-audiogen](https://github.com/xiaomi-research/dasheng-audiogen)
-- Demo: [https://huggingface.co/spaces/mispeech/Dasheng-AudioGen](https://huggingface.co/spaces/mispeech/Dasheng-AudioGen)
-- Web Demo: [https://nieeim.github.io/Dasheng-AudioGen-Web/](https://nieeim.github.io/Dasheng-AudioGen-Web/)
-- Base model: [mispeech/Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen)
-## Differences from Base Model
-| | Dasheng-AudioGen | Dasheng-AudioGen-Multilingual |
-|---|---|---|
-| Text encoder | `google/flan-t5-large` | `google/mt5-large` |
-| Language support | English | Multilingual |
-## Supported Languages
-Training data language distribution:
 | Language | Duration (h) | Proportion |
 |----------|------------:|----------:|
@@ -55,76 +54,108 @@ Training data language distribution:
 | German | 842.29 | 3.23% |
 | Other | 1,369.16 | 5.24% |
-> **Note:** The current multilingual model has notably higher synthesis error rates for all non-English languages. Languages outside the table above are even less reliable. For English-only use cases, the base model ([mispeech/Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen)) is recommended.
-## Files
-| File | Description |
-|------|-------------|
-| `model.safetensors` | Model weights (~8.2 GB) |
-| `config.yaml` | Model architecture configuration |
-## Usage
-### Installation
 ```bash
-git clone https://github.com/xiaomi-research/dasheng-audiogen.git
-cd dasheng-audiogen
-conda create -n dasheng-audiogen python=3.10
-conda activate dasheng-audiogen
-pip install -r requirements.txt
 ```
-> torch 2.8.0+cu128 is recommended.
-### Python API
 ```python
-from dasheng_audiogen.pipeline import DashengAudioGenPipeline
-pipe = DashengAudioGenPipeline(
-    model_name_or_path="mispeech/Dasheng-AudioGen-Multilingual"
-)
-# Spanish speech example (only <asr> uses the target language)
-prompt = pipe.compose_prompt(
     caption="A conversation scene on a busy city street.",
     speech="A young woman speaking softly in Spanish.",
-    asr="Creo que deberíamos irnos ya.",
     env="Rain and distant traffic noise.",
 )
-waveforms = pipe.generate(prompts=[prompt])
-pipe.save_waveform(waveforms[0], "output.wav")
 ```
-### CLI
-```bash
-python inference_cli.py infer \
-  --model_name_or_path mispeech/Dasheng-AudioGen-Multilingual \
-  --content "<|caption|> A conversation scene on a busy city street. <|speech|> A young woman speaking softly in Spanish. <|asr|> Creo que deberíamos irnos ya. <|env|> Rain and distant traffic noise." \
-  --output_path ./outputs/multilingual.wav
 ```
-## Prompt Tags
 | Tag | Description |
 |-----|-------------|
-| `<\|caption\|>` | Overall audio scene |
-| `<\|speech\|>` | Speaker identity and style |
-| `<\|asr\|>` | Spoken transcript |
 | `<\|sfx\|>` | Sound effects |
 | `<\|music\|>` | Background music |
 | `<\|env\|>` | Environmental ambience |
-> **Prompt convention:** All descriptive tags (`caption`, `speech`, `sfx`, `music`, `env`) should be written in **English**. Only `<|asr|>` (the spoken content to synthesize) should use the target language.
-## Dependencies
-- Audio tokenizer: [mispeech/dashengtokenizer](https://huggingface.co/mispeech/dashengtokenizer)
-- Text encoder: [google/mt5-large](https://huggingface.co/google/mt5-large)
-## Acknowledgments
-Developed by **XIAOMI LLM PLUS** and **SJTU X-LANCE**.

 ---
+license: apache-2.0
 language:
   - en
   - es
   - ja
   - ko
   - de
+  - multilingual
 tags:
   - audio-generation
   - text-to-audio
+  - text-to-speech
+  - text-to-music
   - sound-effects
+  - diffusion
   - multilingual
+library_name: transformers
 pipeline_tag: text-to-audio
 ---
 # Dasheng-AudioGen-Multilingual
+[![arXiv](https://img.shields.io/badge/arXiv-Paper-b31b1b?logo=arxiv)](https://arxiv.org/abs/2505.XXXXX)
+[![Hugging Face Model](https://img.shields.io/badge/HuggingFace-Model-orange?logo=huggingface)](https://huggingface.co/mispeech/Dasheng-AudioGen-Multilingual)
+[![Hugging Face Demo](https://img.shields.io/badge/HuggingFace-Demo-orange?logo=huggingface)](https://huggingface.co/spaces/mispeech/Dasheng-AudioGen)
+[![Web Demo](https://img.shields.io/badge/Website-Demo-181717?logo=google-chrome)](https://nieeim.github.io/Dasheng-AudioGen-Web/)
+[**English**](./README.md) | [**中文**](./README_zh.md)
+**Dasheng-AudioGen-Multilingual** is the multilingual variant of Dasheng-AudioGen, a unified audio generation model that can jointly synthesize **intelligible speech, music, sound effects, and environmental acoustics** from text descriptions.
+## Models
+| Model | HuggingFace | Text Encoder | Language |
+|-------|-------------|-------------|:--------:|
+| Dasheng-AudioGen | [mispeech/Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen) | `google/flan-t5-large` | English |
+| Dasheng-AudioGen-Multilingual | [mispeech/Dasheng-AudioGen-Multilingual](https://huggingface.co/mispeech/Dasheng-AudioGen-Multilingual) | `google/mt5-large` | Multilingual |
+### Language Support
 | Language | Duration (h) | Proportion |
 |----------|------------:|----------:|
 | German | 842.29 | 3.23% |
 | Other | 1,369.16 | 5.24% |
+> **Note:** The current multilingual model has notably higher synthesis error rates for all non-English languages. Languages outside the table above are even less reliable. For English-only use cases, the base model (`mispeech/Dasheng-AudioGen`) is recommended.
+## Installation
 ```bash
+pip install torch torchaudio "transformers<5" einops
 ```
+> Tested with Python 3.10, torch 2.8.0+cu128, transformers 4.57. Not compatible with transformers 5.x.
+## Quick Start
+### Basic Usage
 ```python
+import torchaudio
+from transformers import AutoModel
+model = AutoModel.from_pretrained("mispeech/Dasheng-AudioGen-Multilingual", trust_remote_code=True).cuda()
+audio = model.generate("A dog barking in a park")
+torchaudio.save("output.wav", audio.cpu(), 16000)
+```
+### Aspect-wise Prompt
+Use `compose_prompt` to describe different audio aspects separately:
+```python
+prompt = model.compose_prompt(
     caption="A conversation scene on a busy city street.",
     speech="A young woman speaking softly in Spanish.",
     env="Rain and distant traffic noise.",
+    asr="Creo que deberíamos irnos ya.",
 )
+audio = model.generate(prompt)
+torchaudio.save("output.wav", audio.cpu(), 16000)
 ```
+You can also pass a pre-formatted string with tags directly:
+```python
+audio = model.generate(
+    "<|caption|> A helicopter passing overhead. <|sfx|> Rhythmic helicopter blade sounds. <|env|> Open sky ambience."
+)
+```
+### Batch Inference
+```python
+prompts = [
+    model.compose_prompt(caption="A cat meowing softly.", sfx="Soft cat meow."),
+    model.compose_prompt(caption="Thunder rolling in the distance.", env="Stormy night ambience."),
+    model.compose_prompt(caption="A piano playing a gentle melody.", music="Soft piano ballad."),
+]
+audios = model.generate(prompts)
+for i, audio in enumerate(audios):
+    torchaudio.save(f"output_{i}.wav", audio.unsqueeze(0).cpu(), 16000)
+```
+### Generation Parameters
+```python
+audio = model.generate(
+    prompts="A dog barking in a park",
+    num_steps=25,              # number of denoising steps (default: 25)
+    guidance_scale=5.0,        # classifier-free guidance scale (default: 5.0)
+    sway_sampling_coef=-1.0,   # sway sampling coefficient (default: -1.0, 0 for linear)
+)
 ```
+## Prompt Format
+Dasheng-AudioGen uses structured tags to describe different audio aspects:
 | Tag | Description |
 |-----|-------------|
+| `<\|caption\|>` | Overall audio scene description |
+| `<\|speech\|>` | Speaker identity and speaking style |
+| `<\|asr\|>` | Spoken transcript / dialogue |
 | `<\|sfx\|>` | Sound effects |
 | `<\|music\|>` | Background music |
 | `<\|env\|>` | Environmental ambience |
+> **Multilingual prompt convention:** All descriptive tags (`caption`, `speech`, `sfx`, `music`, `env`) should be written in **English**. Only the `<|asr|>` field (the actual spoken content to be synthesized) should use the target language.
+## Acknowledgments
+Dasheng-AudioGen was developed with contributions from **XIAOMI LLM PLUS** and **SJTU X-LANCE**.
+## Citation
+```bibtex
+@article{dasheng-audiogen,
+  title={Dasheng-AudioGen},
+  author={},
+  journal={arXiv preprint arXiv:2505.XXXXX},
+  year={2025}
+}
+```
+## License
+This project is released under the [Apache License 2.0](LICENSE).

README_zh.md CHANGED Viewed

@@ -1,24 +1,22 @@
 # Dasheng-AudioGen-Multilingual
-[**English**](./README.md) | [**中文**](./README_zh.md)
-**Dasheng-AudioGen-Multilingual** 是 [Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen) 的多语言版本。它将文本编码器替换为 `google/mt5-large`，支持使用多种语言的 prompt 进行音频生成。
-- GitHub: [https://github.com/xiaomi-research/dasheng-audiogen](https://github.com/xiaomi-research/dasheng-audiogen)
-- Demo: [https://huggingface.co/spaces/mispeech/Dasheng-AudioGen](https://huggingface.co/spaces/mispeech/Dasheng-AudioGen)
-- Web Demo: [https://nieeim.github.io/Dasheng-AudioGen-Web/](https://nieeim.github.io/Dasheng-AudioGen-Web/)
-- 基础模型: [mispeech/Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen)
-## 与基础模型的区别
-| | Dasheng-AudioGen | Dasheng-AudioGen-Multilingual |
-|---|---|---|
-| 文本编码器 | `google/flan-t5-large` | `google/mt5-large` |
-| 语言支持 | 英语 | 多语言 |
-## 支持语言
-训练数据语言分布：
 | 语言 | 时长 (h) | 占比 |
 |------|--------:|-----:|
@@ -32,76 +30,108 @@
 | 德语 (German) | 842.29 | 3.23% |
 | 其他 | 1,369.16 | 5.24% |
-> **注意：** 当前多语言模型在所有非英语语言上的合成错误率都明显偏高，表中未列出的语言更不稳定。如果仅需英语生成，建议使用基础模型 ([mispeech/Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen))。
-## 文件说明
-| 文件 | 描述 |
-|------|------|
-| `model.safetensors` | 模型权重 (~8.2 GB) |
-| `config.yaml` | 模型结构配置 |
-## 使用方法
-### 安装
 ```bash
-git clone https://github.com/xiaomi-research/dasheng-audiogen.git
-cd dasheng-audiogen
-conda create -n dasheng-audiogen python=3.10
-conda activate dasheng-audiogen
-pip install -r requirements.txt
 ```
-> 推荐使用 torch 2.8.0+cu128。
-### Python API
 ```python
-from dasheng_audiogen.pipeline import DashengAudioGenPipeline
-pipe = DashengAudioGenPipeline(
-    model_name_or_path="mispeech/Dasheng-AudioGen-Multilingual"
-)
-# 西班牙语语音示例（仅 <asr> 使用目标语言）
-prompt = pipe.compose_prompt(
     caption="A conversation scene on a busy city street.",
     speech="A young woman speaking softly in Spanish.",
-    asr="Creo que deberíamos irnos ya.",
     env="Rain and distant traffic noise.",
 )
-waveforms = pipe.generate(prompts=[prompt])
-pipe.save_waveform(waveforms[0], "output.wav")
 ```
-### 命令行
-```bash
-python inference_cli.py infer \
-  --model_name_or_path mispeech/Dasheng-AudioGen-Multilingual \
-  --content "<|caption|> A conversation scene on a busy city street. <|speech|> A young woman speaking softly in Spanish. <|asr|> Creo que deberíamos irnos ya. <|env|> Rain and distant traffic noise." \
-  --output_path ./outputs/multilingual.wav
 ```
-## Prompt 标签
 | 标签 | 描述 |
 |------|------|
-| `<\|caption\|>` | 整体音频场景 |
-| `<\|speech\|>` | 说话人身份和风格 |
-| `<\|asr\|>` | 语音转写内容 |
 | `<\|sfx\|>` | 音效 |
 | `<\|music\|>` | 背景音乐 |
 | `<\|env\|>` | 环境音 |
-> **Prompt 规范：** 所有描述性标签（`caption`、`speech`、`sfx`、`music`、`env`）应使用**英文**填写，仅 `<|asr|>`（实际要合成的语音内容）使用目标语言。
-## 依赖资源
-- 音频分词器: [mispeech/dashengtokenizer](https://huggingface.co/mispeech/dashengtokenizer)
-- 文本编码器: [google/mt5-large](https://huggingface.co/google/mt5-large)
-## 致谢
-由**小米 LLM PLUS** 和**上海交通大学 X-LANCE** 联合开发。

 # Dasheng-AudioGen-Multilingual
+[![arXiv](https://img.shields.io/badge/arXiv-Paper-b31b1b?logo=arxiv)](https://arxiv.org/abs/2505.XXXXX)
+[![Hugging Face Model](https://img.shields.io/badge/HuggingFace-Model-orange?logo=huggingface)](https://huggingface.co/mispeech/Dasheng-AudioGen-Multilingual)
+[![Hugging Face Demo](https://img.shields.io/badge/HuggingFace-Demo-orange?logo=huggingface)](https://huggingface.co/spaces/mispeech/Dasheng-AudioGen)
+[![Web Demo](https://img.shields.io/badge/Website-Demo-181717?logo=google-chrome)](https://nieeim.github.io/Dasheng-AudioGen-Web/)
+[**English**](./README.md) | [**中文**](./README_zh.md)
+**Dasheng-AudioGen-Multilingual** 是 Dasheng-AudioGen 的多语言版本，是一个统一的音频生成模型，能够根据文本描述同时合成**语音、音乐、音效和环境声**。
+## 模型
+| 模型 | HuggingFace | 文本编码器 | 语言支持 |
+|------|-------------|-----------|:--------:|
+| Dasheng-AudioGen | [mispeech/Dasheng-AudioGen](https://huggingface.co/mispeech/Dasheng-AudioGen) | `google/flan-t5-large` | 英语 |
+| Dasheng-AudioGen-Multilingual | [mispeech/Dasheng-AudioGen-Multilingual](https://huggingface.co/mispeech/Dasheng-AudioGen-Multilingual) | `google/mt5-large` | 多语言 |
+### 多语言支持
 | 语言 | 时长 (h) | 占比 |
 |------|--------:|-----:|
 | 德语 (German) | 842.29 | 3.23% |
 | 其他 | 1,369.16 | 5.24% |
+> **注意：** 当前多语言模型在所有非英语语言上的合成错误率都明显偏高，表中未列出的语言更不稳定。如果仅需英语生成，建议使用基础模型 (`mispeech/Dasheng-AudioGen`)。
+## 安装
 ```bash
+pip install torch torchaudio "transformers<5" einops
 ```
+> 已在 Python 3.10、torch 2.8.0+cu128、transformers 4.57 上测试通过。已知不兼容 transformers 5.x。
+## 快速开始
+### 基本用法
 ```python
+import torchaudio
+from transformers import AutoModel
+model = AutoModel.from_pretrained("mispeech/Dasheng-AudioGen-Multilingual", trust_remote_code=True).cuda()
+audio = model.generate("A dog barking in a park")
+torchaudio.save("output.wav", audio.cpu(), 16000)
+```
+### 分项 Prompt
+使用 `compose_prompt` 分别描述不同的音频维度：
+```python
+prompt = model.compose_prompt(
     caption="A conversation scene on a busy city street.",
     speech="A young woman speaking softly in Spanish.",
     env="Rain and distant traffic noise.",
+    asr="Creo que deberíamos irnos ya.",
 )
+audio = model.generate(prompt)
+torchaudio.save("output.wav", audio.cpu(), 16000)
 ```
+也可以直接传入包含标签的完整字符串：
+```python
+audio = model.generate(
+    "<|caption|> A helicopter passing overhead. <|sfx|> Rhythmic helicopter blade sounds. <|env|> Open sky ambience."
+)
 ```
+### 批量推理
+```python
+prompts = [
+    model.compose_prompt(caption="A cat meowing softly.", sfx="Soft cat meow."),
+    model.compose_prompt(caption="Thunder rolling in the distance.", env="Stormy night ambience."),
+    model.compose_prompt(caption="A piano playing a gentle melody.", music="Soft piano ballad."),
+]
+audios = model.generate(prompts)
+for i, audio in enumerate(audios):
+    torchaudio.save(f"output_{i}.wav", audio.unsqueeze(0).cpu(), 16000)
+```
+### 生成参数
+```python
+audio = model.generate(
+    prompts="A dog barking in a park",
+    num_steps=25,              # 去噪步数（默认：25）
+    guidance_scale=5.0,        # 无分类器引导强度（默认：5.0）
+    sway_sampling_coef=-1.0,   # sway 采样系数（默认：-1.0，设为 0 使用线性调度）
+)
+```
+## Prompt 格式
+Dasheng-AudioGen 使用结构化标签来描述不同的音频维度：
 | 标签 | 描述 |
 |------|------|
+| `<\|caption\|>` | 整体音频场景描述 |
+| `<\|speech\|>` | 说话人身份和说话风格 |
+| `<\|asr\|>` | 语音转写内容 / 对话文本 |
 | `<\|sfx\|>` | 音效 |
 | `<\|music\|>` | 背景音乐 |
 | `<\|env\|>` | 环境音 |
+> **多语言 prompt 规范：** 使用多语言模型时，所有描述性标签（`caption`、`speech`、`sfx`、`music`、`env`）应使用**英文**填写，仅 `<|asr|>` 字段（实际要合成的语音内容）使用目标语言。
+## 致谢
+Dasheng-AudioGen 由**小米 LLM PLUS** 和 **上海交通大学 X-LANCE** 联合开发。
+## 引用
+```bibtex
+@article{dasheng-audiogen,
+  title={Dasheng-AudioGen},
+  author={},
+  journal={arXiv preprint arXiv:2505.XXXXX},
+  year={2025}
+}
+```
+## 许可证
+本项目基于 [Apache License 2.0](LICENSE) 发布。

attention.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .modules import RMSNorm
+# --- Rotary Position Embeddings ---
+def rotate_half(x):  # Split x into two chunks along the last dim and return cat((-x2, x1)) on that dim.
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)  # second chunk is negated and placed in front of the first
+def apply_rotary_pos_emb(x, cos, sin):  # Return x * cos + rotate_half(x) * sin, using 4-D cos/sin tables.
+    cos = cos[:, :, : x.shape[-2], :]  # keep only the first x.shape[-2] rows of the table (axis 2)
+    sin = sin[:, :, : x.shape[-2], :]
+    return (x * cos) + (rotate_half(x) * sin)
+class RotaryEmbedding(nn.Module):  # Rotary position embedding: builds and caches cos/sin tables, then rotates q (and optionally k).
+    def __init__(self, dim: int):  # dim: width of the axis the frequencies are generated for (Attention passes head_dim)
+        super().__init__()
+        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))  # one inverse frequency per even index in [0, dim)
+        self.register_buffer("inv_freq", inv_freq)
+        self._seq_len_cached = None  # length the cached tables were built for; None until the first forward
+        self._cos_cached = None
+        self._sin_cached = None
+    def _update_cos_sin_tables(self, x, seq_dimension=-2):  # Rebuild the tables when x's length, device or dtype changed; return (cos, sin).
+        seq_len = x.shape[seq_dimension]
+        if (
+            seq_len != self._seq_len_cached  # tested first: on the first call this is True, so the None caches below are never dereferenced
+            or self._cos_cached.device != x.device
+            or self._cos_cached.dtype != x.dtype
+        ):
+            self._seq_len_cached = seq_len
+            t = torch.arange(seq_len, device=x.device, dtype=torch.float32)  # positions 0 .. seq_len - 1
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))  # outer product: position x inverse frequency
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)  # angles repeated twice along the last dim
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)  # two leading singleton axes added so the table broadcasts
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+        return self._cos_cached, self._sin_cached
+    def forward(self, q, k):  # Return (rotated q, rotated k); when k is None the second item is None.
+        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(  # table length comes from q only
+            q.float(), seq_dimension=-2  # the float32 copy is passed, so the cached tables are float32
+        )
+        if k is not None:
+            return (
+                apply_rotary_pos_emb(q.float(), self._cos_cached, self._sin_cached).type_as(q),  # rotate in float32, then cast back to q's type
+                apply_rotary_pos_emb(k.float(), self._cos_cached, self._sin_cached).type_as(k),  # NOTE(review): assumes k is not longer than q on dim -2 — TODO confirm
+            )
+        else:
+            return (
+                apply_rotary_pos_emb(q.float(), self._cos_cached, self._sin_cached).type_as(q),
+                None,
+            )
+# --- Attention Helpers ---
+def add_mask(sim, mask):  # Return sim with every position where mask is False filled with the most negative finite value of sim.dtype.
+    b, ndim = sim.shape[0], mask.ndim  # ndim is read before mask is reshaped, so at most one branch below applies
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")  # insert a singleton axis at position 1
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)  # copy the same 2-D mask b times (b = sim.shape[0])
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)  # ~mask feeds masked_fill, so mask is expected to be boolean with True = keep
+    return sim
+def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):  # Combine a (b, i) query mask and a (b, j) key mask into one (b, 1, i, j) mask.
+    def default(val, d):  # val when it is not None, otherwise d (d is called first if it is a plain function)
+        return val if val is not None else (d() if isfunction(d) else d)
+    b, i, j = q_shape[0], q_shape[-2], k_shape[-2]  # b: leading size; i, j: second-to-last sizes of q and k (presumably sequence lengths)
+    q_mask = default(q_mask, torch.ones((b, i), device=device, dtype=torch.bool))  # a missing mask defaults to all True
+    k_mask = default(k_mask, torch.ones((b, j), device=device, dtype=torch.bool))
+    attn_mask = rearrange(q_mask, "b i -> b 1 i 1") * rearrange(k_mask, "b j -> b 1 1 j")  # broadcast product over the i and j axes
+    return attn_mask
+# --- Main Attention Module ---
+class Attention(nn.Module):
+    def __init__(  # Build the q/k/v/output projections, optional per-head q/k norms, dropout, and optional rotary embedding.
+        self,
+        dim,
+        context_dim=None,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        rope_mode="none",
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads  # integer division: any remainder of dim / num_heads is dropped
+        self.scale = qk_scale or head_dim ** -0.5  # a falsy qk_scale (None or 0) falls back to head_dim ** -0.5
+        self.cross_attn = context_dim is not None  # passing context_dim marks this layer as cross-attention
+        context_dim = dim if context_dim is None else context_dim
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)  # keys and values are projected from context_dim to dim
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+        if qk_norm is None:
+            self.norm_q = nn.Identity()
+            self.norm_k = nn.Identity()
+        elif qk_norm == "layernorm":
+            self.norm_q = nn.LayerNorm(head_dim)  # norms are sized by head_dim, not dim
+            self.norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == "rmsnorm":
+            self.norm_q = RMSNorm(head_dim)
+            self.norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError  # any other qk_norm value is rejected
+        self.attn_drop_p = attn_drop  # raw probability, read by _attn
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        if self.cross_attn:
+            assert rope_mode == "none"  # cross-attention is only accepted without rotary embeddings
+        self.rope_mode = rope_mode
+        if self.rope_mode == "shared" or self.rope_mode == "x_only":
+            self.rotary = RotaryEmbedding(dim=head_dim)  # self.rotary exists only for these two modes
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == "shared":
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == "x_only":
+            q_x, k_x = self.rotary(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == "none":
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        x = F.scaled_dot_product_attention(
+            q, k, v, dropout_p=self.attn_drop_p if self.training else 0.0,
+            attn_mask=mask_binary,
+        )
+        x = einops.rearrange(x, "B H L D -> B L (H D)")
+        return x
+    def forward(self, x, context=None, context_mask=None, extras=0):
+        B, L, C = x.shape
+        if context is None:
+            context = x
+        q = self.to_q(x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+        if context_mask is not None:
+            mask_binary = create_mask(x.shape, context.shape, x.device, None, context_mask)
+        else:
+            mask_binary = None
+        q = einops.rearrange(q, "B L (H D) -> B H L D", H=self.num_heads)
+        k = einops.rearrange(k, "B L (H D) -> B H L D", H=self.num_heads)
+        v = einops.rearrange(v, "B L (H D) -> B H L D", H=self.num_heads)
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+        q, k = self._rotary(q, k, extras)
+        x = self._attn(q, k, v, mask_binary)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

config.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "model_type": "dasheng_audiogen",
+  "architectures": [
+    "DashengAudioGenModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_dasheng_audiogen.DashengAudioGenConfig",
+    "AutoModel": "modeling_dasheng_audiogen.DashengAudioGenModel"
+  },
+  "text_encoder_name": "google/mt5-large",
+  "tokenizer_name": "mispeech/dashengtokenizer",
+  "use_zero_instruction": true,
+  "task_instruction_dim": 1024,
+  "sample_rate": 16000,
+  "downsampling_ratio": 640,
+  "latent_dim": 1280,
+  "content_dim": 1024,
+  "frame_resolution": 0.005,
+  "duration_offset": 1.0,
+  "tokenizer_max_length": 512,
+  "dit_img_size": 1000,
+  "dit_patch_size": 1,
+  "dit_in_chans": 1280,
+  "dit_out_chans": 1280,
+  "dit_input_type": "1d",
+  "dit_embed_dim": 1536,
+  "dit_depth": 32,
+  "dit_num_heads": 24,
+  "dit_mlp_ratio": 4.0,
+  "dit_qk_norm": "layernorm",
+  "dit_norm_layer": "layernorm",
+  "dit_act_layer": "geglu",
+  "dit_context_norm": true,
+  "dit_time_fusion": "ada",
+  "dit_ada_sola_rank": 32,
+  "dit_ada_sola_alpha": 32,
+  "dit_ta_context_dim": 1024,
+  "dit_ta_context_fusion": "add",
+  "dit_ta_context_norm": true,
+  "dit_context_dim": 1024,
+  "dit_context_fusion": "cross",
+  "dit_context_pe_method": "none",
+  "dit_pe_method": "none",
+  "dit_rope_mode": "shared",
+  "adapter_num_heads": 16,
+  "adapter_dropout": 0.2,
+  "adapter_duration_grad_scale": 0.1,
+  "duration_predictor_filter_channels": 512,
+  "duration_predictor_n_layers": 5,
+  "duration_predictor_kernel_size": 3,
+  "duration_predictor_p_dropout": 0.5,
+  "special_tokens": [
+    "<|caption|>",
+    "<|speech|>",
+    "<|sfx|>",
+    "<|music|>",
+    "<|env|>",
+    "<|asr|>",
+    "<|speech_start|>",
+    "<|speech_end|>"
+  ],
+  "train_special_tokens": true
+}

configuration_dasheng_audiogen.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from transformers import PretrainedConfig
+class DashengAudioGenConfig(PretrainedConfig):
+    model_type = "dasheng_audiogen"
+    def __init__(
+        self,
+        text_encoder_name: str = "google/flan-t5-large",
+        tokenizer_name: str = "mispeech/dashengtokenizer",
+        use_zero_instruction: bool = False,
+        task_instruction_dim: int = 1024,
+        sample_rate: int = 16000,
+        downsampling_ratio: int = 640,
+        latent_dim: int = 1280,
+        content_dim: int = 1024,
+        frame_resolution: float = 0.005,
+        duration_offset: float = 1.0,
+        tokenizer_max_length: int = 512,
+        dit_img_size: int = 1000,
+        dit_patch_size: int = 1,
+        dit_in_chans: int = 1280,
+        dit_out_chans: int = 1280,
+        dit_input_type: str = "1d",
+        dit_embed_dim: int = 1536,
+        dit_depth: int = 32,
+        dit_num_heads: int = 24,
+        dit_mlp_ratio: float = 4.0,
+        dit_qk_norm: str = "layernorm",
+        dit_norm_layer: str = "layernorm",
+        dit_act_layer: str = "geglu",
+        dit_context_norm: bool = True,
+        dit_time_fusion: str = "ada",
+        dit_ada_sola_rank: int = 32,
+        dit_ada_sola_alpha: int = 32,
+        dit_ta_context_dim: int = 1024,
+        dit_ta_context_fusion: str = "add",
+        dit_ta_context_norm: bool = True,
+        dit_context_dim: int = 1024,
+        dit_context_fusion: str = "cross",
+        dit_context_pe_method: str = "none",
+        dit_pe_method: str = "none",
+        dit_rope_mode: str = "shared",
+        adapter_num_heads: int = 16,
+        adapter_dropout: float = 0.2,
+        adapter_duration_grad_scale: float = 0.1,
+        duration_predictor_filter_channels: int = 512,
+        duration_predictor_n_layers: int = 5,
+        duration_predictor_kernel_size: int = 3,
+        duration_predictor_p_dropout: float = 0.5,
+        special_tokens: list = None,
+        train_special_tokens: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.text_encoder_name = text_encoder_name
+        self.tokenizer_name = tokenizer_name
+        self.use_zero_instruction = use_zero_instruction
+        self.task_instruction_dim = task_instruction_dim
+        self.sample_rate = sample_rate
+        self.downsampling_ratio = downsampling_ratio
+        self.latent_dim = latent_dim
+        self.content_dim = content_dim
+        self.frame_resolution = frame_resolution
+        self.duration_offset = duration_offset
+        self.tokenizer_max_length = tokenizer_max_length
+        self.dit_img_size = dit_img_size
+        self.dit_patch_size = dit_patch_size
+        self.dit_in_chans = dit_in_chans
+        self.dit_out_chans = dit_out_chans
+        self.dit_input_type = dit_input_type
+        self.dit_embed_dim = dit_embed_dim
+        self.dit_depth = dit_depth
+        self.dit_num_heads = dit_num_heads
+        self.dit_mlp_ratio = dit_mlp_ratio
+        self.dit_qk_norm = dit_qk_norm
+        self.dit_norm_layer = dit_norm_layer
+        self.dit_act_layer = dit_act_layer
+        self.dit_context_norm = dit_context_norm
+        self.dit_time_fusion = dit_time_fusion
+        self.dit_ada_sola_rank = dit_ada_sola_rank
+        self.dit_ada_sola_alpha = dit_ada_sola_alpha
+        self.dit_ta_context_dim = dit_ta_context_dim
+        self.dit_ta_context_fusion = dit_ta_context_fusion
+        self.dit_ta_context_norm = dit_ta_context_norm
+        self.dit_context_dim = dit_context_dim
+        self.dit_context_fusion = dit_context_fusion
+        self.dit_context_pe_method = dit_context_pe_method
+        self.dit_pe_method = dit_pe_method
+        self.dit_rope_mode = dit_rope_mode
+        self.adapter_num_heads = adapter_num_heads
+        self.adapter_dropout = adapter_dropout
+        self.adapter_duration_grad_scale = adapter_duration_grad_scale
+        self.duration_predictor_filter_channels = duration_predictor_filter_channels
+        self.duration_predictor_n_layers = duration_predictor_n_layers
+        self.duration_predictor_kernel_size = duration_predictor_kernel_size
+        self.duration_predictor_p_dropout = duration_predictor_p_dropout
+        self.special_tokens = special_tokens or []
+        self.train_special_tokens = train_special_tokens

content_adapter.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import torch
+import torch.nn as nn
+class LayerNorm(nn.LayerNorm):
+    def __init__(self, nout, dim=-1):
+        super().__init__(nout, eps=1e-12)
+        self.dim = dim
+    def forward(self, x):
+        if self.dim == -1:
+            return super().forward(x)
+        return super().forward(x.transpose(1, -1)).transpose(1, -1)
+class DurationPredictor(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        filter_channels: int,
+        n_layers: int = 2,
+        kernel_size: int = 3,
+        p_dropout: float = 0.1,
+        padding: str = "SAME"
+    ):
+        super().__init__()
+        self.conv = nn.ModuleList()
+        self.kernel_size = kernel_size
+        self.padding = padding
+        for idx in range(n_layers):
+            in_chans = in_channels if idx == 0 else filter_channels
+            self.conv += [
+                nn.Sequential(
+                    nn.ConstantPad1d(
+                        ((kernel_size - 1) // 2, (kernel_size - 1) // 2)
+                        if padding == 'SAME' else (kernel_size - 1, 0),
+                        0
+                    ),
+                    nn.Conv1d(
+                        in_chans, filter_channels,
+                        kernel_size, stride=1, padding=0
+                    ),
+                    nn.ReLU(),
+                    LayerNorm(filter_channels, dim=1),
+                    nn.Dropout(p_dropout)
+                )
+            ]
+        self.linear = nn.Linear(filter_channels, 1)
+    def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
+        x = x.transpose(1, -1)
+        x_mask = x_mask.unsqueeze(1).to(x.device)
+        for f in self.conv:
+            x = f(x)
+            x = x * x_mask.float()
+        x = self.linear(x.transpose(1, -1)) * x_mask.transpose(1, -1).float()
+        return x
+class ContentAdapterBase(nn.Module):
+    def __init__(self, d_out):
+        super().__init__()
+        self.d_out = d_out
+class CrossAttentionAdapter(ContentAdapterBase):
+    def __init__(
+        self,
+        d_out: int,
+        content_dim: int,
+        prefix_dim: int,
+        num_heads: int,
+        duration_predictor: DurationPredictor,
+        dropout: float = 0.1,
+        duration_grad_scale: float = 0.1,
+    ):
+        super().__init__(d_out)
+        self.attn = nn.MultiheadAttention(
+            embed_dim=content_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            kdim=prefix_dim,
+            vdim=prefix_dim,
+            batch_first=True,
+        )
+        self.duration_grad_scale = duration_grad_scale
+        self.duration_predictor = duration_predictor
+        self.global_duration_mlp = nn.Sequential(
+            nn.Linear(content_dim, content_dim), nn.ReLU(),
+            nn.Dropout(dropout), nn.Linear(content_dim, 1)
+        )
+        self.norm = nn.LayerNorm(content_dim)
+        self.content_proj = nn.Conv1d(content_dim, d_out, 1)
+    def forward(self, content, content_mask, prefix, prefix_mask):
+        attn_output, attn_output_weights = self.attn(
+            query=content,
+            key=prefix,
+            value=prefix,
+            key_padding_mask=~prefix_mask.bool()
+        )
+        attn_output = attn_output * content_mask.unsqueeze(-1).float()
+        x = self.norm(attn_output + content)
+        x_grad_rescaled = x * self.duration_grad_scale + x.detach() * (
+            1 - self.duration_grad_scale
+        )
+        x_aggregated = (
+            x_grad_rescaled * content_mask.unsqueeze(-1).float()
+        ).sum(dim=1) / content_mask.sum(dim=1, keepdim=True).float()
+        global_duration = self.global_duration_mlp(x_aggregated).squeeze(-1)
+        local_duration = self.duration_predictor(
+            x_grad_rescaled, content_mask
+        ).squeeze(-1)
+        content = self.content_proj(x.transpose(1, 2)).transpose(1, 2)
+        return content, content_mask, global_duration, local_duration

dit.py ADDED Viewed

	@@ -0,0 +1,1153 @@

+import math
+import torch
+import torch.nn as nn
+from .modules import (
+    film_modulate,
+    unpatchify,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    FeedForward,
+    RMSNorm,
+)
+from .attention import Attention
+class AdaLN(nn.Module):
+    def __init__(self, dim, ada_mode='ada', r=None, alpha=None):
+        super().__init__()
+        self.ada_mode = ada_mode
+        self.scale_shift_table = None
+        if ada_mode == 'ada':
+            self.time_ada = nn.Linear(dim, 6 * dim, bias=True)
+        elif ada_mode == 'ada_single':
+            self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        elif ada_mode in ['ada_sola', 'ada_sola_bias']:
+            self.lora_a = nn.Linear(dim, r * 6, bias=False)
+            self.lora_b = nn.Linear(r * 6, dim * 6, bias=False)
+            self.scaling = alpha / r
+            if ada_mode == 'ada_sola_bias':
+                self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        else:
+            raise NotImplementedError
+    def forward(self, time_token=None, time_ada=None):
+        if self.ada_mode == 'ada':
+            assert time_ada is None
+            B = time_token.shape[0]
+            time_ada = self.time_ada(time_token).reshape(B, 6, -1)
+        elif self.ada_mode == 'ada_single':
+            B = time_ada.shape[0]
+            time_ada = time_ada.reshape(B, 6, -1)
+            time_ada = self.scale_shift_table[None] + time_ada
+        elif self.ada_mode in ['ada_sola', 'ada_sola_bias']:
+            B = time_ada.shape[0]
+            time_ada_lora = self.lora_b(self.lora_a(time_token)) * self.scaling
+            time_ada = time_ada + time_ada_lora
+            time_ada = time_ada.reshape(B, 6, -1)
+            if self.scale_shift_table is not None:
+                time_ada = self.scale_shift_table[None] + time_ada
+        else:
+            raise NotImplementedError
+        return time_ada
+class DiTBlock(nn.Module):
+    def __init__(
+        self,
+        dim,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim=dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            rope_mode=rope_mode
+        )
+        if context_dim is not None:
+            self.use_context = True
+            self.cross_attn = Attention(
+                dim=dim,
+                num_heads=num_heads,
+                context_dim=context_dim,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                rope_mode='none'
+            )
+            self.norm2 = norm_layer(dim)
+            if context_norm:
+                self.norm_context = norm_layer(context_dim)
+            else:
+                self.norm_context = nn.Identity()
+        else:
+            self.use_context = False
+        self.norm3 = norm_layer(dim)
+        self.mlp = FeedForward(
+            dim=dim, mult=mlp_ratio, activation_fn=act_layer, dropout=0
+        )
+        self.use_adanorm = True if time_fusion != 'token' else False
+        if self.use_adanorm:
+            self.adaln = AdaLN(
+                dim,
+                ada_mode=time_fusion,
+                r=ada_sola_rank,
+                alpha=ada_sola_alpha
+            )
+        if skip:
+            self.skip_norm = norm_layer(2 * dim) if skip_norm else nn.Identity()
+            self.skip_linear = nn.Linear(2 * dim, dim)
+        else:
+            self.skip_linear = None
+        self.use_checkpoint = use_checkpoint
+    def forward(
+        self,
+        x,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        if self.use_checkpoint:
+            from torch.utils.checkpoint import checkpoint
+            return checkpoint(
+                self._forward,
+                x, time_token, time_ada, skip, context, x_mask, context_mask,
+                extras,
+                use_reentrant=False
+            )
+        else:
+            return self._forward(
+                x, time_token, time_ada, skip, context, x_mask, context_mask,
+                extras
+            )
+    def _forward(
+        self,
+        x,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        B, T, C = x.shape
+        if self.skip_linear is not None:
+            assert skip is not None
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = time_ada.chunk(6, dim=1)
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            x = x + (1 - gate_msa) * self.attn(
+                x_norm, context=None, context_mask=x_mask, extras=extras
+            )
+        else:
+            x = x + self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+        if self.use_context:
+            assert context is not None
+            x = x + self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1 - gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+        return x
+class FinalBlock(nn.Module):
+    def __init__(
+        self,
+        embed_dim,
+        patch_size,
+        in_chans,
+        img_size,
+        input_type='2d',
+        norm_layer=nn.LayerNorm,
+        use_conv=True,
+        use_adanorm=True
+    ):
+        super().__init__()
+        self.in_chans = in_chans
+        self.img_size = img_size
+        self.input_type = input_type
+        self.norm = norm_layer(embed_dim)
+        self.use_adanorm = use_adanorm
+        if input_type == '2d':
+            self.patch_dim = patch_size**2 * in_chans
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = nn.Conv2d(
+                    self.in_chans, self.in_chans, 3, padding=1
+                )
+            else:
+                self.final_layer = nn.Identity()
+        elif input_type == '1d':
+            self.patch_dim = patch_size * in_chans
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = nn.Conv1d(
+                    self.in_chans, self.in_chans, 3, padding=1
+                )
+            else:
+                self.final_layer = nn.Identity()
+    def forward(self, x, time_ada=None, extras=0):
+        B, T, C = x.shape
+        x = x[:, extras:, :]
+        if self.use_adanorm:
+            shift, scale = time_ada.reshape(B, 2, -1).chunk(2, dim=1)
+            x = film_modulate(self.norm(x), shift, scale)
+        else:
+            x = self.norm(x)
+        x = self.linear(x)
+        x = unpatchify(x, self.in_chans, self.input_type, self.img_size)
+        x = self.final_layer(x)
+        return x
+class UDiT(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim
+        self.in_chans = in_chans
+        self.input_type = input_type
+        if self.input_type == '2d':
+            num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+        if time_fusion == 'token':
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        self.use_skip = skip
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+        self.in_blocks = nn.ModuleList([
+            DiTBlock(
+                dim=embed_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for _ in range(depth // 2)
+        ])
+        self.mid_block = DiTBlock(
+            dim=embed_dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.out_blocks = nn.ModuleList([
+            DiTBlock(
+                dim=embed_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for _ in range(depth // 2)
+        ])
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        self.initialize_weights()
+    def _init_ada(self):
+        if self.time_fusion == 'ada':
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.weight, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+        elif self.time_fusion == 'ada_single':
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+        elif self.time_fusion in ['ada_sola', 'ada_sola_bias']:
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.kaiming_uniform_(
+                    block.adaln.lora_a.weight, a=math.sqrt(5)
+                )
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+            nn.init.kaiming_uniform_(
+                self.mid_block.adaln.lora_a.weight, a=math.sqrt(5)
+            )
+            nn.init.constant_(self.mid_block.adaln.lora_b.weight, 0)
+            for block in self.out_blocks:
+                nn.init.kaiming_uniform_(
+                    block.adaln.lora_a.weight, a=math.sqrt(5)
+                )
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+    def initialize_weights(self):
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        w = self.patch_embed.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.patch_embed.proj.bias, 0)
+        if self.use_adanorm:
+            self._init_ada()
+        if self.context_cross:
+            for block in self.in_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.weight, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+        if self.cls_embed:
+            if self.use_adanorm:
+                nn.init.constant_(self.cls_embed[-1].weight, 0)
+                nn.init.constant_(self.cls_embed[-1].bias, 0)
+        if self.use_conv:
+            nn.init.xavier_uniform_(self.final_block.final_layer.weight)
+            nn.init.constant_(self.final_block.final_layer.bias, 0)
+    def _concat_x_context(self, x, context, x_mask=None, context_mask=None):
+        assert context.shape[-2] == self.context_max_length
+        B = x.shape[0]
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(
+                B, context.shape[-2], device=context.device
+            ).bool()
+        x_mask = torch.cat([context_mask, x_mask], dim=1)
+        x = torch.cat((context, x), dim=1)
+        return x, x_mask
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]).to(x.device, dtype=torch.long)
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+        B, L, D = x.shape
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ], dim=1)
+            time_token = None
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+        x = self.mid_block(
+            x=x,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    x = x + controlnet_skips.pop()
+            x = blk(
+                x=x,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+        return x
+class LayerFusionDiTBlock(DiTBlock):
+    def __init__(
+        self,
+        dim,
+        ta_context_dim,
+        ta_context_norm=False,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        ta_context_fusion='add',
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        super().__init__(
+            dim=dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=skip,
+            skip_norm=skip_norm,
+            rope_mode=rope_mode,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.ta_context_fusion = ta_context_fusion
+        self.ta_context_norm = ta_context_norm
+        if self.ta_context_fusion == "add":
+            self.ta_context_projection = nn.Linear(
+                ta_context_dim, dim, bias=False
+            )
+            self.ta_context_norm = norm_layer(
+                ta_context_dim
+            ) if self.ta_context_norm else nn.Identity()
+        elif self.ta_context_fusion == "concat":
+            self.ta_context_projection = nn.Linear(ta_context_dim + dim, dim)
+            self.ta_context_norm = norm_layer(
+                ta_context_dim + dim
+            ) if self.ta_context_norm else nn.Identity()
+    def forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        if self.use_checkpoint:
+            from torch.utils.checkpoint import checkpoint
+            return checkpoint(
+                self._forward,
+                x, time_aligned_context, time_token, time_ada, skip, context,
+                x_mask, context_mask, extras,
+                use_reentrant=False
+            )
+        else:
+            return self._forward(
+                x, time_aligned_context, time_token, time_ada, skip, context,
+                x_mask, context_mask, extras,
+            )
+    def _forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        B, T, C = x.shape
+        if self.skip_linear is not None:
+            assert skip is not None
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = time_ada.chunk(6, dim=1)
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            tanh_gate_msa = torch.tanh(1 - gate_msa)
+            x = x + tanh_gate_msa * self.attn(
+                x_norm, context=None, context_mask=x_mask, extras=extras
+            )
+        else:
+            x = x + self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+        if self.ta_context_fusion == "add":
+            time_aligned_context = self.ta_context_projection(
+                self.ta_context_norm(time_aligned_context)
+            )
+            if time_aligned_context.size(1) < x.size(1):
+                time_aligned_context = nn.functional.pad(
+                    time_aligned_context, (0, 0, 1, 0)
+                )
+            x = x + time_aligned_context
+        elif self.ta_context_fusion == "concat":
+            if time_aligned_context.size(1) < x.size(1):
+                time_aligned_context = nn.functional.pad(
+                    time_aligned_context, (0, 0, 1, 0)
+                )
+            cat = torch.cat([x, time_aligned_context], dim=-1)
+            cat = self.ta_context_norm(cat)
+            x = self.ta_context_projection(cat)
+        if self.use_context:
+            assert context is not None
+            x = x + self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1 - gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+        return x
+class LayerFusionAudioDiT(UDiT):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        ta_context_dim=768,
+        ta_context_fusion='concat',
+        ta_context_norm=True,
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        nn.Module.__init__(self)
+        self.num_features = self.embed_dim = embed_dim
+        self.in_chans = in_chans
+        self.input_type = input_type
+        if self.input_type == '2d':
+            num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+        if time_fusion == 'token':
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        self.use_skip = skip
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+        self.in_blocks = nn.ModuleList([
+            LayerFusionDiTBlock(
+                dim=embed_dim,
+                ta_context_dim=ta_context_dim,
+                ta_context_fusion=ta_context_fusion,
+                ta_context_norm=ta_context_norm,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+        self.mid_block = LayerFusionDiTBlock(
+            dim=embed_dim,
+            ta_context_dim=ta_context_dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            ta_context_fusion=ta_context_fusion,
+            ta_context_norm=ta_context_norm,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.out_blocks = nn.ModuleList([
+            LayerFusionDiTBlock(
+                dim=embed_dim,
+                ta_context_dim=ta_context_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                ta_context_fusion=ta_context_fusion,
+                ta_context_norm=ta_context_norm,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        self.initialize_weights()
+    def forward(
+        self,
+        x,
+        timesteps,
+        time_aligned_context,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]).to(x.device, dtype=torch.long)
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+        B, L, D = x.shape
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ], dim=1)
+            time_token = None
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+        x = self.mid_block(
+            x=x,
+            time_aligned_context=time_aligned_context,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    x = x + controlnet_skips.pop()
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+        return x

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e0ea86afdf5e73d8de8d65ca572e6ada7c59daa122176bcb56c8d27dbd018cd
-size 8742180416

 version https://git-lfs.github.com/spec/v1
+oid sha256:e5c17670507a4d658b7650aef24a3861c4145f4386105ccbc0cba18ab9e28acd
+size 8742184656

modeling_dasheng_audiogen.py ADDED Viewed

	@@ -0,0 +1,549 @@

+from __future__ import annotations
+from collections import OrderedDict
+import torch
+import torch.nn as nn
+from transformers import AutoModel, AutoTokenizer, PreTrainedModel
+from .configuration_dasheng_audiogen import DashengAudioGenConfig
+from .modules import *  # noqa: F401,F403 — ensures HF copies this file
+from .attention import *  # noqa: F401,F403 — ensures HF copies this file
+from .dit import LayerFusionAudioDiT
+from .content_adapter import CrossAttentionAdapter, DurationPredictor
+from .scheduler import FlowMatchEulerScheduler, compute_sway_sigmas, compute_linear_sigmas
+from .utils import create_mask_from_length, create_alignment_path, trim_or_pad_length
+# ---------------------------------------------------------------------------
+# Prompt formatting
+# ---------------------------------------------------------------------------
+TAG_ORDER = OrderedDict([
+    ("caption", "<|caption|>"),
+    ("speech", "<|speech|>"),
+    ("asr", "<|asr|>"),
+    ("sfx", "<|sfx|>"),
+    ("music", "<|music|>"),
+    ("env", "<|env|>"),
+])
+def compose_prompt(
+    content: str | None = None,
+    caption: str | None = None,
+    speech: str | None = None,
+    asr: str | None = None,
+    sfx: str | None = None,
+    music: str | None = None,
+    env: str | None = None,
+) -> str:
+    if content is not None:
+        content = str(content).strip()
+        if content:
+            return content
+    values = {
+        "caption": caption, "speech": speech, "asr": asr,
+        "sfx": sfx, "music": music, "env": env,
+    }
+    chunks: list[str] = []
+    for key, tag in TAG_ORDER.items():
+        value = values[key]
+        if value is not None:
+            value = str(value).strip()
+            if value:
+                chunks.append(f"{tag} {value}")
+    if not chunks:
+        raise ValueError(
+            "No prompt content provided. Pass `content` or at least one aspect field."
+        )
+    return " ".join(chunks)
+# ---------------------------------------------------------------------------
+# Model
+# ---------------------------------------------------------------------------
+def _load_text_encoder_backbone(name: str, **kwargs):
+    name_lower = name.lower()
+    if "mt5" in name_lower:
+        from transformers import MT5EncoderModel
+        return MT5EncoderModel.from_pretrained(name, **kwargs)
+    else:
+        from transformers import T5EncoderModel
+        return T5EncoderModel.from_pretrained(name, **kwargs)
+class DashengAudioGenModel(PreTrainedModel):
+    config_class = DashengAudioGenConfig
+    _DYNAMIC_BUFFERS = {"instruction_embedding", "instruction_lengths"}
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict,
+        missing_keys, unexpected_keys, error_msgs,
+    ):
+        for name in self._DYNAMIC_BUFFERS:
+            key = prefix + name
+            if key in state_dict:
+                self.register_buffer(name, state_dict.pop(key))
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs,
+        )
+    def __init__(self, config: DashengAudioGenConfig):
+        """Build the trainable parts of the model from ``config``.
+
+        The text encoder, text tokenizer and audio tokenizer are left as
+        ``None`` here; ``_load_external_models`` fills them in later.
+        """
+        super().__init__(config)
+        # -- Backbone (DiT) --
+        self.backbone = LayerFusionAudioDiT(
+            img_size=config.dit_img_size,
+            patch_size=config.dit_patch_size,
+            in_chans=config.dit_in_chans,
+            out_chans=config.dit_out_chans,
+            input_type=config.dit_input_type,
+            embed_dim=config.dit_embed_dim,
+            depth=config.dit_depth,
+            num_heads=config.dit_num_heads,
+            mlp_ratio=config.dit_mlp_ratio,
+            qkv_bias=False,
+            qk_scale=None,
+            qk_norm=config.dit_qk_norm,
+            norm_layer=config.dit_norm_layer,
+            act_layer=config.dit_act_layer,
+            context_norm=config.dit_context_norm,
+            use_checkpoint=False,
+            time_fusion=config.dit_time_fusion,
+            ada_sola_rank=config.dit_ada_sola_rank,
+            ada_sola_alpha=config.dit_ada_sola_alpha,
+            cls_dim=None,
+            ta_context_dim=config.dit_ta_context_dim,
+            ta_context_fusion=config.dit_ta_context_fusion,
+            ta_context_norm=config.dit_ta_context_norm,
+            context_dim=config.dit_context_dim,
+            context_fusion=config.dit_context_fusion,
+            # NOTE(review): LayerFusionAudioDiT adds context_max_length to an
+            # integer for 'concat'/'joint' fusion, so None presumably relies
+            # on config.dit_context_fusion being 'cross' -- confirm.
+            context_max_length=None,
+            context_pe_method=config.dit_context_pe_method,
+            pe_method=config.dit_pe_method,
+            rope_mode=config.dit_rope_mode,
+            use_conv=True,
+            skip=True,
+            skip_norm=True,
+        )
+        # -- Content adapter --
+        duration_predictor = DurationPredictor(
+            in_channels=config.content_dim,
+            filter_channels=config.duration_predictor_filter_channels,
+            n_layers=config.duration_predictor_n_layers,
+            kernel_size=config.duration_predictor_kernel_size,
+            p_dropout=config.duration_predictor_p_dropout,
+        )
+        self.content_adapter = CrossAttentionAdapter(
+            d_out=config.content_dim,
+            content_dim=config.content_dim,
+            prefix_dim=config.task_instruction_dim,
+            num_heads=config.adapter_num_heads,
+            duration_predictor=duration_predictor,
+            dropout=config.adapter_dropout,
+            duration_grad_scale=config.adapter_duration_grad_scale,
+        )
+        # -- Content encoder projection (matches safetensors key path) --
+        # Bare nn.Module containers are nested so the parameters live under
+        # ``content_encoder.text_encoder.*``.
+        _text_enc = nn.Module()
+        _text_enc.proj = nn.Linear(config.content_dim, config.content_dim)
+        if config.special_tokens:
+            # One embedding row per configured special token.
+            _text_enc.special_token_embedding = nn.Embedding(
+                len(config.special_tokens), config.content_dim
+            )
+        _content_enc = nn.Module()
+        _content_enc.text_encoder = _text_enc
+        self.content_encoder = _content_enc
+        # -- Dummy parameters (match safetensors keys) --
+        # ``dummy_param`` is also what encode_text reads to find the device.
+        self.dummy_param = nn.Parameter(torch.empty(0))
+        self.dummy_nta_embed = nn.Parameter(torch.zeros(config.content_dim))
+        self.dummy_ta_embed = nn.Parameter(torch.zeros(config.content_dim))
+        # -- Instruction embedding (actual values loaded from safetensors) --
+        # Placeholder shapes only; both buffers are listed in
+        # _DYNAMIC_BUFFERS and are re-registered when weights are loaded.
+        self.register_buffer(
+            "instruction_embedding",
+            torch.zeros(1, 1, config.task_instruction_dim),
+        )
+        self.register_buffer(
+            "instruction_lengths",
+            torch.ones(1, dtype=torch.long),
+        )
+        # -- Scheduler --
+        self.scheduler = FlowMatchEulerScheduler()
+        # -- Derived constants --
+        # Integer division of the sample rate by the downsampling ratio.
+        self.latent_token_rate = config.sample_rate // config.downsampling_ratio
+        # External models are loaded AFTER weight loading in from_pretrained
+        self.text_encoder_backbone = None
+        self.text_tokenizer = None
+        self.audio_tokenizer = None
+        # Filled by _load_external_models when special tokens are configured.
+        self._special_token_ids = []
+        self._special_token_id_to_index = {}
+        self.post_init()
+    def _load_external_models(self, model_dir: str | None = None, **kwargs):
+        self.text_encoder_backbone = _load_text_encoder_backbone(
+            self.config.text_encoder_name, **kwargs
+        )
+        self.text_encoder_backbone.eval()
+        for p in self.text_encoder_backbone.parameters():
+            p.requires_grad = False
+        import os
+        tokenizer_local = (
+            model_dir
+            if model_dir and os.path.isfile(os.path.join(model_dir, "tokenizer.json"))
+            else None
+        )
+        self.text_tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_local or self.config.text_encoder_name, **kwargs
+        )
+        if self.config.special_tokens:
+            self.text_tokenizer.add_special_tokens(
+                {"additional_special_tokens": self.config.special_tokens}
+            )
+            old_vocab = self.text_encoder_backbone.get_input_embeddings().num_embeddings
+            new_vocab = len(self.text_tokenizer)
+            if new_vocab != old_vocab:
+                self.text_encoder_backbone.resize_token_embeddings(new_vocab)
+            self._special_token_ids = [
+                self.text_tokenizer.convert_tokens_to_ids(t)
+                for t in self.config.special_tokens
+            ]
+            self._special_token_id_to_index = {
+                tid: idx for idx, tid in enumerate(self._special_token_ids)
+            }
+        self.audio_tokenizer = AutoModel.from_pretrained(
+            self.config.tokenizer_name, trust_remote_code=True, **kwargs
+        )
+        self.audio_tokenizer.eval()
+        for p in self.audio_tokenizer.parameters():
+            p.requires_grad = False
+    def _load_dynamic_buffers(self, model_dir: str):
+        import os
+        from safetensors.torch import load_file
+        sf_path = os.path.join(model_dir, "model.safetensors")
+        if not os.path.isfile(sf_path):
+            return
+        state = load_file(sf_path)
+        for name in self._DYNAMIC_BUFFERS:
+            if name in state:
+                self.register_buffer(name, state[name])
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        kwargs.setdefault("ignore_mismatched_sizes", True)
+        model = super().from_pretrained(
+            pretrained_model_name_or_path, *model_args, **kwargs
+        )
+        model._load_dynamic_buffers(str(pretrained_model_name_or_path))
+        ext_kwargs = {}
+        if kwargs.get("local_files_only"):
+            ext_kwargs["local_files_only"] = True
+        model._load_external_models(
+            model_dir=str(pretrained_model_name_or_path), **ext_kwargs
+        )
+        return model
+    @staticmethod
+    def compose_prompt(
+        content: str | None = None,
+        caption: str | None = None,
+        speech: str | None = None,
+        asr: str | None = None,
+        sfx: str | None = None,
+        music: str | None = None,
+        env: str | None = None,
+    ) -> str:
+        return compose_prompt(
+            content=content, caption=caption, speech=speech,
+            asr=asr, sfx=sfx, music=music, env=env,
+        )
+    # ------------------------------------------------------------------
+    # Text encoding
+    # ------------------------------------------------------------------
+    def _get_model_inputs(self, input_ids: torch.Tensor):
+        if not self._special_token_ids:
+            return {"input_ids": input_ids}
+        special_emb = self.content_encoder.text_encoder.special_token_embedding
+        input_embeds = self.text_encoder_backbone.get_input_embeddings()(input_ids)
+        for token_id, token_idx in self._special_token_id_to_index.items():
+            mask = input_ids == token_id
+            if mask.any():
+                input_embeds[mask] = special_emb.weight[token_idx].to(
+                    input_embeds.dtype
+                )
+        return {"inputs_embeds": input_embeds}
+    @torch.no_grad()
+    def encode_text(self, prompts: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
+        device = self.dummy_param.device
+        batch = self.text_tokenizer(
+            prompts,
+            max_length=self.config.tokenizer_max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        )
+        input_ids = batch.input_ids.to(device)
+        attention_mask = batch.attention_mask.to(device)
+        model_inputs = self._get_model_inputs(input_ids)
+        output = self.text_encoder_backbone(
+            **model_inputs, attention_mask=attention_mask
+        ).last_hidden_state
+        content = self.content_encoder.text_encoder.proj(output)
+        content_mask = attention_mask.bool()
+        return content, content_mask
+    # ------------------------------------------------------------------
+    # Duration helpers
+    # ------------------------------------------------------------------
+    def _prepare_local_duration(
+        self, pred: torch.Tensor, mask: torch.Tensor
+    ) -> torch.Tensor:
+        pred = torch.exp(pred) * mask
+        pred = torch.ceil(pred) - self.config.duration_offset
+        pred *= self.config.frame_resolution
+        pred = torch.round(pred * self.latent_token_rate)
+        return pred
+    def _prepare_global_duration(
+        self,
+        global_pred: torch.Tensor,
+        local_pred: torch.Tensor,
+        is_time_aligned: torch.Tensor,
+    ) -> torch.Tensor:
+        global_pred = torch.exp(global_pred) - self.config.duration_offset
+        result = torch.round(global_pred * self.latent_token_rate)
+        pred_from_local = local_pred.sum(1)
+        result[is_time_aligned] = pred_from_local[is_time_aligned]
+        return result.long()
+    def _expand_by_duration(
+        self,
+        x: torch.Tensor,
+        content_mask: torch.Tensor,
+        local_duration: torch.Tensor,
+        global_duration: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        latent_length = global_duration
+        latent_mask = create_mask_from_length(latent_length).to(
+            content_mask.device
+        )
+        attn_mask = content_mask.unsqueeze(-1) * latent_mask.unsqueeze(1)
+        align_path = create_alignment_path(local_duration, attn_mask)
+        expanded_x = torch.matmul(
+            align_path.transpose(1, 2).to(x.dtype), x
+        )
+        return expanded_x, latent_mask
+    def _get_backbone_input(
+        self,
+        target_length: int,
+        content: torch.Tensor,
+        content_mask: torch.Tensor,
+        time_aligned_content: torch.Tensor,
+        is_time_aligned: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Assemble the conditioning tensors passed to the backbone.
+
+        Args:
+            target_length: Length passed to ``trim_or_pad_length`` for
+                ``time_aligned_content`` (third argument ``1``; presumably the
+                dimension to trim/pad -- confirm against the helper).
+            content: Content features; cloned before modification.
+            content_mask: Mask for ``content``; cloned before modification.
+            time_aligned_content: Expanded content features.
+            is_time_aligned: Per-sample boolean flags.
+
+        Returns:
+            ``(context, context_mask, time_aligned_content)``.
+        """
+        time_aligned_content = trim_or_pad_length(
+            time_aligned_content, target_length, 1
+        )
+        # Samples that are NOT time-aligned get the dummy time-aligned embedding.
+        # NOTE(review): this writes in place into whatever trim_or_pad_length
+        # returned; if that aliases the caller's tensor, the caller sees the
+        # change too -- confirm against the helper.
+        time_aligned_content[~is_time_aligned] = self.dummy_ta_embed.to(
+            time_aligned_content.dtype
+        )
+        # Time-aligned samples get the dummy non-time-aligned embedding as
+        # context, and only their first context position stays unmasked.
+        context = content.clone()
+        context[is_time_aligned] = self.dummy_nta_embed.to(context.dtype)
+        context_mask = content_mask.detach().clone()
+        context_mask[is_time_aligned, 1:] = False
+        # Truncate the context to the longest valid length among the
+        # non-time-aligned samples; if every sample is time-aligned, keep the
+        # full length.
+        if is_time_aligned.sum().item() < content.size(0):
+            trunc_nta_length = int(
+                content_mask[~is_time_aligned].sum(1).max().item()
+            )
+        else:
+            trunc_nta_length = content.size(1)
+        context = context[:, :trunc_nta_length]
+        context_mask = context_mask[:, :trunc_nta_length]
+        return context, context_mask, time_aligned_content
+    # ------------------------------------------------------------------
+    # Denoising loop
+    # ------------------------------------------------------------------
+    def _iterative_denoise(
+        self,
+        latent: torch.Tensor,
+        timesteps: torch.Tensor,
+        cfg: bool,
+        cfg_scale: float,
+        backbone_input: dict,
+    ) -> torch.Tensor:
+        for timestep in timesteps:
+            if cfg:
+                latent_input = torch.cat([latent, latent])
+            else:
+                latent_input = latent
+            noise_pred: torch.Tensor = self.backbone(
+                x=latent_input, timesteps=timestep, **backbone_input
+            )
+            if cfg:
+                noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + cfg_scale * (
+                    noise_pred_cond - noise_pred_uncond
+                )
+            latent = self.scheduler.step(
+                noise_pred, timestep, latent
+            ).prev_sample
+        return latent
+    # ------------------------------------------------------------------
+    # Main generation entry point
+    # ------------------------------------------------------------------
+    @torch.inference_mode()
+    def generate(
+        self,
+        prompts: str | list[str],
+        num_steps: int = 25,
+        guidance_scale: float = 5.0,
+        sway_sampling_coef: float = -1.0,
+    ) -> torch.Tensor:
+        """Generate waveforms from one or more text prompts.
+
+        Args:
+            prompts: A single prompt or a list of prompts; a bare string is
+                wrapped into a one-element list.
+            num_steps: Step count handed to the sigma-schedule helper.
+            guidance_scale: Classifier-free guidance weight. Guidance is only
+                enabled when this is strictly greater than 1.0.
+            sway_sampling_coef: Coefficient handed to ``compute_sway_sigmas``.
+                Any falsy value (e.g. ``0.0``) selects
+                ``compute_linear_sigmas`` instead.
+
+        Returns:
+            The output of ``self.audio_tokenizer.decode``; if that output is
+            3-D, ``squeeze(1)`` is applied to it.
+        """
+        if isinstance(prompts, str):
+            prompts = [prompts]
+        device = self.dummy_param.device
+        batch_size = len(prompts)
+        classifier_free_guidance = guidance_scale > 1.0
+        # 1. Encode text
+        content, content_mask = self.encode_text(prompts)
+        # 2. Get instruction embedding
+        if self.config.use_zero_instruction:
+            # All-zero instruction of length 1, broadcast over the batch.
+            instruction = torch.zeros(
+                1, 1, self.config.task_instruction_dim,
+                device=device, dtype=content.dtype,
+            ).expand(batch_size, -1, -1)
+            instruction_lengths = torch.ones(
+                batch_size, device=device, dtype=torch.long
+            )
+        else:
+            instruction = self.instruction_embedding.to(content.dtype).expand(
+                batch_size, -1, -1
+            )
+            instruction_lengths = self.instruction_lengths.expand(batch_size)
+        # 3. Content adapter
+        instruction_mask = create_mask_from_length(
+            instruction_lengths, max_length=instruction.size(1)
+        ).to(device)
+        (
+            content, content_mask, global_duration_pred, local_duration_pred,
+        ) = self.content_adapter(
+            content, content_mask, instruction, instruction_mask
+        )
+        # 4. Duration
+        # This entry point marks every sample as not time-aligned.
+        is_time_aligned = torch.zeros(
+            batch_size, dtype=torch.bool, device=device
+        )
+        local_latent_duration = self._prepare_local_duration(
+            local_duration_pred, content_mask
+        )
+        global_latent_duration = self._prepare_global_duration(
+            global_duration_pred, local_latent_duration, is_time_aligned
+        )
+        time_aligned_content, latent_mask = self._expand_by_duration(
+            x=content,
+            content_mask=content_mask,
+            local_duration=local_latent_duration,
+            global_duration=global_latent_duration,
+        )
+        # 5. Prepare backbone input
+        context, context_mask, time_aligned_content = self._get_backbone_input(
+            target_length=time_aligned_content.size(1),
+            content=content,
+            content_mask=content_mask,
+            time_aligned_content=time_aligned_content,
+            is_time_aligned=is_time_aligned,
+        )
+        # 6. CFG: duplicate with unconditional
+        # The unconditional half (zeroed conditioning, same masks) goes first;
+        # _iterative_denoise splits the backbone output in that same order.
+        if classifier_free_guidance:
+            time_aligned_content = torch.cat([
+                torch.zeros_like(time_aligned_content),
+                time_aligned_content,
+            ])
+            context = torch.cat([
+                torch.zeros_like(context), context
+            ])
+            context_mask = torch.cat([
+                context_mask.detach().clone(), context_mask
+            ])
+            latent_mask = torch.cat([
+                latent_mask.detach().clone(), latent_mask
+            ])
+        # 7. Prepare latent noise
+        # Noise is drawn for batch_size samples only; _iterative_denoise
+        # duplicates the latent itself when CFG is enabled.
+        latent_length = int(latent_mask.sum(1).max().item())
+        latent = torch.randn(
+            batch_size, self.config.latent_dim, latent_length,
+            device=device, dtype=content.dtype,
+        )
+        # 8. Sigmas schedule
+        if sway_sampling_coef:
+            sigmas = compute_sway_sigmas(num_steps, sway_sampling_coef)
+        else:
+            sigmas = compute_linear_sigmas(num_steps)
+        self.scheduler.set_timesteps(sigmas, device=device)
+        timesteps = self.scheduler.timesteps
+        # 9. Denoise
+        latent = self._iterative_denoise(
+            latent=latent,
+            timesteps=timesteps,
+            cfg=classifier_free_guidance,
+            cfg_scale=guidance_scale,
+            backbone_input={
+                "x_mask": latent_mask,
+                "context": context,
+                "context_mask": context_mask,
+                "time_aligned_context": time_aligned_content,
+            },
+        )
+        # 10. Decode to waveform
+        # The latent is transposed on dims 1 and 2 before decoding.
+        waveform = self.audio_tokenizer.decode(
+            latent.transpose(1, 2)
+        )
+        if waveform.dim() == 3:
+            waveform = waveform.squeeze(1)
+        return waveform

modules.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import math
+import warnings
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import einops
+from einops import rearrange
+def trunc_normal_(tensor, mean, std, a, b):
+    def norm_cdf(x):
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    with torch.no_grad():
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        tensor.erfinv_()
+        tensor.mul_(std * math.sqrt(2.0))
+        tensor.add_(mean)
+        tensor.clamp_(min=a, max=b)
+        return tensor
+def film_modulate(x, shift, scale):
+    return x * (1 + scale) + shift
+def timestep_embedding(timesteps, dim, max_period=10000):
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period)
+        * torch.arange(start=0, end=half, dtype=torch.float32)
+        / half
+    ).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat(
+            [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+        )
+    return embedding
+def unpatchify(x, channels=3, input_type="2d", img_size=None):
+    if input_type == "2d":
+        patch_size = int((x.shape[2] // channels) ** 0.5)
+        h, w = img_size[0] // patch_size, img_size[1] // patch_size
+        x = rearrange(
+            x,
+            "B (h w) (p1 p2 C) -> B C (h p1) (w p2)",
+            h=h,
+            p1=patch_size,
+            p2=patch_size,
+        )
+    elif input_type == "1d":
+        patch_size = int(x.shape[2] // channels)
+        h = x.shape[1]
+        x = rearrange(x, "B h (p1 C) -> B C (h p1)", h=h, p1=patch_size)
+    return x
+class TimestepEmbedder(nn.Module):
+    def __init__(self, hidden_size, frequency_embedding_size=256, out_size=None):
+        super().__init__()
+        if out_size is None:
+            out_size = hidden_size
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, out_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    def forward(self, t):
+        t_freq = timestep_embedding(t, self.frequency_embedding_size).type(
+            self.mlp[0].weight.dtype
+        )
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class PatchEmbed(nn.Module):
+    def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type="2d"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.input_type = input_type
+        if input_type == "2d":
+            self.proj = nn.Conv2d(
+                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True
+            )
+        elif input_type == "1d":
+            self.proj = nn.Conv1d(
+                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True
+            )
+    def forward(self, x):
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+class PE_wrapper(nn.Module):
+    def __init__(self, dim=768, method="abs", length=None, **kwargs):
+        super().__init__()
+        self.method = method
+        if method == "abs":
+            self.length = length
+            self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
+            trunc_normal_(self.abs_pe, mean=0.0, std=0.02, a=-0.04, b=0.04)
+        elif method == "none":
+            self.id = nn.Identity()
+        else:
+            raise NotImplementedError
+    def forward(self, x):
+        if self.method == "abs":
+            _, L, _ = x.shape
+            assert L <= self.length
+            x = x + self.abs_pe[:, :L, :]
+        elif self.method == "none":
+            x = self.id(x)
+        return x
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+class GELU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.approximate = approximate
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate, approximate=self.approximate)
+        return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(
+            dtype=gate.dtype
+        )
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.gelu(hidden_states)
+        return hidden_states
+class GEGLU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states, gate = hidden_states.chunk(2, dim=-1)
+        return hidden_states * self.gelu(gate)
+class FeedForward(nn.Module):
+    def __init__(
+        self,
+        dim,
+        dim_out=None,
+        mult=4,
+        dropout=0.0,
+        activation_fn="geglu",
+        final_dropout=False,
+        inner_dim=None,
+        bias=True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        elif activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        else:
+            raise NotImplementedError
+        self.net = nn.ModuleList([])
+        self.net.append(act_fn)
+        self.net.append(nn.Dropout(dropout))
+        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states

scheduler.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import math
+from dataclasses import dataclass
+import torch
+@dataclass
+class SchedulerOutput:
+    prev_sample: torch.FloatTensor
+class FlowMatchEulerScheduler:
+    def __init__(self, num_train_timesteps: int = 1000):
+        self.num_train_timesteps = num_train_timesteps
+        self.sigmas = None
+        self.timesteps = None
+        self._step_index = None
+    def set_timesteps(self, sigmas, device):
+        if isinstance(sigmas, (list, tuple)):
+            sigmas = torch.tensor(sigmas, dtype=torch.float32)
+        elif not isinstance(sigmas, torch.Tensor):
+            sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
+        sigmas = sigmas.to(device=device)
+        self.timesteps = sigmas * self.num_train_timesteps
+        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=device)])
+        self._step_index = None
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: torch.FloatTensor,
+        sample: torch.FloatTensor,
+    ) -> SchedulerOutput:
+        if self._step_index is None:
+            self._step_index = (self.timesteps == timestep).nonzero()
+            self._step_index = 0 if self._step_index.numel() == 0 else self._step_index[0].item()
+        sample = sample.to(torch.float32)
+        sigma = self.sigmas[self._step_index]
+        sigma_next = self.sigmas[self._step_index + 1]
+        prev_sample = sample + (sigma_next - sigma) * model_output
+        prev_sample = prev_sample.to(model_output.dtype)
+        self._step_index += 1
+        return SchedulerOutput(prev_sample=prev_sample)
+def compute_sway_sigmas(num_steps: int, sway_sampling_coef: float = -1.0):
+    t = torch.linspace(0, 1, num_steps + 1)
+    t = t + sway_sampling_coef * (torch.cos(math.pi / 2.0 * t) - 1.0 + t)
+    sigmas = 1.0 - t
+    return sigmas
+def compute_linear_sigmas(num_steps: int):
+    return torch.linspace(1.0, 1.0 / num_steps, num_steps)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
+size 4309802

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:65c2d7defb6472fada8a935bb364ae3433f7451780c8a59ab6b3cfbaadb32608
+size 16349930

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,840 @@

+{
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250000": {
+      "content": "▁<extra_id_99>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250001": {
+      "content": "▁<extra_id_98>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250002": {
+      "content": "▁<extra_id_97>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250003": {
+      "content": "▁<extra_id_96>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250004": {
+      "content": "▁<extra_id_95>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250005": {
+      "content": "▁<extra_id_94>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250006": {
+      "content": "▁<extra_id_93>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250007": {
+      "content": "▁<extra_id_92>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250008": {
+      "content": "▁<extra_id_91>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250009": {
+      "content": "▁<extra_id_90>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250010": {
+      "content": "▁<extra_id_89>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250011": {
+      "content": "▁<extra_id_88>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250012": {
+      "content": "▁<extra_id_87>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250013": {
+      "content": "▁<extra_id_86>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250014": {
+      "content": "▁<extra_id_85>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250015": {
+      "content": "▁<extra_id_84>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250016": {
+      "content": "▁<extra_id_83>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250017": {
+      "content": "▁<extra_id_82>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250018": {
+      "content": "▁<extra_id_81>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250019": {
+      "content": "▁<extra_id_80>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250020": {
+      "content": "▁<extra_id_79>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250021": {
+      "content": "▁<extra_id_78>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250022": {
+      "content": "▁<extra_id_77>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250023": {
+      "content": "▁<extra_id_76>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250024": {
+      "content": "▁<extra_id_75>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250025": {
+      "content": "▁<extra_id_74>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250026": {
+      "content": "▁<extra_id_73>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250027": {
+      "content": "▁<extra_id_72>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250028": {
+      "content": "▁<extra_id_71>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250029": {
+      "content": "▁<extra_id_70>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250030": {
+      "content": "▁<extra_id_69>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250031": {
+      "content": "▁<extra_id_68>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250032": {
+      "content": "▁<extra_id_67>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250033": {
+      "content": "▁<extra_id_66>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250034": {
+      "content": "▁<extra_id_65>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250035": {
+      "content": "▁<extra_id_64>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250036": {
+      "content": "▁<extra_id_63>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250037": {
+      "content": "▁<extra_id_62>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250038": {
+      "content": "▁<extra_id_61>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250039": {
+      "content": "▁<extra_id_60>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250040": {
+      "content": "▁<extra_id_59>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250041": {
+      "content": "▁<extra_id_58>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250042": {
+      "content": "▁<extra_id_57>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250043": {
+      "content": "▁<extra_id_56>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250044": {
+      "content": "▁<extra_id_55>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250045": {
+      "content": "▁<extra_id_54>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250046": {
+      "content": "▁<extra_id_53>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250047": {
+      "content": "▁<extra_id_52>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250048": {
+      "content": "▁<extra_id_51>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250049": {
+      "content": "▁<extra_id_50>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250050": {
+      "content": "▁<extra_id_49>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250051": {
+      "content": "▁<extra_id_48>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250052": {
+      "content": "▁<extra_id_47>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250053": {
+      "content": "▁<extra_id_46>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250054": {
+      "content": "▁<extra_id_45>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250055": {
+      "content": "▁<extra_id_44>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250056": {
+      "content": "▁<extra_id_43>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250057": {
+      "content": "▁<extra_id_42>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250058": {
+      "content": "▁<extra_id_41>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250059": {
+      "content": "▁<extra_id_40>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250060": {
+      "content": "▁<extra_id_39>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250061": {
+      "content": "▁<extra_id_38>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250062": {
+      "content": "▁<extra_id_37>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250063": {
+      "content": "▁<extra_id_36>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250064": {
+      "content": "▁<extra_id_35>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250065": {
+      "content": "▁<extra_id_34>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250066": {
+      "content": "▁<extra_id_33>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250067": {
+      "content": "▁<extra_id_32>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250068": {
+      "content": "▁<extra_id_31>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250069": {
+      "content": "▁<extra_id_30>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250070": {
+      "content": "▁<extra_id_29>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250071": {
+      "content": "▁<extra_id_28>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250072": {
+      "content": "▁<extra_id_27>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250073": {
+      "content": "▁<extra_id_26>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250074": {
+      "content": "▁<extra_id_25>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250075": {
+      "content": "▁<extra_id_24>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250076": {
+      "content": "▁<extra_id_23>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250077": {
+      "content": "▁<extra_id_22>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250078": {
+      "content": "▁<extra_id_21>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250079": {
+      "content": "▁<extra_id_20>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250080": {
+      "content": "▁<extra_id_19>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250081": {
+      "content": "▁<extra_id_18>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250082": {
+      "content": "▁<extra_id_17>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250083": {
+      "content": "▁<extra_id_16>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250084": {
+      "content": "▁<extra_id_15>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250085": {
+      "content": "▁<extra_id_14>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250086": {
+      "content": "▁<extra_id_13>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250087": {
+      "content": "▁<extra_id_12>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250088": {
+      "content": "▁<extra_id_11>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250089": {
+      "content": "▁<extra_id_10>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250090": {
+      "content": "▁<extra_id_9>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250091": {
+      "content": "▁<extra_id_8>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250092": {
+      "content": "▁<extra_id_7>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250093": {
+      "content": "▁<extra_id_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250094": {
+      "content": "▁<extra_id_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250095": {
+      "content": "▁<extra_id_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250096": {
+      "content": "▁<extra_id_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250097": {
+      "content": "▁<extra_id_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250098": {
+      "content": "▁<extra_id_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "250099": {
+      "content": "▁<extra_id_0>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [],
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

utils.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import torch
+def create_mask_from_length(lengths: torch.Tensor, max_length: int | None = None):
+    lengths = torch.as_tensor(lengths)
+    if lengths.ndim == 0:
+        lengths = lengths.unsqueeze(0)
+    lengths = lengths.long()
+    if max_length is None:
+        if lengths.numel() == 0:
+            max_length = 0
+        else:
+            max_length = int(lengths.max().item())
+    idxs = torch.arange(max_length, device=lengths.device).reshape(1, -1)
+    mask = idxs < lengths.view(-1, 1)
+    return mask
def convert_pad_shape(pad_shape: list[list[int]]):
    """Flatten per-dimension padding pairs into a single list, last pair first.

    Args:
        pad_shape: One ``[before, after]`` pair per dimension, listed from the
            first dimension to the last.

    Returns:
        A flat list holding the pairs in reversed order (the pair for the last
        dimension comes first), with the order inside each pair unchanged.
    """
    flat = []
    # Walk the pairs back-to-front while keeping each pair's internal order.
    for pair in pad_shape[::-1]:
        flat.extend(pair)
    return flat
def create_alignment_path(duration: torch.Tensor, mask: torch.Tensor):
    """Turn per-step durations into a 0/1 alignment path of shape ``(b, t_x, t_y)``.

    For every batch row, step ``i`` along ``t_x`` is assigned the ``t_y``
    positions ``j`` with ``cum[i - 1] <= j < cum[i]``, where ``cum`` is the
    cumulative sum of ``duration`` along dim 1 and ``cum[-1]`` is taken as 0.
    The result is finally multiplied by ``mask``.

    Args:
        duration: Durations with ``b * t_x`` elements, accumulated along
            dim 1. The cumulative values are cast to ``long`` inside
            ``create_mask_from_length``, so fractional parts are truncated.
            NOTE(review): presumably shaped ``(b, t_x)`` -- confirm with callers.
        mask: Tensor of shape ``(b, t_x, t_y)`` that is multiplied into the
            path; it also supplies the three sizes.

    Returns:
        Tensor of shape ``(b, t_x, t_y)``: the float path multiplied by ``mask``.
    """
    b, t_x, t_y = mask.shape
    # Exclusive end position on the t_y axis for every step.
    cum_duration = torch.cumsum(duration, 1)
    cum_duration_flat = cum_duration.view(b * t_x)
    # Row i is 1 on every position before the end of step i.
    path = create_mask_from_length(cum_duration_flat, t_y).float()
    path = path.view(b, t_x, t_y)
    # Subtract the previous step's row (shifted down by one along t_x, zeros
    # for the first step) so each row keeps only its own span.
    path = path - torch.nn.functional.pad(
        path, convert_pad_shape([[0, 0], [1, 0], [0, 0]])
    )[:, :-1]
    path = path * mask
    return path
def trim_or_pad_length(x: torch.Tensor, target_length: int, length_dim: int):
    """Force ``x`` to have exactly ``target_length`` entries along ``length_dim``.

    Args:
        x: Tensor to resize.
        target_length: Desired size of dimension ``length_dim``.
        length_dim: Index of the dimension to trim or pad.

    Returns:
        ``x`` itself when the size already matches; a slice keeping the first
        ``target_length`` entries when ``x`` is too long; otherwise ``x`` with
        zeros (same dtype and device) appended at the end of ``length_dim``.
    """
    missing = target_length - x.shape[length_dim]
    if missing == 0:
        return x
    if missing < 0:
        # Too long: keep every dimension whole except the one being trimmed.
        index = [slice(None) for _ in range(x.ndim)]
        index[length_dim] = slice(0, target_length)
        return x[tuple(index)]
    # Too short: append a zero block that covers the missing entries.
    filler_shape = list(x.shape)
    filler_shape[length_dim] = missing
    filler = x.new_zeros(filler_shape)
    return torch.cat([x, filler], dim=length_dim)