Locke committed on
Commit 3e934f1 · 1 Parent(s): dcc1d9d
.gitattributes CHANGED
@@ -1,3 +1,7 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
1
+ *.png filter=lfs diff=lfs merge=lfs -text
2
+ *.wav filter=lfs diff=lfs merge=lfs -text
3
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
4
+ *.jpg filter=lfs diff=lfs merge=lfs -text
5
  *.7z filter=lfs diff=lfs merge=lfs -text
6
  *.arrow filter=lfs diff=lfs merge=lfs -text
7
  *.bin filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Meituan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,388 @@
1
+ ---
2
+ license: mit
3
+ library_name: LongCat-Next
4
+ pipeline_tag: any-to-any
5
+ tags:
6
+ - transformers
7
+ - multimodal
8
+ ---
9
+
10
+ # LongCat-Next
11
+
12
+ <div align="center">
13
+ <img src="https://raw.githubusercontent.com/meituan-longcat/LongCat-Flash-Chat/main/figures/longcat_logo.svg"
14
+ width="300"
15
+ alt="LongCat Logo"/>
16
+ </div>
17
+
18
+ <hr>
19
+
20
+ <div align="center" style="line-height: 1;">
21
+ <a href="https://longcat.chat/longcat-next/intro" target="_blank" style="margin: 2px;">
22
+ <img alt="Blog" src="https://img.shields.io/badge/Blog-LongCatNext-white?logo=safari&logoColor=white&color=purple" style="display: inline-block; vertical-align: middle;"/>
23
+ </a>
24
+ <a href="https://huggingface.co/meituan-longcat" target="_blank" style="margin: 2px;">
25
+ <img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-LongCatNext-ffc107?color=ffc107&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
26
+ </a>
27
+ <a href="https://github.com/meituan-longcat/LongCat-Next" target="_blank" style="margin: 2px;">
28
+ <img alt="GitHub" src="https://img.shields.io/badge/GitHub-LongCatNext-white?logo=github&logoColor=white&color=a4b5d5" style="display: inline-block; vertical-align: middle;"/>
29
+ </a>
30
+ <a href="https://longcat.chat/longcat-next" target="_blank" style="margin: 2px;">
31
+ <img alt="Demo" src="https://img.shields.io/badge/Demo-LongCatNext-white?logo=googleplay&logoColor=white&color=eabcdd" style="display: inline-block; vertical-align: middle;"/>
32
+ </a>
33
+ </div>
34
+
35
+ <div align="center" style="line-height: 1;">
36
+ <a href="https://github.com/meituan-longcat/LongCat-Flash-Chat/blob/main/figures/wechat_official_accounts.png" target="_blank" style="margin: 2px;">
37
+ <img alt="Wechat" src="https://img.shields.io/badge/WeChat-LongCat-brightgreen?logo=wechat&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
38
+ </a>
39
+ <a href="https://x.com/Meituan_LongCat" target="_blank" style="margin: 2px;">
40
+ <img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-LongCat-white?logo=x&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
41
+ </a>
42
+ </div>
43
+
44
+ <div align="center" style="line-height: 1;">
45
+ <a href="https://huggingface.co/meituan-longcat/LongCat-Next/blob/main/LICENSE" style="margin: 2px;">
46
+ <img alt="License" src="https://img.shields.io/badge/License-MIT-f5de53?&color=f5de53" style="display: inline-block; vertical-align: middle;"/>
47
+ </a>
48
+ </div>
49
+
50
+ <p align="center">
51
+ <a href="https://github.com/meituan-longcat/LongCat-Next/blob/main/tech_report.pdf">
52
+ <b>Tech Report</b>&nbsp;📄
53
+ </a>
54
+ </p>
55
+
56
+
57
+
58
+
59
+
60
+ ## Model Introduction
61
+
62
+ ![overview](./assets/overview.png)
63
+
64
+
65
+ We develop **LongCat-Next**, a native multimodal model that processes text, vision, and audio under a single autoregressive objective with minimal inductive bias beyond the language paradigm. As an industrial-strength foundation model at the A3B scale, it excels at seeing, creating, and talking, achieving strong performance across a wide range of multimodal benchmarks. In particular, by leveraging semantically complete discrete representations, it surpasses the long-standing performance ceiling of discrete vision modeling on understanding tasks and provides a unified solution for visual understanding and generation. This success demonstrates that discrete tokens can universally represent multimodal signals and be deeply internalized within a single discrete embedding space. We further provide extensive experiments that analyze this unified discrete training paradigm and uncover several interesting findings.
66
+
67
+ As a meaningful attempt toward native multimodality, we open-source **LongCat-Next** and its tokenizers, hoping to foster further research and development in the community.
68
+
69
+
70
+ ### Key Features
71
+
72
+ This work primarily addresses the fundamental barrier to native multimodality through a design philosophy that prioritizes simplicity, treating vision and audio as intrinsic extensions of language. As a step toward this goal, we present LongCat-Next, a discrete native multimodal model that achieves industrial-strength performance within discrete frameworks while remaining highly competitive across a wide range of specialized domains. Built upon the LongCat-Flash-Lite MoE backbone (A3B) as a _multi-task_ learner, the model unifies language, vision, and audio within a single discrete framework. Our principal contributions are as follows:
73
+
74
+ #### 🌟 Discrete Native Autoregression Paradigm (DiNA).
75
+ We introduce DiNA, a unified paradigm that extends next-token prediction from language to native multimodality, which internalizes diverse modalities into a shared token space. It simplifies multimodal modeling by creating modality-aware tokenizer-detokenizer pairs and leveraging the established training infrastructure of large language models.
76
+
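+ As a rough illustration (a sketch based on the offsets published in `./config.json`, not the repository's actual embedding code), modality-specific codebook indices are folded into one autoregressive vocabulary by id range, so a single next-token head scores text, audio, and visual tokens alike:
+
+ ```python
+ # Sketch only: vocabulary layout taken from ./config.json.
+ TEXT_VOCAB = 131072      # plain text tokens
+ AUDIO_OFFSET = 131125    # first audio token id (multimodal special tokens sit in between)
+ VISUAL_OFFSET = 150581   # first visual token id
+ VOCAB_SIZE = 282624      # total unified vocabulary size
+
+ def audio_code_to_token_id(code: int) -> int:
+     """Map an audio codebook index into the shared LM vocabulary."""
+     return AUDIO_OFFSET + code
+
+ def visual_code_to_token_id(code: int) -> int:
+     """Map a visual codebook index into the shared LM vocabulary."""
+     return VISUAL_OFFSET + code
+
+ def route_token(token_id: int) -> str:
+     """Route a generated id back to its modality-specific detokenizer."""
+     if token_id < TEXT_VOCAB:
+         return "text"
+     if token_id < AUDIO_OFFSET:
+         return "special"
+     if token_id < VISUAL_OFFSET:
+         return "audio"
+     return "visual"
+ ```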
77
+
78
+ #### 🌟 Semantic Completeness for Discrete Visual Representation.
79
+ We improve discrete visual modeling by combining Semantic-and-Aligned Encoders (SAE) with Residual Vector Quantization (RVQ). This integration creates hierarchical discrete tokens that preserve both semantic abstraction and fine-grained visual details, surpassing traditional representation limitations.
80
+
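+ For intuition, here is a minimal residual-quantization sketch (illustrative only; the actual tokenizer ships with the model code, and `./config.json` reports `depth=8` codebooks of size 16384 over 3584-dimensional features): each level quantizes the residual left by the previous levels, so early codes carry coarse semantics and later codes add fine-grained detail.
+
+ ```python
+ import torch
+
+ def rvq_encode(x, codebooks):
+     """Residual vector quantization sketch.
+
+     x: (B, D) features; codebooks: list of (K, D) tensors, one per depth level.
+     Returns per-level indices (B, depth) and the reconstructed features (B, D).
+     """
+     residual, quantized, indices = x, torch.zeros_like(x), []
+     for codebook in codebooks:
+         dists = torch.cdist(residual, codebook)    # (B, K) distances to all codes
+         idx = dists.argmin(dim=-1)                 # nearest code per vector
+         picked = codebook[idx]
+         quantized = quantized + picked             # accumulate the reconstruction
+         residual = residual - picked               # next level quantizes what is left
+         indices.append(idx)
+     return torch.stack(indices, dim=-1), quantized
+
+ # Toy usage with small shapes (the reported setup is depth=8, 16384 codes, dim 3584).
+ codebooks = [torch.randn(16, 8) for _ in range(8)]
+ ids, x_hat = rvq_encode(torch.randn(4, 8), codebooks)
+ ```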
81
+
82
+ #### 🌟 Discrete Native-Resolution Vision Transformer (dNaViT).
83
+ Analogous to linguistic tokenizers, we propose dNaViT as a highly flexible, unified discrete interface for vision that extracts semantic features as "visual words", constructing a hierarchical representation space supporting dynamic tokenization and detokenization. dNaViT integrates seamlessly with large language models, ensuring high performance without degradation.
84
+
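+ As a rough back-of-the-envelope (an assumption inferred from the `patch_size=14` and `spatial_merge_size=2` values in `./config.json`, not a documented formula), the dynamic token grid for a native-resolution image could scale as sketched below; note that a 1036×1036 input yields a 37×37 grid, matching the default `token_h`/`token_w` in the recommended image-generation parameters later in this card:
+
+ ```python
+ import math
+
+ PATCH_SIZE = 14          # visual patch size reported in ./config.json
+ SPATIAL_MERGE_SIZE = 2   # 2x2 patch merging reported in ./config.json
+
+ def visual_token_grid(height: int, width: int) -> tuple[int, int]:
+     """Hypothetical token-grid size for a native-resolution image."""
+     cell = PATCH_SIZE * SPATIAL_MERGE_SIZE   # 28 pixels per token edge
+     return math.ceil(height / cell), math.ceil(width / cell)
+
+ print(visual_token_grid(1036, 1036))   # -> (37, 37)
+ ```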
85
+ #### 🌟 Excelling in Seeing, Creating, and Talking in a Unified Model.
86
+ Within the framework of DiNA, visual understanding and generation are elegantly reformulated as two manifestations of the same predictive process without performance compromise. This formulation bridges the long-standing architectural divide while introducing minimal interference between these traditionally competing objectives and preserving core language capabilities. Remarkably, LongCat-Next achieves performance competitive with specialized understanding models and maintains strong generative quality, particularly in text rendering, even under a 28× compression ratio, while also excelling in advanced speech comprehension, low-latency voice conversation, and customizable voice cloning.
87
+
88
+
89
+ Please refer to our [technical report](./tech_report.pdf) for details!
90
+
91
+
92
+
93
+ ## Evaluation Results
94
+
95
+ ![evaluation](./assets/evaluation.png)
96
+
97
+
98
+
99
+
100
+ ## Quick Start
101
+ To use LongCat-Next with transformers, you need at least 3 GPUs (80 GB VRAM each, e.g., H100 or A100 80GB). We recommend the following environment:
102
+ * `python` >= 3.10
103
+ * `torch` >= 2.6
104
+ * `transformers` >= 4.57.6
105
+ * `accelerate` >= 1.10.0
106
+
107
+ ```shell
108
+ # (Install python=3.10, ffmpeg<7, soundfile==0.13.1)
109
+ conda env create -f environment.yml -v
110
+
111
+ # (Install torch and other pip dependencies)
112
+ pip install -r requirements.txt && pip install -r requirements-post.txt --no-build-isolation
113
+ ```
114
+
115
+ Basic Usage Example:
116
+ - Remember to replace `WEIGHT_PATH_TO_LONGCAT_NEXT` in `./config.json` with your local weight path, because the decoders are loaded lazily (one way to patch it is sketched below).
117
+
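+ A minimal sketch for patching the placeholder (an assumption, not an official setup script; adjust `snapshot_dir` to wherever you downloaded the weights):
+
+ ```python
+ # Sketch only: rewrite the lazy-loaded decoder weight paths in config.json.
+ import json, pathlib
+
+ snapshot_dir = "/path/to/LongCat-Next"   # hypothetical local download location
+ cfg_path = pathlib.Path(snapshot_dir) / "config.json"
+ cfg_path.write_text(cfg_path.read_text().replace("WEIGHT_PATH_TO_LONGCAT_NEXT", snapshot_dir))
+
+ # The image decoder and vocoder entries should now point at real files.
+ cfg = json.loads(cfg_path.read_text())
+ print(cfg["visual_config"]["visual_decoder_config"]["weight_path"])
+ print(cfg["audio_config"]["cosy24kvocoder_config"]["weight_path"])
+ ```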
118
+ ```python
119
+ import torch
120
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
121
+
122
+ # Load model
123
+ model_name = "meituan-longcat/LongCat-Next"
124
+ model = AutoModelForCausalLM.from_pretrained(
125
+ model_name,
126
+ torch_dtype=torch.bfloat16,
127
+ device_map="auto",
128
+ trust_remote_code=True,
129
+ )
130
+ model.eval()
131
+
132
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, fix_mistral_regex=True)
133
+ model.text_tokenizer = tokenizer # Dynamic binding
134
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
135
+
136
+ # Set messages
137
+ messages = [
138
+ {"role": "system", "content": "You are a helpful assistant."},
139
+ {"role": "user", "content": "What book is this?<longcat_img_start>./assets/book.png<longcat_img_end>"}
140
+ ]
141
+
142
+ # Apply chat-template
143
+ text_input = tokenizer.apply_chat_template(
144
+ messages,
145
+ tokenize=False,
146
+ add_generation_prompt=True,
147
+ )
148
+ print(f"{text_input=}")
149
+
150
+ # Preprocessing
151
+ text_inputs, visual_inputs, audio_inputs = processor(text=text_input, return_tensors="pt")
152
+ text_inputs = text_inputs.to(model.device)
153
+ if visual_inputs is not None:
154
+ visual_inputs = visual_inputs.to(model.device)
155
+ if audio_inputs is not None:
156
+ audio_inputs = audio_inputs.to(model.device)
157
+
158
+ # AR
159
+ with torch.no_grad():
160
+ outputs = model.generate(
161
+ input_ids=text_inputs["input_ids"],
162
+ visual_inputs=visual_inputs,
163
+ audio_inputs=audio_inputs,
164
+ return_dict_in_generate=True,
165
+ )
166
+
167
+ # Text decoding
168
+ output_input_ids = outputs.sequences
169
+ text_output = tokenizer.decode(output_input_ids[0][len(text_inputs["input_ids"][0]):], skip_special_tokens=True)
170
+ print(f"{text_output=}")
171
+
172
+ # Images decoding
173
+ output_visual_ids = outputs.visual_ids
174
+ if output_visual_ids.size(0) > 0:
175
+ image_path_list = model.model.decode_visual_ids_and_save(
176
+ output_visual_ids,
177
+ save_prefix="./output_image",
178
+ **model.generation_config.visual_generation_config["custom_params"],
179
+ )
180
+ print(f"{image_path_list=}")
181
+
182
+ # Audio decoding
183
+ output_audio_text_ids = outputs.audio_text_ids
184
+ output_audio_ids = outputs.audio_ids
185
+ if output_audio_text_ids.size(-1) > 0:
186
+ audio_text = tokenizer.decode(output_audio_text_ids[0], skip_special_tokens=True)
187
+ print(f"{audio_text=}")
188
+ if output_audio_ids.size(0) > 0:
189
+ audio_path_list = model.model.decode_audio_ids_and_save(
190
+ output_audio_ids,
191
+ save_prefix="./output_audio",
192
+ **model.generation_config.audio_generation_config["custom_params"],
193
+ )
194
+ print(f"{audio_path_list=}")
195
+ ```
196
+
197
+
198
+ <details>
199
+ <summary>Text - Tool Calling Example</summary>
200
+
201
+ ```python
202
+ from parse_model_response import parse_model_response
203
+
204
+ tools = [
205
+ {
206
+ "type": "function",
207
+ "function": {
208
+ "name": "func_add",
209
+ "description": "Calculate the sum of two numbers",
210
+ "parameters": {
211
+ "type": "object",
212
+ "properties": {
213
+ "x1": {"type": "number", "description": "The first addend"},
214
+ "x2": {"type": "number", "description": "The second addend"}
215
+ },
216
+ "required": ["x1", "x2"]
217
+ }
218
+ }
219
+ }
220
+ ]
221
+ messages = [
222
+ {"role": "system", "content": "You are a helpful assistant."},
223
+ {"role": "user", "content": "Please tell me what is $$125679 + 234519$$?"},
224
+ {
225
+ "role": "assistant",
226
+ "content": "I'll calculate the sum of 125679 and 234519 for you.",
227
+ "tool_calls": [{"type": "function", "function": {"name": "func_add", "arguments": {"x1": 125679, "x2": 234519}}}]
228
+ },
229
+ {"role": "tool", "name": "func_add", "content": '{"ans": 360198}'}
230
+ ]
231
+
232
+ text_input = tokenizer.apply_chat_template(
233
+ messages,
234
+ tools=tools, # add tools here
235
+ tokenize=False,
236
+ add_generation_prompt=True,
237
+ )
238
+ print(f"{text_input=}")
239
+
240
+
241
+ # Preprocessing - AR - Text decoding
242
+ ...
243
+
244
+ # Results parsing
245
+ parsed_message = parse_model_response(text_output.strip("\n"), tools)
246
+ print(f"{parsed_message=}")
247
+ ```
248
+ See [`parse_model_response.py`](./parse_model_response.py) for detailed implementation and examples.
249
+
250
+ </details>
251
+
252
+
253
+ <details>
254
+ <summary>Image - Understanding Example</summary>
255
+
256
+ ```python
257
+ # Simply replace the messages in the main example with the messages below.
258
+ messages = [
259
+ {"role": "user", "content": "What book is this?<longcat_img_start>./assets/book.png<longcat_img_end>"}
260
+ ]
261
+ ```
262
+
263
+ </details>
264
+
265
+
266
+ <details>
267
+ <summary>Image - Generation Example</summary>
268
+
269
+ ```python
270
+ # Simply replace the messages in the main example with the messages below.
271
+ messages = [
272
+ {"role": "system", "content": ""},
273
+ {"role": "user", "content": "A small kitten sitting naturally on a moss-covered forest floor, centered in the frame, holding a rectangular wooden sign gently with its front paws resting over the top edge. The kitten has soft, fluffy fur, a natural relaxed posture, and a calm, curious expression with a slightly open mouth (not exaggerated), looking directly at the camera.\n\nThe sign is positioned firmly in front of the kitten\'s chest, supported by its paws, with realistic contact and no floating effect. The board reads \"LongCat-Next: When Modalities Internalize as Multilingual Tokens\" in clean, sharp black text, perfectly legible.\n\nThe environment is a lush forest with tall trees, ferns, and soft green foliage. The ground is covered with moss and small plants. Background softly blurred with natural depth of field. Lighting is soft, diffused sunlight filtering through the trees, creating gentle highlights and shadows. Realistic photography style, natural colors, high detail, no cartoonish exaggeration.<longcat_img_start>"}
274
+ ]
275
+ ```
276
+
277
+ </details>
278
+
279
+
280
+ <details>
281
+ <summary>Audio - Audio-to-Text Example</summary>
282
+
283
+ ```python
284
+ # Simply replace the messages in the main example with the messages below.
285
+ messages = [
286
+ {"role": "user", "content": "<longcat_audio_start>./assets/math1.wav<longcat_audio_end>"}
287
+ ]
288
+
289
+ ```
290
+
291
+ </details>
292
+
293
+ <details>
294
+ <summary>Audio - Audio-to-Audio Example</summary>
295
+
296
+ ```python
297
+ # Simply replace the messages in the main example with the messages below.
298
+ messages = [
299
+ {"role": "system", "content": "Replicate the voice in the audio clip to formulate an answer:<longcat_audio_start>./assets/system_audio.wav<longcat_audio_end>"},
300
+ {"role": "user", "content": "<longcat_audio_start>./assets/math1.wav<longcat_audio_end><longcat_audiogen_start>"}
301
+ ]
302
+ ```
303
+
304
+ </details>
305
+
306
+ <details>
307
+ <summary>Audio - Speech Synthesis Example</summary>
308
+
309
+ ```python
310
+ # Simply replace the messages in the main example with the messages below.
311
+ messages = [
312
+ {"role": "system", "content": "Replicate the voice in the audio clip to formulate an answer:<longcat_audio_start>./assets/vc_zh3.wav<longcat_audio_end>"},
313
+ {"role": "user", "content": "用这个声音合成以下内容:明天的meeting在三楼的Conference Room举行。<longcat_audiogen_start>"}
314
+ ]
315
+ ```
316
+
317
+ </details>
318
+
319
+
320
+ <!-- > [!Tip] -->
321
+
322
+ > We recommend using the following set of sampling parameters for generation:
323
+ >
324
+ > - Text: `{"max_new_tokens":2048,"do_sample":false}`
325
+ > - Image - Understanding: `{"max_new_tokens":1024,"do_sample":true,"temperature":0.4,"top_k":40,"top_p":0.85,"repetition_penalty":1.1}`
326
+ > - Image - Generation: `{"max_new_tokens":2048,"do_sample":false,"visual_generation_config":{"do_sample":true,"temperature":0.5,"top_p":0.75,"top_k":1024,"custom_params":{"cfg_scale":3,"token_h":37,"token_w":37,"anyres_prefix":"<longcat_img_token_size>{h} {w}</longcat_img_token_size>"}}}`
327
+ > - Audio - Audio-to-Text: `{"max_new_tokens":1024,"do_sample":true,"temperature":0.2,"top_k":20,"top_p":0.85,"repetition_penalty":1.1}`
328
+ > - Audio - Audio-to-Audio/Speech Synthesis: `{"max_new_tokens":2048,"do_sample":true,"temperature":0.2,"top_k":20,"top_p":0.85,"repetition_penalty":1.1,"audio_generation_config":{"audio_parallel_decoding":false,"do_sample":true,"temperature":0.5,"top_k":5,"top_p":0.85,"repetition_penalty":1.3,"custom_params":{"sampling_rate":24000,"wave_concat_overlap":1200}}}`
329
+ >
330
+ > Please note that support for these sampling parameters varies across inference frameworks (for transformers, the inference parameter configuration is located in `./generation_config.json`).
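+ As a concrete example, the image-understanding settings above can be passed straight through `model.generate` on top of the Quick Start snippet (a sketch assuming the standard transformers sampling kwargs are honored; the nested `visual_generation_config`/`audio_generation_config` entries are read from `./generation_config.json`):
+
+ ```python
+ # Sketch: apply the recommended image-understanding sampling settings.
+ outputs = model.generate(
+     input_ids=text_inputs["input_ids"],
+     visual_inputs=visual_inputs,
+     audio_inputs=audio_inputs,
+     return_dict_in_generate=True,
+     max_new_tokens=1024,
+     do_sample=True,
+     temperature=0.4,
+     top_k=40,
+     top_p=0.85,
+     repetition_penalty=1.1,
+ )
+ ```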
331
+
332
+
333
+
334
+ ## Deployment
335
+
336
+ We have implemented basic adaptations in SGLang (code is being uploaded) to support the deployment of LongCat-Next.
337
+
338
+ ```shell
339
+ git clone [TBU]
340
+ cd nmm_infer
341
+ git checkout master
342
+ sh setup.sh
343
+ ```
344
+
345
+ ```shell
346
+ # Require CUDA >= 12.9
347
+
348
+ # Setup environment
349
+ source create_env.sh
350
+ source set_env.sh
351
+
352
+ # Run tests
353
+ python3 demo.py \
354
+ --model-path meituan-longcat/LongCat-Next \
355
+ --sequential \
356
+ --output-dir output \
357
+ --tasks vis_gen vis_und aud_qa spk_syn
358
+
359
+ ```
360
+
361
+
362
+ ## License Agreement
363
+ This repository, including both the model weights and the source code, is released under the **MIT License**.
364
+
365
+ Any contributions to this repository are licensed under the MIT License, unless otherwise stated. This license does not grant any rights to use Meituan trademarks or patents.
366
+
367
+ For details, see the [LICENSE](./LICENSE) file.
368
+
369
+ ## Usage Considerations
370
+ This model has not been specifically designed or comprehensively evaluated for every possible downstream application.
371
+
372
+ Developers should take into account the known limitations of large language models, including performance variations across different languages, and carefully assess accuracy, safety, and fairness before deploying the model in sensitive or high-risk scenarios.
373
+ It is the responsibility of developers and downstream users to understand and comply with all applicable laws and regulations relevant to their use case, including but not limited to data protection, privacy, and content safety requirements.
374
+
375
+ Nothing in this Model Card should be interpreted as altering or restricting the terms of the MIT License under which the model is released.
376
+
377
+
378
+ <!-- ## Citation
379
+
380
+ We kindly encourage citation of our work if you find it useful.
381
+
382
+ ```
383
+
384
+ ``` -->
385
+
386
+
387
+ ## Contact
388
+ Please contact us at <a href="mailto:longcat-team@meituan.com">longcat-team@meituan.com</a> or open an issue if you have any questions.
assets/book.png ADDED

Git LFS Details

  • SHA256: 973616383a25a76a71b18532452bc3d422516c0ce684895065cbdbaeb7c654e5
  • Pointer size: 131 Bytes
  • Size of remote file: 745 kB
assets/evaluation.png ADDED

Git LFS Details

  • SHA256: 82bc8ab1a053e71f1328241be1739f3b9d0f0c0f84501500070c2e8a49542759
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
assets/longcat_logo.svg ADDED
assets/math1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e88c12d17ba1b6d8a28fa6688311222673db0f958a3679347f03ba4afd4b78c2
3
+ size 1140560
assets/overview.png ADDED

Git LFS Details

  • SHA256: 945a0cf9961850f0db4e81dbf7a2ac588f6206e64b85336c21fed446cf99f8cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
assets/system_audio.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbb21a5cd57013406e1c18e8f267d05197bbbc3fdb8a65038d9c5a7799b9357a
3
+ size 254478
assets/vc_zh3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8313c738deac97e9c36cb861a85a896c9bbdaa22fe9f9f432feace766a75c65
3
+ size 1282618
config.json ADDED
@@ -0,0 +1,285 @@
1
+ {
2
+ "architectures": [
3
+ "LongcatNextForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_longcat_next.LongcatNextConfig",
9
+ "AutoModel": "modeling_longcat_next.LongcatNextModel",
10
+ "AutoModelForCausalLM": "modeling_longcat_next.LongcatNextForCausalLM"
11
+ },
12
+ "vocab_size": 282624,
13
+ "hidden_size": 3072,
14
+ "ffn_hidden_size": 6144,
15
+ "expert_ffn_hidden_size": 1024,
16
+ "num_layers": 14,
17
+ "num_attention_heads": 32,
18
+ "kv_lora_rank": 512,
19
+ "q_lora_rank": 1536,
20
+ "qk_rope_head_dim": 64,
21
+ "v_head_dim": 128,
22
+ "qk_nope_head_dim": 128,
23
+ "mla_scale_q_lora": true,
24
+ "mla_scale_kv_lora": true,
25
+ "routed_scaling_factor": 6.0,
26
+ "n_routed_experts": 256,
27
+ "rms_norm_eps": 1e-5,
28
+ "use_cache": true,
29
+ "bos_token_id": 1,
30
+ "eos_token_id": 2,
31
+ "rope_theta": 10000000,
32
+ "max_position_embeddings": 131072,
33
+ "zero_expert_num": 128,
34
+ "zero_expert_type": "identity",
35
+ "moe_topk": 12,
36
+ "ngram_vocab_size_ratio": 78,
37
+ "emb_neighbor_num": 4,
38
+ "emb_split_num": 4,
39
+ "torch_dtype": "bfloat16",
40
+ "transformers_version": "4.57.6",
41
+
42
+ "text_vocab_size": 131072,
43
+ "text_vocab_plus_multimodal_special_token_size": 131125,
44
+ "visual_embedding_layer_intermediate_size": 8192,
45
+ "visual_embedding_layer_hidden_act": "silu",
46
+ "visual_offset": 150581,
47
+ "audio_offset": 131125,
48
+
49
+
50
+ "visual_config": {
51
+ "image_start_token_id": 131106,
52
+ "image_end_token_id": 131107,
53
+ "image_pad_token_id": 131108,
54
+ "image_newline_token_id": 131109,
55
+
56
+ "_attn_implementation": "flash_attention_2",
57
+ "hidden_size": 1280,
58
+
59
+ "image_head_transformer_dims": 2048,
60
+ "image_head_transformer_ffn_scale": 16,
61
+ "image_head_transformer_layers": 4,
62
+
63
+ "vq_config": {
64
+ "codebook_dim": 3584,
65
+ "codebook_size": 16384,
66
+ "codebook_sizes": [
67
+ 16384,
68
+ 16384,
69
+ 16384,
70
+ 16384,
71
+ 16384,
72
+ 16384,
73
+ 16384,
74
+ 16384
75
+ ],
76
+ "decay": 0.99,
77
+ "depth": 8,
78
+
79
+ "commit_loss_ratio": 0.25,
80
+ "entropy_loss_ratio": 0,
81
+
82
+ "in_channels": 3584,
83
+ "quant_conv": true,
84
+ "quantizer_type": "rq",
85
+ "restart_unused_codes": true,
86
+ "shared_codebook": true,
87
+
88
+ "vq_loss_ratio": 0
89
+ },
90
+
91
+ "visual_decoder_config": {
92
+ "codebook_dim": 3584,
93
+
94
+ "image_decoder_config": {
95
+ "attention_dropout": 0.0,
96
+ "codebook_dim": 3584,
97
+ "distill_taps": [
98
+ 3,
99
+ 7,
100
+ 15,
101
+ 23
102
+ ],
103
+ "hidden_act": "gelu",
104
+ "hidden_size": 1024,
105
+ "intermediate_size": 2730,
106
+ "k_bias": false,
107
+ "layer_norm_eps": 1e-06,
108
+ "num_attention_heads": 16,
109
+ "num_hidden_layers": 32,
110
+ "patch_size": 14,
111
+ "q_bias": true,
112
+ "spatial_merge_size": 2,
113
+ "subln": true,
114
+ "swiglu": true,
115
+ "teacher_dims": {
116
+ "15": 1280,
117
+ "23": 1280,
118
+ "3": 1280,
119
+ "7": 1280
120
+ },
121
+ "temporal_patch_size": 2,
122
+ "v_bias": true
123
+ },
124
+
125
+ "transformer_config": {
126
+ "patch_size": 2,
127
+ "in_channels": 16,
128
+ "hidden_size": 2520,
129
+ "num_layers": 32,
130
+ "num_refiner_layers": 2,
131
+ "num_attention_heads": 21,
132
+ "num_kv_heads": 7,
133
+ "multiple_of": 256,
134
+ "norm_eps": 1e-5,
135
+ "axes_dim_rope": [40, 40, 40],
136
+ "axes_lens": [10000, 10000, 10000],
137
+ "text_feat_dim": 2048,
138
+ "timestep_scale": 1000.0
139
+ },
140
+
141
+ "vae_config": {
142
+ "act_fn": "silu",
143
+ "block_out_channels": [128, 256, 512, 512],
144
+ "down_block_types": [
145
+ "DownEncoderBlock2D",
146
+ "DownEncoderBlock2D",
147
+ "DownEncoderBlock2D",
148
+ "DownEncoderBlock2D"
149
+ ],
150
+ "in_channels": 3,
151
+ "latent_channels": 16,
152
+ "layers_per_block": 2,
153
+ "mid_block_add_attention": true,
154
+ "norm_num_groups": 32,
155
+ "out_channels": 3,
156
+ "sample_size": 1024,
157
+ "scaling_factor": 0.3611,
158
+ "shift_factor": 0.1159,
159
+ "up_block_types": [
160
+ "UpDecoderBlock2D",
161
+ "UpDecoderBlock2D",
162
+ "UpDecoderBlock2D",
163
+ "UpDecoderBlock2D"
164
+ ],
165
+ "use_post_quant_conv": false,
166
+ "use_quant_conv": false,
167
+ "force_upcast": true
168
+ },
169
+
170
+ "scheduler_config": {
171
+ "num_train_timesteps": 1000,
172
+ "dynamic_time_shift": true
173
+ },
174
+
175
+ "weight_path": "WEIGHT_PATH_TO_LONGCAT_NEXT/image_decoder/image_decoder.safetensors"
176
+ }
177
+ },
178
+
179
+
180
+ "audio_config": {
181
+ "audio_head_transformer_dims": 3072,
182
+ "audio_head_transformer_ffn_scale": 16,
183
+ "audio_head_transformer_layers": 4,
184
+
185
+ "audio_delim_token_id": 131116,
186
+ "audio_end_token_id": 131104,
187
+ "audio_pad_token_id": 131105,
188
+ "audio_start_token_id": 131103,
189
+ "audiogen_end_token_id": 131124,
190
+ "audiogen_start_token_id": 131123,
191
+ "audiotext_end_token_id": 131121,
192
+ "audiotext_pad_token_id": 131122,
193
+ "audiotext_start_token_id": 131120,
194
+
195
+ "_attn_implementation": "flash_attention_2",
196
+ "d_model": 1280,
197
+ "decoder_attention_heads": 20,
198
+ "decoder_ffn_dim": 5120,
199
+ "decoder_layers": 8,
200
+ "encoder_attention_heads": 20,
201
+ "encoder_ffn_dim": 5120,
202
+ "encoder_layers": 32,
203
+ "num_mel_bins": 128,
204
+
205
+ "avg_pooler": 4,
206
+ "decoder_kernel_size": 3,
207
+ "decoder_stride_size": 2,
208
+ "hop_length": 160,
209
+ "kernel_size": 3,
210
+ "max_audio_seconds": 30,
211
+ "n_fft": 400,
212
+ "num_hidden_layers": 32,
213
+ "sampling_rate": 16000,
214
+ "stride_size": 2,
215
+
216
+ "vq_config": {
217
+ "codebook_sizes": [
218
+ 8192,
219
+ 4096,
220
+ 2048,
221
+ 1024,
222
+ 1024,
223
+ 1024,
224
+ 1024,
225
+ 1024
226
+ ]
227
+ },
228
+
229
+ "vocoder_config": {
230
+ "channels": [
231
+ 256,
232
+ 256,
233
+ 256,
234
+ 256,
235
+ 256
236
+ ],
237
+ "hop_length": 256,
238
+ "num_mel_bins": 80,
239
+ "sampling_rate": 16000
240
+ },
241
+
242
+
243
+ "flow_matching_config": {
244
+ "in_channels": 80,
245
+ "spk_emb_dim": 0,
246
+ "diffusion_steps": 10,
247
+ "cal_mel_mae": true,
248
+
249
+ "prenet_activation_function": "gelu",
250
+ "prenet_attention_heads": 8,
251
+ "prenet_d_model": 512,
252
+ "prenet_ffn_dim": 2048,
253
+ "prenet_in_dim": 1280,
254
+ "prenet_max_source_positions": 5000,
255
+ "prenet_nlayers": 12,
256
+ "prenet_out_dim": 80,
257
+ "prenet_target_mel_length_scale_ratio": 1.0,
258
+
259
+ "channels": [
260
+ 256
261
+ ],
262
+ "dropout": 0.0,
263
+ "attention_head_dim": 64,
264
+ "n_blocks": 4,
265
+ "num_heads": 8,
266
+ "num_mid_blocks": 12,
267
+ "act_fn": "gelu",
268
+
269
+ "cfm_params": {
270
+ "inference_cfg_rate": 0.7,
271
+ "sigma_min": 1e-06,
272
+ "solver": "euler",
273
+ "t_scheduler": "cosine",
274
+ "training_cfg_rate": 0.2
275
+ },
276
+
277
+ "use_hidden_states_before_dconv2": true
278
+ },
279
+
280
+ "cosy24kvocoder_config": {
281
+ "weight_path": "WEIGHT_PATH_TO_LONGCAT_NEXT/cosy24k_vocoder/hift.pt"
282
+ }
283
+
284
+ }
285
+ }
configuration_longcat_next.py ADDED
@@ -0,0 +1,152 @@
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
3
+ from transformers.models.whisper.configuration_whisper import WhisperConfig
4
+
5
+ from .configuration_longcat_ngram import LongcatFlashNgramConfig
6
+
7
+ class LongcatNextConfig(LongcatFlashNgramConfig):
8
+ def __init__(
9
+ self,
10
+ vocab_size=131072,
11
+ hidden_size=6144,
12
+ num_hidden_layers=56,
13
+ num_layers=28,
14
+ num_attention_heads=64,
15
+ num_key_value_heads=None,
16
+ hidden_act="silu",
17
+ max_position_embeddings=131072,
18
+ initializer_range=0.02,
19
+ rms_norm_eps=1e-5,
20
+ use_cache=True,
21
+ pad_token_id=None,
22
+ bos_token_id=1,
23
+ eos_token_id=2,
24
+ tie_word_embeddings=False,
25
+ rope_theta=10000000.0,
26
+ rope_scaling=None,
27
+ attention_bias=False,
28
+ attention_dropout=0.0,
29
+ ffn_hidden_size=12288,
30
+ q_lora_rank=1536,
31
+ kv_lora_rank=512,
32
+ qk_nope_head_dim=128,
33
+ qk_rope_head_dim=64,
34
+ head_dim=64,
35
+ v_head_dim=128,
36
+ qk_head_dim=None,
37
+ moe_topk=12,
38
+ n_routed_experts=512,
39
+ zero_expert_num=256,
40
+ expert_ffn_hidden_size=2048,
41
+ routed_scaling_factor=6.0,
42
+ emb_neighbor_num=None,
43
+ emb_split_num=None,
44
+ ngram_vocab_size_ratio=None,
45
+ oe_ignored_token_ids=[],
46
+ text_vocab_size=131072, # text vocab size (vocab_size = text_vocab_size + audio_token + visual_token + multimodal_special_token_list)
47
+ text_vocab_plus_multimodal_special_token_size=131125,
48
+ visual_embedding_layer_intermediate_size=8192,
49
+ visual_embedding_layer_hidden_act="silu",
50
+ visual_offset=150581,
51
+ audio_offset=131125,
52
+ visual_config={},
53
+ audio_config={},
54
+ **kwargs,
55
+ ):
56
+ self.text_vocab_size = text_vocab_size
57
+ self.text_vocab_plus_multimodal_special_token_size = text_vocab_plus_multimodal_special_token_size
58
+ self.visual_embedding_layer_intermediate_size = visual_embedding_layer_intermediate_size
59
+ self.visual_embedding_layer_hidden_act = visual_embedding_layer_hidden_act
60
+ self.visual_offset = visual_offset
61
+ self.audio_offset = audio_offset
62
+ self.visual_config = LongcatNextVisualConfig(**visual_config)
63
+ self.audio_config = LongcatNextAudioConfig(**audio_config)
64
+ oe_ignored_token_ids = oe_ignored_token_ids or list(range(self.text_vocab_size, self.text_vocab_plus_multimodal_special_token_size))
65
+
66
+ super().__init__(
67
+ vocab_size=vocab_size,
68
+ hidden_size=hidden_size,
69
+ num_hidden_layers=num_hidden_layers,
70
+ num_layers=num_layers,
71
+ num_attention_heads=num_attention_heads,
72
+ num_key_value_heads=num_key_value_heads,
73
+ hidden_act=hidden_act,
74
+ max_position_embeddings=max_position_embeddings,
75
+ initializer_range=initializer_range,
76
+ rms_norm_eps=rms_norm_eps,
77
+ use_cache=use_cache,
78
+ pad_token_id=pad_token_id,
79
+ bos_token_id=bos_token_id,
80
+ eos_token_id=eos_token_id,
81
+ tie_word_embeddings=tie_word_embeddings,
82
+ rope_theta=rope_theta,
83
+ rope_scaling=rope_scaling,
84
+ attention_bias=attention_bias,
85
+ attention_dropout=attention_dropout,
86
+ ffn_hidden_size=ffn_hidden_size,
87
+ q_lora_rank=q_lora_rank,
88
+ kv_lora_rank=kv_lora_rank,
89
+ qk_nope_head_dim=qk_nope_head_dim,
90
+ qk_rope_head_dim=qk_rope_head_dim,
91
+ head_dim=head_dim,
92
+ v_head_dim=v_head_dim,
93
+ qk_head_dim=qk_head_dim,
94
+ moe_topk=moe_topk,
95
+ n_routed_experts=n_routed_experts,
96
+ zero_expert_num=zero_expert_num,
97
+ expert_ffn_hidden_size=expert_ffn_hidden_size,
98
+ routed_scaling_factor=routed_scaling_factor,
99
+ emb_neighbor_num=emb_neighbor_num,
100
+ emb_split_num=emb_split_num,
101
+ ngram_vocab_size_ratio=ngram_vocab_size_ratio,
102
+ oe_ignored_token_ids=oe_ignored_token_ids,
103
+ **kwargs,
104
+ )
105
+
106
+ class LongcatNextVisualConfig(Qwen2_5_VLVisionConfig):
107
+ model_type = "longcat_next_visual"
108
+ base_config_key = ""
109
+
110
+ def __init__(
111
+ self,
112
+ image_start_token_id=131106,
113
+ image_end_token_id=131107,
114
+ image_pad_token_id=131108,
115
+ image_newline_token_id=131109,
116
+ vq_config={},
117
+ visual_decoder_config={},
118
+ **kwargs,
119
+ ):
120
+ self.image_start_token_id = image_start_token_id
121
+ self.image_end_token_id = image_end_token_id
122
+ self.image_pad_token_id = image_pad_token_id
123
+ self.image_newline_token_id = image_newline_token_id
124
+ self.vq_config = PretrainedConfig(**vq_config)
125
+ self.visual_decoder_config = PretrainedConfig(**visual_decoder_config)
126
+ self.visual_decoder_config.image_decoder_config = PretrainedConfig(**getattr(self.visual_decoder_config, "image_decoder_config", {}))
127
+ self.visual_decoder_config.transformer_config = PretrainedConfig(**getattr(self.visual_decoder_config, "transformer_config", {}))
128
+ self.visual_decoder_config.vae_config = PretrainedConfig(**getattr(self.visual_decoder_config, "vae_config", {}))
129
+ self.visual_decoder_config.scheduler_config = PretrainedConfig(**getattr(self.visual_decoder_config, "scheduler_config", {}))
130
+ super().__init__(**kwargs)
131
+
132
+ class LongcatNextAudioConfig(WhisperConfig):
133
+ model_type = "longcat_next_audio"
134
+ base_config_key = ""
135
+
136
+ def __init__(
137
+ self,
138
+ vq_config={},
139
+ vocoder_config={},
140
+ flow_matching_config={},
141
+ cosy24kvocoder_config={},
142
+ **kwargs
143
+ ):
144
+ self.vq_config = PretrainedConfig(**vq_config)
145
+ self.vocoder_config = PretrainedConfig(**vocoder_config)
146
+ self.flow_matching_config = PretrainedConfig(**flow_matching_config)
147
+ self.flow_matching_config.cfm_params = PretrainedConfig(**getattr(self.flow_matching_config, "cfm_params", {}))
148
+ self.cosy24kvocoder_config = PretrainedConfig(**cosy24kvocoder_config)
149
+ super().__init__(**kwargs)
150
+
151
+
152
+ __all__ = ["LongcatNextConfig", "LongcatNextVisualConfig", "LongcatNextAudioConfig"]
configuration_longcat_ngram.py ADDED
@@ -0,0 +1,218 @@
1
+ from transformers.models.longcat_flash import LongcatFlashConfig
2
+
3
+
4
+ class LongcatFlashNgramConfig(LongcatFlashConfig):
5
+ r"""
6
+ This is the configuration class to store the configuration of a [`LongcatFlashNgramModel`]. It is used to instantiate
7
+ a LongCat Flash model with N-gram enhanced embeddings according to the specified arguments, defining the model architecture.
8
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
9
+ documentation from [`PretrainedConfig`] for more information.
10
+
11
+
12
+ Args:
13
+ vocab_size (`int`, *optional*, defaults to 131072):
14
+ Vocabulary size of the LongCat Flash model. Defines the number of different tokens that can be represented by the
15
+ `input_ids` passed when calling [`LongcatFlashNgramModel`]
16
+ hidden_size (`int`, *optional*, defaults to 6144):
17
+ Dimension of the hidden representations.
18
+ num_hidden_layers (`int`, *optional*, defaults to 56):
19
+ Number of hidden layers in the Transformer decoder.
20
+ num_layers (`int`, *optional*, defaults to 28):
21
+ Number of layers, each with 2 sublayers.
22
+ num_attention_heads (`int`, *optional*, defaults to 64):
23
+ Number of attention heads for each attention layer in the Transformer decoder.
24
+ num_key_value_heads (`int`, *optional*):
25
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
26
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
27
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
28
+ converting from a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
29
+ constructed by meanpooling all the original heads within that group. For more details checkout [this
30
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
31
+ `num_attention_heads`.
32
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
33
+ The non-linear activation function (function or string) in the decoder.
34
+ max_position_embeddings (`int`, *optional*, defaults to 131072):
35
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
36
+ just in case (e.g., 512 or 1024 or 2048).
37
+ initializer_range (`float`, *optional*, defaults to 0.02):
38
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
39
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
40
+ The epsilon value used by the RMS normalization layers.
41
+ use_cache (`bool`, *optional*, defaults to `True`):
42
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
43
+ relevant if `config.is_decoder=True`.
44
+ pad_token_id (`int`, *optional*):
45
+ Padding token id.
46
+ bos_token_id (`int`, *optional*, defaults to 1):
47
+ Beginning of stream token id.
48
+ eos_token_id (`int`, *optional*, defaults to 2):
49
+ End of stream token id.
50
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
51
+ Whether to tie input and output embeddings.
52
+ rope_theta (`float`, *optional*, defaults to 10000000.0):
53
+ The base period of the RoPE embeddings.
54
+ rope_scaling (`Dict`, *optional*):
55
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
56
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
57
+ `{"type": strategy name, "factor": scaling factor}`.
58
+ attention_bias (`bool`, *optional*, defaults to `False`):
59
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
60
+ attention_dropout (`float`, *optional*, defaults to 0.0):
61
+ The dropout ratio for the attention probabilities.
62
+ ffn_hidden_size (`int`, *optional*, defaults to 12288):
63
+ Dimension of the MLP representations.
64
+ q_lora_rank (`int`, *optional*, defaults to 1536):
65
+ The rank of the query LoRA projection in MLA (Multi-head Latent Attention).
66
+ kv_lora_rank (`int`, *optional*, defaults to 512):
67
+ The rank of the key-value LoRA projection in MLA.
68
+ qk_nope_head_dim (`int`, *optional*, defaults to 128):
69
+ The dimension of the non-position encoding part of query/key heads.
70
+ qk_rope_head_dim (`int`, *optional*, defaults to 64):
71
+ The dimension of the RoPE part of query/key heads.
72
+ head_dim (`int`, *optional*, defaults to 64):
73
+ Standard dimension of qk heads, unused except for CI.
74
+ v_head_dim (`int`, *optional*, defaults to 128):
75
+ The dimension of value heads.
76
+ qk_head_dim (`int`, *optional*):
77
+ The total dimension of query/key heads. If not specified, set to `qk_nope_head_dim + qk_rope_head_dim`.
78
+ moe_topk (`int`, *optional*, defaults to 12):
79
+ Number of experts to route to for each token in the MoE layer.
80
+ n_routed_experts (`int`, *optional*, defaults to 512):
81
+ Number of routed experts in the MoE layer.
82
+ zero_expert_num (`int`, *optional*, defaults to 256):
83
+ Number of zero experts (identity function) to add to the expert pool.
84
+ expert_ffn_hidden_size (`int`, *optional*, defaults to 2048):
85
+ Hidden size of individual expert FFN layers.
86
+ routed_scaling_factor (`float`, *optional*, defaults to 6.0):
87
+ Scaling factor applied to the routing weights.
88
+ emb_neighbor_num (`int`, *optional*):
89
+ Maximum N-gram length for N-gram embeddings. This parameter determines the context window size for N-gram computation. Higher values capture
90
+ longer-range lexical patterns but increase memory usage.
91
+ emb_split_num (`int`, *optional*):
92
+ Number of hash functions (or splits) to use for N-gram embeddings. Multiple hash functions help improve the quality of N-gram representations.
93
+ ngram_vocab_size_ratio (`float`, *optional*):
94
+ Ratio multiplier for N-gram vocabulary size relative to the base vocabulary size. The N-gram vocabulary
95
+ size is calculated as `vocab_size * ngram_vocab_size_ratio`.
96
+
97
+ Example:
98
+ ```python
99
+ >>> from transformers import LongcatFlashNgramModel, LongcatFlashNgramConfig
100
+
101
+ >>> # Initializing a LongCat Flash N-gram style configuration
102
+ >>> configuration = LongcatFlashNgramConfig(
103
+ ... emb_neighbor_num=3,
104
+ ... emb_split_num=4,
105
+ ... ngram_vocab_size_ratio=1.5
106
+ ... )
107
+
108
+ >>> # Initializing a model from the configuration
109
+ >>> model = LongcatFlashNgramModel(configuration)
110
+
111
+ >>> # Accessing the model configuration
112
+ >>> configuration = model.config
113
+ ```"""
114
+
115
+ model_type = "longcat_flash_ngram"
116
+ keys_to_ignore_at_inference = ["past_key_values"]
117
+ base_model_tp_plan = {
118
+ "layers.*.self_attn.*.q_b_proj": "colwise",
119
+ "layers.*.self_attn.*.kv_b_proj": "colwise",
120
+ "layers.*.self_attn.*.o_proj": "rowwise",
121
+ "layers.*.mlps.*.gate_proj": "colwise",
122
+ "layers.*.mlps.*.up_proj": "colwise",
123
+ "layers.*.mlps.*.down_proj": "rowwise",
124
+ "layers.*.mlp.experts.*.gate_proj": "colwise",
125
+ "layers.*.mlp.experts.*.up_proj": "colwise",
126
+ "layers.*.mlp.experts.*.down_proj": "rowwise",
127
+ }
128
+
129
+ base_model_pp_plan = {
130
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
131
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
132
+ "norm": (["hidden_states"], ["hidden_states"]),
133
+ }
134
+
135
+ def __init__(
136
+ self,
137
+ vocab_size=131072,
138
+ hidden_size=6144,
139
+ num_hidden_layers=56,
140
+ num_layers=28,
141
+ num_attention_heads=64,
142
+ num_key_value_heads=None,
143
+ hidden_act="silu",
144
+ max_position_embeddings=131072,
145
+ initializer_range=0.02,
146
+ rms_norm_eps=1e-5,
147
+ use_cache=True,
148
+ pad_token_id=None,
149
+ bos_token_id=1,
150
+ eos_token_id=2,
151
+ tie_word_embeddings=False,
152
+ rope_theta=10000000.0,
153
+ rope_scaling=None,
154
+ attention_bias=False,
155
+ attention_dropout=0.0,
156
+ ffn_hidden_size=12288,
157
+ q_lora_rank=1536,
158
+ kv_lora_rank=512,
159
+ qk_nope_head_dim=128,
160
+ qk_rope_head_dim=64,
161
+ head_dim=64,
162
+ v_head_dim=128,
163
+ qk_head_dim=None,
164
+ moe_topk=12,
165
+ n_routed_experts=512,
166
+ zero_expert_num=256,
167
+ expert_ffn_hidden_size=2048,
168
+ routed_scaling_factor=6.0,
169
+ emb_neighbor_num=None,
170
+ emb_split_num=None,
171
+ ngram_vocab_size_ratio=None,
172
+ oe_ignored_token_ids=[],
173
+ **kwargs,
174
+ ):
175
+ # N-gram embedding specific parameters
176
+ self.emb_neighbor_num = emb_neighbor_num
177
+ self.emb_split_num = emb_split_num
178
+ self.ngram_vocab_size_ratio = ngram_vocab_size_ratio
179
+ self.oe_ignored_token_ids = oe_ignored_token_ids
180
+
181
+ super().__init__(
182
+ vocab_size=vocab_size,
183
+ hidden_size=hidden_size,
184
+ num_hidden_layers=num_hidden_layers,
185
+ num_layers=num_layers,
186
+ num_attention_heads=num_attention_heads,
187
+ num_key_value_heads=num_key_value_heads,
188
+ hidden_act=hidden_act,
189
+ max_position_embeddings=max_position_embeddings,
190
+ initializer_range=initializer_range,
191
+ rms_norm_eps=rms_norm_eps,
192
+ use_cache=use_cache,
193
+ pad_token_id=pad_token_id,
194
+ bos_token_id=bos_token_id,
195
+ eos_token_id=eos_token_id,
196
+ tie_word_embeddings=tie_word_embeddings,
197
+ rope_theta=rope_theta,
198
+ rope_scaling=rope_scaling,
199
+ attention_bias=attention_bias,
200
+ attention_dropout=attention_dropout,
201
+ ffn_hidden_size=ffn_hidden_size,
202
+ q_lora_rank=q_lora_rank,
203
+ kv_lora_rank=kv_lora_rank,
204
+ qk_nope_head_dim=qk_nope_head_dim,
205
+ qk_rope_head_dim=qk_rope_head_dim,
206
+ head_dim=head_dim,
207
+ v_head_dim=v_head_dim,
208
+ qk_head_dim=qk_head_dim,
209
+ moe_topk=moe_topk,
210
+ n_routed_experts=n_routed_experts,
211
+ zero_expert_num=zero_expert_num,
212
+ expert_ffn_hidden_size=expert_ffn_hidden_size,
213
+ routed_scaling_factor=routed_scaling_factor,
214
+ **kwargs,
215
+ )
216
+
217
+
218
+ __all__ = ["LongcatFlashNgramConfig"]
cosy24k_vocoder.py ADDED
@@ -0,0 +1,552 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """HIFI-GAN"""
16
+
17
+ from typing import Dict, Optional, List
18
+ import numpy as np
19
+ from scipy.signal import get_window
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from torch.nn import Conv1d
24
+ from torch.nn import ConvTranspose1d
25
+ from torch.nn.utils import remove_weight_norm
26
+ from torch.nn.utils import weight_norm
27
+ from torch.distributions.uniform import Uniform
28
+ from torch.nn import Parameter
29
+ from torch import nn, sin, pow
30
+
31
+
32
+ class Snake(nn.Module):
33
+ '''
34
+ Implementation of a sine-based periodic activation function
35
+ Shape:
36
+ - Input: (B, C, T)
37
+ - Output: (B, C, T), same shape as the input
38
+ Parameters:
39
+ - alpha - trainable parameter
40
+ References:
41
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
42
+ https://arxiv.org/abs/2006.08195
43
+ Examples:
44
+ >>> a1 = snake(256)
45
+ >>> x = torch.randn(256)
46
+ >>> x = a1(x)
47
+ '''
48
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
49
+ '''
50
+ Initialization.
51
+ INPUT:
52
+ - in_features: shape of the input
53
+ - alpha: trainable parameter
54
+ alpha is initialized to 1 by default, higher values = higher-frequency.
55
+ alpha will be trained along with the rest of your model.
56
+ '''
57
+ super(Snake, self).__init__()
58
+ self.in_features = in_features
59
+
60
+ # initialize alpha
61
+ self.alpha_logscale = alpha_logscale
62
+ if self.alpha_logscale: # log scale alphas initialized to zeros
63
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
64
+ else: # linear scale alphas initialized to ones
65
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
66
+
67
+ self.alpha.requires_grad = alpha_trainable
68
+
69
+ self.no_div_by_zero = 0.000000001
70
+
71
+ def forward(self, x):
72
+ '''
73
+ Forward pass of the function.
74
+ Applies the function to the input elementwise.
75
+ Snake ∶= x + 1/a * sin^2 (xa)
76
+ '''
77
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
78
+ if self.alpha_logscale:
79
+ alpha = torch.exp(alpha)
80
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
81
+
82
+ return x
83
+
84
+ def get_padding(kernel_size, dilation=1):
85
+ return int((kernel_size * dilation - dilation) / 2)
86
+
87
+ def init_weights(m, mean=0.0, std=0.01):
88
+ classname = m.__class__.__name__
89
+ if classname.find("Conv") != -1:
90
+ m.weight.data.normal_(mean, std)
91
+
92
+ """hifigan based generator implementation.
93
+
94
+ This code is modified from https://github.com/jik876/hifi-gan
95
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
96
+ https://github.com/NVIDIA/BigVGAN
97
+
98
+ """
99
+
100
+
101
+ class ResBlock(torch.nn.Module):
102
+ """Residual block module in HiFiGAN/BigVGAN."""
103
+ def __init__(
104
+ self,
105
+ channels: int = 512,
106
+ kernel_size: int = 3,
107
+ dilations: List[int] = [1, 3, 5],
108
+ ):
109
+ super(ResBlock, self).__init__()
110
+ self.convs1 = nn.ModuleList()
111
+ self.convs2 = nn.ModuleList()
112
+
113
+ for dilation in dilations:
114
+ self.convs1.append(
115
+ weight_norm(
116
+ Conv1d(
117
+ channels,
118
+ channels,
119
+ kernel_size,
120
+ 1,
121
+ dilation=dilation,
122
+ padding=get_padding(kernel_size, dilation)
123
+ )
124
+ )
125
+ )
126
+ self.convs2.append(
127
+ weight_norm(
128
+ Conv1d(
129
+ channels,
130
+ channels,
131
+ kernel_size,
132
+ 1,
133
+ dilation=1,
134
+ padding=get_padding(kernel_size, 1)
135
+ )
136
+ )
137
+ )
138
+ self.convs1.apply(init_weights)
139
+ self.convs2.apply(init_weights)
140
+ self.activations1 = nn.ModuleList([
141
+ Snake(channels, alpha_logscale=False)
142
+ for _ in range(len(self.convs1))
143
+ ])
144
+ self.activations2 = nn.ModuleList([
145
+ Snake(channels, alpha_logscale=False)
146
+ for _ in range(len(self.convs2))
147
+ ])
148
+
149
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
150
+ for idx in range(len(self.convs1)):
151
+ xt = self.activations1[idx](x)
152
+ xt = self.convs1[idx](xt)
153
+ xt = self.activations2[idx](xt)
154
+ xt = self.convs2[idx](xt)
155
+ x = xt + x
156
+ return x
157
+
158
+ def remove_weight_norm(self):
159
+ for idx in range(len(self.convs1)):
160
+ remove_weight_norm(self.convs1[idx])
161
+ remove_weight_norm(self.convs2[idx])
162
+
163
+
164
+ class SineGen(torch.nn.Module):
165
+ """ Definition of sine generator
166
+ SineGen(samp_rate, harmonic_num = 0,
167
+ sine_amp = 0.1, noise_std = 0.003,
168
+ voiced_threshold = 0,
169
+ flag_for_pulse=False)
170
+ samp_rate: sampling rate in Hz
171
+ harmonic_num: number of harmonic overtones (default 0)
172
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
173
+ noise_std: std of Gaussian noise (default 0.003)
174
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
175
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
176
+ Note: when flag_for_pulse is True, the first time step of a voiced
177
+ segment is always sin(np.pi) or cos(0)
178
+ """
179
+
180
+ def __init__(self, samp_rate, harmonic_num=0,
181
+ sine_amp=0.1, noise_std=0.003,
182
+ voiced_threshold=0):
183
+ super(SineGen, self).__init__()
184
+ self.sine_amp = sine_amp
185
+ self.noise_std = noise_std
186
+ self.harmonic_num = harmonic_num
187
+ self.sampling_rate = samp_rate
188
+ self.voiced_threshold = voiced_threshold
189
+
190
+ def _f02uv(self, f0):
191
+ # generate uv signal
192
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
193
+ return uv
194
+
195
+ @torch.no_grad()
196
+ def forward(self, f0):
197
+ """
198
+ :param f0: [B, 1, sample_len], Hz
199
+ :return: [B, 1, sample_len]
200
+ """
201
+
202
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
203
+ for i in range(self.harmonic_num + 1):
204
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
205
+
206
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
207
+ u_dist = Uniform(low=-np.pi, high=np.pi)
208
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
209
+ phase_vec[:, 0, :] = 0
210
+
211
+ # generate sine waveforms
212
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
213
+
214
+ # generate uv signal
215
+ uv = self._f02uv(f0)
216
+
217
+ # noise: for unvoiced should be similar to sine_amp
218
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
219
+ # . for voiced regions is self.noise_std
220
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
221
+ noise = noise_amp * torch.randn_like(sine_waves)
222
+
223
+ # first: set the unvoiced part to 0 by uv
224
+ # then: additive noise
225
+ sine_waves = sine_waves * uv + noise
226
+ return sine_waves, uv, noise
227
+
228
+
229
+ class SourceModuleHnNSF(torch.nn.Module):
230
+ """ SourceModule for hn-nsf
231
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
232
+ add_noise_std=0.003, voiced_threshod=0)
233
+ sampling_rate: sampling_rate in Hz
234
+ harmonic_num: number of harmonic above F0 (default: 0)
235
+ sine_amp: amplitude of sine source signal (default: 0.1)
236
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
237
+ note that amplitude of noise in unvoiced is decided
238
+ by sine_amp
239
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
240
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
241
+ F0_sampled (batchsize, length, 1)
242
+ Sine_source (batchsize, length, 1)
243
+ noise_source (batchsize, length 1)
244
+ uv (batchsize, length, 1)
245
+ """
246
+
247
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
248
+ add_noise_std=0.003, voiced_threshod=0):
249
+ super(SourceModuleHnNSF, self).__init__()
250
+
251
+ self.sine_amp = sine_amp
252
+ self.noise_std = add_noise_std
253
+
254
+ # to produce sine waveforms
255
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
256
+ sine_amp, add_noise_std, voiced_threshod)
257
+
258
+ # to merge source harmonics into a single excitation
259
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
260
+ self.l_tanh = torch.nn.Tanh()
261
+
262
+ def forward(self, x):
263
+ """
264
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
265
+ F0_sampled (batchsize, length, 1)
266
+ Sine_source (batchsize, length, 1)
267
+ noise_source (batchsize, length, 1)
268
+ """
269
+ # source for harmonic branch
270
+ with torch.no_grad():
271
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
272
+ sine_wavs = sine_wavs.transpose(1, 2)
273
+ uv = uv.transpose(1, 2)
274
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
275
+
276
+ # source for noise branch, in the same shape as uv
277
+ noise = torch.randn_like(uv) * self.sine_amp / 3
278
+ return sine_merge, noise, uv
279
+
280
+
281
+ class HiFTGenerator(nn.Module):
282
+ """
283
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
284
+ https://arxiv.org/abs/2309.09493
285
+ """
286
+ def __init__(
287
+ self,
288
+ in_channels: int = 80,
289
+ base_channels: int = 512,
290
+ nb_harmonics: int = 8,
291
+ sampling_rate: int = 22050,
292
+ nsf_alpha: float = 0.1,
293
+ nsf_sigma: float = 0.003,
294
+ nsf_voiced_threshold: float = 10,
295
+ upsample_rates: List[int] = [8, 8],
296
+ upsample_kernel_sizes: List[int] = [16, 16],
297
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
298
+ resblock_kernel_sizes: List[int] = [3, 7, 11],
299
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
300
+ source_resblock_kernel_sizes: List[int] = [7, 11],
301
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
302
+ lrelu_slope: float = 0.1,
303
+ audio_limit: float = 0.99,
304
+ f0_predictor: torch.nn.Module = None,
305
+ ):
306
+ super(HiFTGenerator, self).__init__()
307
+
308
+ self.out_channels = 1
309
+ self.nb_harmonics = nb_harmonics
310
+ self.sampling_rate = sampling_rate
311
+ self.istft_params = istft_params
312
+ self.lrelu_slope = lrelu_slope
313
+ self.audio_limit = audio_limit
314
+
315
+ self.num_kernels = len(resblock_kernel_sizes)
316
+ self.num_upsamples = len(upsample_rates)
317
+ self.m_source = SourceModuleHnNSF(
318
+ sampling_rate=sampling_rate,
319
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
320
+ harmonic_num=nb_harmonics,
321
+ sine_amp=nsf_alpha,
322
+ add_noise_std=nsf_sigma,
323
+ voiced_threshod=nsf_voiced_threshold)
324
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
325
+
326
+ self.conv_pre = weight_norm(
327
+ Conv1d(in_channels, base_channels, 7, 1, padding=3)
328
+ )
329
+
330
+ # Up
331
+ self.ups = nn.ModuleList()
332
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
333
+ self.ups.append(
334
+ weight_norm(
335
+ ConvTranspose1d(
336
+ base_channels // (2**i),
337
+ base_channels // (2**(i + 1)),
338
+ k,
339
+ u,
340
+ padding=(k - u) // 2,
341
+ )
342
+ )
343
+ )
344
+
345
+ # Down
346
+ self.source_downs = nn.ModuleList()
347
+ self.source_resblocks = nn.ModuleList()
348
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
349
+ downsample_cum_rates = np.cumprod(downsample_rates)
350
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
351
+ if u == 1:
352
+ self.source_downs.append(
353
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
354
+ )
355
+ else:
356
+ self.source_downs.append(
357
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
358
+ )
359
+
360
+ self.source_resblocks.append(
361
+ ResBlock(base_channels // (2 ** (i + 1)), k, d)
362
+ )
363
+
364
+ self.resblocks = nn.ModuleList()
365
+ for i in range(len(self.ups)):
366
+ ch = base_channels // (2**(i + 1))
367
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
368
+ self.resblocks.append(ResBlock(ch, k, d))
369
+
370
+ self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
371
+ self.ups.apply(init_weights)
372
+ self.conv_post.apply(init_weights)
373
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
374
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
375
+ self.f0_predictor = f0_predictor
376
+
377
+ def remove_weight_norm(self):
378
+ print('Removing weight norm...')
379
+ for l in self.ups:
380
+ remove_weight_norm(l)
381
+ for l in self.resblocks:
382
+ l.remove_weight_norm()
383
+ remove_weight_norm(self.conv_pre)
384
+ remove_weight_norm(self.conv_post)
385
+ self.m_source.remove_weight_norm()
386
+ for l in self.source_downs:
387
+ remove_weight_norm(l)
388
+ for l in self.source_resblocks:
389
+ l.remove_weight_norm()
390
+
391
+ def _stft(self, x):
392
+ spec = torch.stft(
393
+ x,
394
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
395
+ return_complex=True)
396
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
397
+ return spec[..., 0], spec[..., 1]
398
+
399
+ def _istft(self, magnitude, phase):
400
+ magnitude = torch.clip(magnitude, max=1e2)
401
+ real = magnitude * torch.cos(phase)
402
+ img = magnitude * torch.sin(phase)
403
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
404
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
405
+ return inverse_transform
406
+
407
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
408
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
409
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
410
+
411
+ x = self.conv_pre(x)
412
+ for i in range(self.num_upsamples):
413
+ x = F.leaky_relu(x, self.lrelu_slope)
414
+ x = self.ups[i](x)
415
+
416
+ if i == self.num_upsamples - 1:
417
+ x = self.reflection_pad(x)
418
+
419
+ # fusion
420
+ si = self.source_downs[i](s_stft)
421
+ si = self.source_resblocks[i](si)
422
+ x = x + si
423
+
424
+ xs = None
425
+ for j in range(self.num_kernels):
426
+ if xs is None:
427
+ xs = self.resblocks[i * self.num_kernels + j](x)
428
+ else:
429
+ xs += self.resblocks[i * self.num_kernels + j](x)
430
+ x = xs / self.num_kernels
431
+
432
+ x = F.leaky_relu(x)
433
+ x = self.conv_post(x)
434
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
435
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # note: applying sin here is redundant
436
+
437
+ x = self._istft(magnitude, phase)
438
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
439
+ return x
440
+
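Worked numbers for the ISTFT head in decode above, taken from the defaults in this file: with istft_params = {"n_fft": 16, "hop_len": 4}, conv_post emits n_fft + 2 = 18 channels per frame, which are split into n_fft // 2 + 1 = 9 log-magnitude bins (exponentiated) and 9 phase bins; torch.istft then overlap-adds the frames with a 4-sample hop, so the waveform length is roughly 4x the number of spectrogram frames.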
441
+ def forward(
442
+ self,
443
+ batch: dict,
444
+ # device: torch.device,
445
+ ) -> Dict[str, Optional[torch.Tensor]]:
446
+ speech_feat = batch['speech_feat'].transpose(1, 2) # .to(device)
447
+ # mel->f0
448
+ f0 = self.f0_predictor(speech_feat)
449
+ # f0->source
450
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
451
+ s, _, _ = self.m_source(s)
452
+ s = s.transpose(1, 2)
453
+ # mel+source->speech
454
+ generated_speech = self.decode(x=speech_feat, s=s)
455
+ return generated_speech, f0
456
+
457
+ @torch.inference_mode()
458
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
459
+ # mel->f0
460
+ f0 = self.f0_predictor(speech_feat)
461
+ # f0->source
462
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
463
+ s, _, _ = self.m_source(s)
464
+ s = s.transpose(1, 2)
465
+ # use cache_source to avoid glitch
466
+ if cache_source.shape[2] != 0:
467
+ s[:, :, :cache_source.shape[2]] = cache_source
468
+ generated_speech = self.decode(x=speech_feat, s=s)
469
+ return generated_speech, s
470
+
471
+
472
+ class ConvRNNF0Predictor(nn.Module):
473
+ def __init__(self,
474
+ num_class: int = 1,
475
+ in_channels: int = 80,
476
+ cond_channels: int = 512
477
+ ):
478
+ super().__init__()
479
+
480
+ self.num_class = num_class
481
+ self.condnet = nn.Sequential(
482
+ weight_norm(
483
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
484
+ ),
485
+ nn.ELU(),
486
+ weight_norm(
487
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
488
+ ),
489
+ nn.ELU(),
490
+ weight_norm(
491
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
492
+ ),
493
+ nn.ELU(),
494
+ weight_norm(
495
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
496
+ ),
497
+ nn.ELU(),
498
+ weight_norm(
499
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
500
+ ),
501
+ nn.ELU(),
502
+ )
503
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
504
+
505
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
506
+ x = self.condnet(x)
507
+ x = x.transpose(1, 2)
508
+ return torch.abs(self.classifier(x).squeeze(-1))
509
+
510
+
511
+ class Cosy24kVocoder(nn.Module):
512
+ def __init__(self):
513
+ super().__init__()
514
+ self.hifigan_generator = HiFTGenerator(
515
+ in_channels=80,
516
+ base_channels=512,
517
+ nb_harmonics=8,
518
+ sampling_rate=24000,
519
+ nsf_alpha=0.1,
520
+ nsf_sigma=0.003,
521
+ nsf_voiced_threshold=10,
522
+ upsample_rates=[8, 5, 3],
523
+ upsample_kernel_sizes=[16, 11, 7],
524
+ resblock_kernel_sizes=[3, 7, 11],
525
+ resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
526
+ source_resblock_kernel_sizes=[7, 7, 11],
527
+ source_resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
528
+ lrelu_slope=0.1,
529
+ audio_limit=0.99,
530
+ f0_predictor=ConvRNNF0Predictor(
531
+ num_class=1,
532
+ in_channels=80,
533
+ cond_channels=512,
534
+ ),
535
+ )
536
+
537
+ def decode(self, mel, device="cuda"):
538
+ """
539
+ Args: mel: (batch_size, n_frames, n_mel)
540
+ """
541
+ generated_speech, f0 = self.hifigan_generator.forward(
542
+ {"speech_feat": mel.transpose(1, 2)}, # device=device
543
+ )
544
+ return generated_speech
545
+
546
+ @classmethod
547
+ def from_pretrained(cls, model_path: str):
548
+ """Load a pretrained model from a checkpoint."""
549
+ model = cls()
550
+ model.hifigan_generator.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')), strict=True)
551
+ model.eval()
552
+ return model
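A minimal shape-check sketch for the vocoder above (illustrative only: weights are random here, whereas real use loads the shipped checkpoint via Cosy24kVocoder.from_pretrained, and the mel below is just noise):

import torch

vocoder = Cosy24kVocoder().eval()
mel = torch.randn(1, 80, 200)              # (batch, n_mel, n_frames); the F0 predictor's Conv1d expects 80 channels
with torch.no_grad():
    wav, _ = vocoder.hifigan_generator.inference(speech_feat=mel)
print(wav.shape)                           # torch.Size([1, 96000]): 200 frames x 480x upsampling (8*5*3*4) at 24 kHz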
environment.yml ADDED
@@ -0,0 +1,8 @@
1
+ name: longcat_next
2
+
3
+ dependencies:
4
+ - python=3.10
5
+ - ffmpeg<7
6
+ - pip
7
+ - pip:
8
+ - soundfile==0.13.1
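For reference, the usual conda workflow applies to this file: `conda env create -f environment.yml` creates the environment (the pip entries such as soundfile are installed as part of creation), and `conda activate longcat_next` activates it.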
generation_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 3,
5
+
6
+ "max_new_tokens": 2048,
7
+ "do_sample": true,
8
+ "temperature": 0.4,
9
+ "top_k": 20,
10
+ "top_p": 0.85,
11
+ "repetition_penalty": 1.1,
12
+
13
+ "visual_generation_config": {
14
+ "do_sample": true,
15
+ "temperature": 0.5,
16
+ "top_p": 0.75,
17
+ "top_k": 1024,
18
+ "custom_params": {
19
+ "cfg_scale": 3.0,
20
+ "token_h": 37,
21
+ "token_w": 37,
22
+ "anyres_prefix": "<longcat_img_token_size>{h} {w}</longcat_img_token_size>"
23
+ }
24
+ },
25
+
26
+ "audio_generation_config": {
27
+ "audio_parallel_decoding": false,
28
+ "do_sample": true,
29
+ "temperature": 0.5,
30
+ "top_k": 5,
31
+ "top_p": 0.85,
32
+ "repetition_penalty": 1.3,
33
+ "custom_params": {
34
+ "sampling_rate": 24000,
35
+ "wave_concat_overlap": 1200
36
+ }
37
+ },
38
+
39
+ "transformers_version": "4.57.6"
40
+ }
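A minimal sketch of how a file like this is consumed (standard transformers usage; the directory path below is a placeholder for wherever this checkpoint is stored locally):

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("./")    # reads generation_config.json from that directory
print(gen_cfg.max_new_tokens, gen_cfg.temperature)  # 2048 0.4, matching the defaults above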
image_refiner.py ADDED
@@ -0,0 +1,748 @@
1
+ """Image refiner: refiner pipeline, refiner container, and utilities.
2
+
3
+ Contains:
4
+ - RefinerImageProcessor: Image pre/post-processing for the diffusion pipeline
5
+ - RefinerPipeline: DiffusionPipeline for image refinement
6
+ - ImageRefinerContainer: nn.Module container for refiner sub-modules
7
+ - IdentityWithArgs: Placeholder module for cond_proj
8
+ - de_transform / tensor2pil: Tensor-to-PIL conversion utilities
9
+ """
10
+
11
+ import inspect
12
+ import math
13
+ import warnings
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import numpy as np
18
+ from safetensors.torch import load_file
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from PIL import Image
23
+
24
+ from diffusers import DiffusionPipeline
25
+ from diffusers.configuration_utils import register_to_config
26
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor, is_valid_image_imagelist
27
+ from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
28
+ from .refiner_modules import FlowMatchEulerDiscreteScheduler
29
+
30
+ from .refiner_modules import Transformer2DModel, RotaryPosEmbed
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Helpers
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
+ def _clean_config_dict(cfg, cls=None) -> dict:
38
+ """Convert a PretrainedConfig to a clean dict for model construction.
39
+
40
+ If ``cls`` is provided, only keeps keys that match the cls.__init__ params
41
+ (allowlist approach). Otherwise falls back to blocklist filtering.
42
+ """
43
+ if hasattr(cfg, "to_dict"):
44
+ d = cfg.to_dict()
45
+ elif isinstance(cfg, dict):
46
+ d = dict(cfg)
47
+ else:
48
+ d = {k: v for k, v in vars(cfg).items()}
49
+
50
+ if cls is not None:
51
+ import inspect
52
+ sig = inspect.signature(cls.__init__)
53
+ valid_keys = set(sig.parameters.keys()) - {"self"}
54
+ if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()):
55
+ # Has **kwargs — can't filter by allowlist, fall through to blocklist
56
+ pass
57
+ else:
58
+ return {k: v for k, v in d.items() if k in valid_keys}
59
+
60
+ # Blocklist: remove HuggingFace PretrainedConfig metadata
61
+ _PRETRAINED_CONFIG_KEYS = {
62
+ "_name_or_path", "transformers_version", "model_type", "_commit_hash",
63
+ "_attn_implementation", "_attn_implementation_autoset", "return_dict",
64
+ "output_hidden_states", "output_attentions", "use_bfloat16",
65
+ "torchscript", "torch_dtype", "is_encoder_decoder", "is_decoder",
66
+ "add_cross_attention", "tie_encoder_decoder", "tie_word_embeddings",
67
+ "cross_attention_hidden_size", "chunk_size_feed_forward", "decoder_start_token_id",
68
+ "architectures", "finetuning_task", "id2label", "label2id", "prefix",
69
+ "problem_type", "tokenizer_class", "task_specific_params", "pruned_heads",
70
+ "bos_token_id", "eos_token_id", "pad_token_id", "sep_token_id",
71
+ "max_length", "min_length", "do_sample", "early_stopping",
72
+ "num_beams", "num_beam_groups", "diversity_penalty", "temperature",
73
+ "top_k", "top_p", "typical_p", "repetition_penalty", "length_penalty",
74
+ "no_repeat_ngram_size", "encoder_no_repeat_ngram_size", "bad_words_ids",
75
+ "num_return_sequences", "output_scores", "return_dict_in_generate",
76
+ "forced_bos_token_id", "forced_eos_token_id", "remove_invalid_values",
77
+ "exponential_decay_length_penalty", "suppress_tokens", "begin_suppress_tokens",
78
+ "tf_legacy_loss", "dtype",
79
+ }
80
+ return {k: v for k, v in d.items() if not k.startswith("_") and k not in _PRETRAINED_CONFIG_KEYS}
81
+
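A tiny, self-contained illustration of the allowlist path above (the class and keys are made up for the example):

class _Dummy:
    def __init__(self, in_channels: int = 4):
        self.in_channels = in_channels

print(_clean_config_dict({"in_channels": 8, "model_type": "vae"}, cls=_Dummy))
# -> {'in_channels': 8}; PretrainedConfig metadata such as model_type is filtered out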
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Image Refiner Container (nn.Module for state_dict loading)
85
+ # ---------------------------------------------------------------------------
86
+
87
+
88
+ class ImageRefinerContainer(nn.Module):
89
+ """Container for refiner components.
90
+
91
+ Holds base_transformer, vae, cond_proj as nn.Module children so their
92
+ parameters appear in the parent model's state_dict and are loaded
93
+ automatically via from_pretrained.
94
+ """
95
+
96
+ def __init__(self, visual_decoder_config):
97
+ super().__init__()
98
+
99
+ tc = visual_decoder_config.transformer_config
100
+ vc = visual_decoder_config.vae_config
101
+
102
+ self.base_transformer = Transformer2DModel(**_clean_config_dict(tc))
103
+
104
+ self.vae = AutoencoderKL(**_clean_config_dict(vc))
105
+ self.vae.requires_grad_(False)
106
+
107
+ text_feat_dim = getattr(tc, "text_feat_dim", 3584)
108
+ codebook_dim = getattr(visual_decoder_config, "codebook_dim", text_feat_dim)
109
+ if codebook_dim != text_feat_dim:
110
+ self.cond_proj = nn.Linear(codebook_dim, text_feat_dim)
111
+ else:
112
+ self.cond_proj = IdentityWithArgs()
113
+
114
+ @classmethod
115
+ def from_pretrained(cls, config, model_path: str):
116
+ model = cls(config)
117
+ weight_dict = load_file(model_path, device="cpu")
118
+ model.load_state_dict({k.removeprefix("image_refiner."): v for k, v in weight_dict.items() if k.startswith("image_refiner.")}, strict=True)
119
+ model.eval()
120
+ return model
121
+
122
+ @property
123
+ def device(self):
124
+ return next(self.parameters()).device
125
+
126
+ @property
127
+ def dtype(self):
128
+ return next(self.parameters()).dtype
129
+
130
+
131
+ class RefinerImageProcessor(VaeImageProcessor):
132
+ """Image processor for refiner - extends diffusers' VaeImageProcessor."""
133
+
134
+ @register_to_config
135
+ def __init__(
136
+ self,
137
+ do_resize: bool = True,
138
+ vae_scale_factor: int = 16,
139
+ resample: str = "lanczos",
140
+ max_pixels: Optional[int] = None,
141
+ max_side_length: Optional[int] = None,
142
+ do_normalize: bool = True,
143
+ do_binarize: bool = False,
144
+ do_convert_grayscale: bool = False,
145
+ ):
146
+ super().__init__(
147
+ do_resize=do_resize,
148
+ vae_scale_factor=vae_scale_factor,
149
+ resample=resample,
150
+ do_normalize=do_normalize,
151
+ do_binarize=do_binarize,
152
+ do_convert_grayscale=do_convert_grayscale,
153
+ )
154
+ self.max_pixels = max_pixels
155
+ self.max_side_length = max_side_length
156
+
157
+ def get_new_height_width(
158
+ self,
159
+ image: Union["PIL.Image.Image", np.ndarray, torch.Tensor],
160
+ height: Optional[int] = None,
161
+ width: Optional[int] = None,
162
+ max_pixels: Optional[int] = None,
163
+ max_side_length: Optional[int] = None,
164
+ ) -> Tuple[int, int]:
165
+ import PIL.Image
166
+
167
+ if height is None:
168
+ if isinstance(image, PIL.Image.Image):
169
+ height = image.height
170
+ elif isinstance(image, torch.Tensor):
171
+ height = image.shape[2]
172
+ else:
173
+ height = image.shape[1]
174
+
175
+ if width is None:
176
+ if isinstance(image, PIL.Image.Image):
177
+ width = image.width
178
+ elif isinstance(image, torch.Tensor):
179
+ width = image.shape[3]
180
+ else:
181
+ width = image.shape[2]
182
+
183
+ if max_side_length is None:
184
+ max_side_length = self.max_side_length
185
+ if max_pixels is None:
186
+ max_pixels = self.max_pixels
187
+
188
+ ratio = 1.0
189
+ if max_side_length is not None:
190
+ max_side_length_ratio = max_side_length / max(height, width)
191
+ else:
192
+ max_side_length_ratio = 1.0
193
+
194
+ cur_pixels = height * width
195
+ max_pixels_ratio = (max_pixels / cur_pixels) ** 0.5 if max_pixels is not None else 1.0
196
+ ratio = min(max_pixels_ratio, max_side_length_ratio, 1.0)
197
+
198
+ sf = self.config.vae_scale_factor
199
+ new_height = int(height * ratio) // sf * sf
200
+ new_width = int(width * ratio) // sf * sf
201
+ return new_height, new_width
202
+
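Worked numbers for the rule above (illustrative values; max_pixels is left unset): with vae_scale_factor = 16, max_side_length = 1000 and a 1200 x 2000 (H x W) input, ratio = min(1000 / 2000, 1.0) = 0.5, so new_height = int(1200 * 0.5) // 16 * 16 = 592 and new_width = int(2000 * 0.5) // 16 * 16 = 992; both sides are snapped down to multiples of the VAE scale factor.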
203
+ def preprocess(
204
+ self,
205
+ image: PipelineImageInput,
206
+ height: Optional[int] = None,
207
+ width: Optional[int] = None,
208
+ max_pixels: Optional[int] = None,
209
+ max_side_length: Optional[int] = None,
210
+ resize_mode: str = "default",
211
+ crops_coords: Optional[Tuple[int, int, int, int]] = None,
212
+ ) -> torch.Tensor:
213
+ import PIL.Image
214
+
215
+ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
216
+
217
+ if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3:
218
+ if isinstance(image, torch.Tensor):
219
+ image = image.unsqueeze(1)
220
+ else:
221
+ if image.shape[-1] == 1:
222
+ image = np.expand_dims(image, axis=0)
223
+ else:
224
+ image = np.expand_dims(image, axis=-1)
225
+
226
+ if isinstance(image, list) and isinstance(image[0], np.ndarray) and image[0].ndim == 4:
227
+ warnings.warn(
228
+ "Passing `image` as a list of 4d np.ndarray is deprecated. "
229
+ "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
230
+ FutureWarning,
231
+ )
232
+ image = np.concatenate(image, axis=0)
233
+ if isinstance(image, list) and isinstance(image[0], torch.Tensor) and image[0].ndim == 4:
234
+ warnings.warn(
235
+ "Passing `image` as a list of 4d torch.Tensor is deprecated. "
236
+ "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
237
+ FutureWarning,
238
+ )
239
+ image = torch.cat(image, axis=0)
240
+
241
+ if not is_valid_image_imagelist(image):
242
+ raise ValueError(
243
+ f"Input is in incorrect format. Currently, we only support "
244
+ f"{', '.join(str(x) for x in supported_formats)}"
245
+ )
246
+ if not isinstance(image, list):
247
+ image = [image]
248
+
249
+ if isinstance(image[0], PIL.Image.Image):
250
+ if crops_coords is not None:
251
+ image = [i.crop(crops_coords) for i in image]
252
+ if self.config.do_resize:
253
+ height, width = self.get_new_height_width(image[0], height, width, max_pixels, max_side_length)
254
+ image = [self.resize(i, height, width, resize_mode=resize_mode) for i in image]
255
+ if self.config.do_convert_grayscale:
256
+ image = [self.convert_to_grayscale(i) for i in image]
257
+ image = self.pil_to_numpy(image)
258
+ image = self.numpy_to_pt(image)
259
+ elif isinstance(image[0], np.ndarray):
260
+ image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
261
+ image = self.numpy_to_pt(image)
262
+ height, width = self.get_new_height_width(image, height, width, max_pixels, max_side_length)
263
+ if self.config.do_resize:
264
+ image = self.resize(image, height, width)
265
+ elif isinstance(image[0], torch.Tensor):
266
+ image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
267
+ if self.config.do_convert_grayscale and image.ndim == 3:
268
+ image = image.unsqueeze(1)
269
+ channel = image.shape[1]
270
+ if channel == self.config.vae_latent_channels:
271
+ return image
272
+ height, width = self.get_new_height_width(image, height, width, max_pixels, max_side_length)
273
+ if self.config.do_resize:
274
+ image = self.resize(image, height, width)
275
+
276
+ do_normalize = self.config.do_normalize
277
+ if do_normalize and image.min() < 0:
278
+ warnings.warn(
279
+ "Passing `image` as torch tensor with value range in [-1,1] is deprecated. "
280
+ f"The expected value range for image tensor is [0,1] when passing as pytorch tensor or numpy Array. "
281
+ f"You passed `image` with value range [{image.min()},{image.max()}]",
282
+ FutureWarning,
283
+ )
284
+ do_normalize = False
285
+ if do_normalize:
286
+ image = self.normalize(image)
287
+
288
+ if self.config.do_binarize:
289
+ image = self.binarize(image)
290
+
291
+ return image
292
+
293
+
294
+ @dataclass
295
+ class RefinerOutput:
296
+ images: Union[List[Image.Image], torch.Tensor]
297
+
298
+
299
+ class IdentityWithArgs(nn.Module):
300
+ """Placeholder Identity module for cond_proj."""
301
+
302
+ def __init__(self, dtype=torch.float32, device=None):
303
+ super().__init__()
304
+ self.register_buffer("_dummy", torch.zeros((), dtype=dtype, device=device))
305
+
306
+ @property
307
+ def dtype(self):
308
+ return self._dummy.dtype
309
+
310
+ @property
311
+ def device(self):
312
+ return self._dummy.device
313
+
314
+ def forward(self, x, *args, **kwargs):
315
+ return x
316
+
317
+
318
+ def _retrieve_timesteps(
319
+ scheduler: FlowMatchEulerDiscreteScheduler,
320
+ num_inference_steps: Optional[int] = None,
321
+ device: Optional[Union[str, torch.device]] = None,
322
+ timesteps: Optional[List[int]] = None,
323
+ **kwargs,
324
+ ):
325
+ # If scheduler uses dynamic shifting and caller passed num_tokens, compute mu
326
+ # (same as training code refiner pipeline)
327
+ num_tokens = kwargs.pop("num_tokens", None)
328
+ if num_tokens is not None and getattr(scheduler.config, "use_dynamic_shifting", False):
329
+ # Compute mu from num_tokens using scheduler's linear interpolation
330
+ base_shift = getattr(scheduler.config, "base_shift", 0.5)
331
+ max_shift = getattr(scheduler.config, "max_shift", 1.15)
332
+ base_seq_len = getattr(scheduler.config, "base_image_seq_len", 256)
333
+ max_seq_len = getattr(scheduler.config, "max_image_seq_len", 4096)
334
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
335
+ b = base_shift - m * base_seq_len
336
+ mu = num_tokens * m + b
337
+ kwargs["mu"] = mu
338
+
339
+ accepted = set(inspect.signature(scheduler.set_timesteps).parameters.keys())
340
+ filtered_kwargs = {k: v for k, v in kwargs.items() if k in accepted}
341
+
342
+ if timesteps is not None:
343
+ if "timesteps" not in accepted:
344
+ raise ValueError(
345
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
346
+ f" timestep schedules. Please check whether you are using the correct scheduler."
347
+ )
348
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **filtered_kwargs)
349
+ timesteps = scheduler.timesteps
350
+ num_inference_steps = len(timesteps)
351
+ else:
352
+ scheduler.set_timesteps(num_inference_steps, device=device, **filtered_kwargs)
353
+ timesteps = scheduler.timesteps
354
+ return timesteps, num_inference_steps
355
+
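The mu computation above is a straight linear interpolation in the number of latent tokens; a quick numeric check, assuming the getattr fallbacks rather than values read from any shipped scheduler config:

m = (1.15 - 0.5) / (4096 - 256)    # shift gained per extra latent token, ~1.69e-4
b = 0.5 - m * 256
print(256 * m + b, 4096 * m + b)   # ~0.5 for the smallest grid, ~1.15 for the largest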
356
+
357
+ class RefinerPipeline(DiffusionPipeline):
358
+ """
359
+ Image refiner evaluation pipeline.
360
+
361
+ - cond comes from upstream model: encoder_hidden_states (quants / last_latent)
362
+ - grid_thw_list is used to split cond (consistent with training)
363
+ - image as ref image
364
+ - Supports FlowMatchEulerDiscreteScheduler + velocity model
365
+ """
366
+
367
+ def __init__(
368
+ self,
369
+ vae: AutoencoderKL,
370
+ transformer: Transformer2DModel,
371
+ scheduler: FlowMatchEulerDiscreteScheduler,
372
+ cond_proj: Optional[nn.Module] = None,
373
+ ):
374
+ super().__init__()
375
+
376
+ self.register_modules(
377
+ vae=vae,
378
+ transformer=transformer,
379
+ scheduler=scheduler,
380
+ cond_proj=cond_proj if cond_proj is not None else IdentityWithArgs(),
381
+ )
382
+
383
+ self.vae_scale_factor = (
384
+ 2 ** (len(self.vae.config.block_out_channels) - 1)
385
+ if hasattr(self.vae.config, "block_out_channels")
386
+ else 8
387
+ )
388
+ self.image_processor = RefinerImageProcessor(
389
+ vae_scale_factor=self.vae_scale_factor * 2, do_resize=True
390
+ )
391
+ self.patch_size = int(getattr(self.transformer.config, "patch_size", 16))
392
+
393
+ self._num_timesteps: int = 0
394
+ self._current_timestep: Optional[torch.Tensor] = None
395
+ self._interrupt: bool = False
396
+ self._freqs_cis: Optional[torch.Tensor] = None
397
+ self._text_guidance_scale: float = 1.0
398
+ self._image_guidance_scale: float = 1.0
399
+ self._cfg_range: Tuple[float, float] = (0.0, 1.0)
400
+
401
+ @torch.no_grad()
402
+ def _get_freqs_cis(self, device, dtype):
403
+ if self._freqs_cis is None:
404
+ self._freqs_cis = RotaryPosEmbed.get_freqs_cis(
405
+ self.transformer.config.axes_dim_rope,
406
+ self.transformer.config.axes_lens,
407
+ theta=10000,
408
+ )
409
+ return self._freqs_cis
410
+
411
+ @staticmethod
412
+ def _split_tokens(
413
+ encoder_hidden_states: torch.Tensor,
414
+ grid_thw_list: List[Tuple[int, int, int]],
415
+ ) -> List[torch.Tensor]:
416
+ splits = [int(h) * int(w) // 4 for (_, h, w) in grid_thw_list]
417
+ return list(torch.split(encoder_hidden_states, splits, dim=1))
418
+
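A quick sanity check of the split arithmetic above (worked numbers; the 2x2-merged-patch grid convention is an assumption, not something stated in this file): a grid_thw entry of (1, 74, 74) yields 74 * 74 // 4 = 1369 conditioning tokens, i.e. a 37 x 37 token map, which lines up with the token_h / token_w defaults in generation_config.json.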
419
+ @staticmethod
420
+ def _looks_like_latents(x: Union[torch.Tensor, Image.Image], latent_ch_hint: int = 16) -> bool:
421
+ if not isinstance(x, torch.Tensor):
422
+ return False
423
+ if x.ndim not in (3, 4):
424
+ return False
425
+ c = int(x.shape[-3])
426
+ if c == 3:
427
+ return False
428
+ if c == latent_ch_hint:
429
+ return True
430
+ if c > 3 and c <= 32:
431
+ return True
432
+ return False
433
+
434
+ @torch.no_grad()
435
+ def _preprocess_to_vae_range(self, img: torch.Tensor) -> torch.Tensor:
436
+ if img.dtype not in (torch.float32, torch.float16, torch.bfloat16):
437
+ img = img.float()
438
+ if img.max() > 1.5:
439
+ img = img / 255.0
440
+ if img.min() >= 0.0 and img.max() <= 1.0:
441
+ img = img * 2.0 - 1.0
442
+ return img.clamp(-1, 1)
443
+
444
+ @torch.no_grad()
445
+ def _encode_image_to_latents(
446
+ self,
447
+ img_any: Union[Image.Image, torch.Tensor],
448
+ device,
449
+ dtype,
450
+ ) -> Tuple[torch.Tensor, int, int]:
451
+ latent_ch_hint = int(getattr(getattr(self.vae, "config", None), "latent_channels", 16))
452
+
453
+ if self._looks_like_latents(img_any, latent_ch_hint=latent_ch_hint):
454
+ z = img_any
455
+ if z.ndim == 3:
456
+ z = z.unsqueeze(0)
457
+ z = z.to(device=device, dtype=dtype)
458
+ H_lat, W_lat = z.shape[-2], z.shape[-1]
459
+ return z, H_lat, W_lat
460
+
461
+ if isinstance(img_any, Image.Image):
462
+ img = torch.from_numpy(
463
+ np.array(img_any).astype("float32") / 255.0
464
+ ).permute(2, 0, 1).unsqueeze(0)
465
+ elif isinstance(img_any, torch.Tensor):
466
+ img = img_any
467
+ if img.ndim == 3:
468
+ img = img.unsqueeze(0)
469
+ else:
470
+ raise TypeError("Unsupported image type. Use PIL.Image or torch.Tensor or latent Tensor.")
471
+
472
+ img = self._preprocess_to_vae_range(img)
473
+
474
+ H, W = img.shape[-2:]
475
+ base = self.patch_size * self.vae_scale_factor
476
+ target_H = max(base, math.ceil(H / base) * base)
477
+ target_W = max(base, math.ceil(W / base) * base)
478
+ if (H != target_H) or (W != target_W):
479
+ img = F.interpolate(img, size=(target_H, target_W), mode="bilinear", align_corners=False)
480
+
481
+ img = img.to(device=device, dtype=self.vae.dtype)
482
+
483
+ posterior = self.vae.encode(img).latent_dist
484
+ z0 = posterior.sample()
485
+ if getattr(self.vae.config, "shift_factor", None) is not None:
486
+ z0 = z0 - self.vae.config.shift_factor
487
+ if getattr(self.vae.config, "scaling_factor", None) is not None:
488
+ z0 = z0 * self.vae.config.scaling_factor
489
+
490
+ z0 = z0.to(device=device, dtype=dtype)
491
+ H_lat, W_lat = z0.shape[-2], z0.shape[-1]
492
+ return z0, H_lat, W_lat
493
+
494
+ @staticmethod
495
+ def _expand_to_list(x, n):
496
+ if x is None:
497
+ return [None] * n
498
+ if isinstance(x, (Image.Image, torch.Tensor)):
499
+ return [x] * n
500
+ assert isinstance(x, list), "`image` must be PIL / Tensor or list of them."
501
+ assert len(x) == n, "`len(image)` must equal number of image chunks"
502
+ return x
503
+
504
+ @torch.no_grad()
505
+ def _denoise_once(
506
+ self,
507
+ cond_tokens: torch.Tensor,
508
+ ref_img: Optional[Union[Image.Image, torch.Tensor]],
509
+ num_inference_steps: int = 28,
510
+ timesteps: Optional[List[int]] = None,
511
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
512
+ output_type: str = "pil",
513
+ text_guidance_scale: float = 1.0,
514
+ image_guidance_scale: float = 1.0,
515
+ cfg_range: Tuple[float, float] = (0.0, 1.0),
516
+ enable_processor_bar: bool = True,
517
+ ):
518
+ device = cond_tokens.device
519
+ weight_dtype = self.transformer.dtype
520
+
521
+ self._text_guidance_scale = text_guidance_scale
522
+ self._image_guidance_scale = image_guidance_scale
523
+ self._cfg_range = cfg_range
524
+
525
+ cond_tokens = cond_tokens.to(device=device, dtype=weight_dtype)
526
+ text_feats = self.cond_proj(cond_tokens)
527
+ B, L, _ = text_feats.shape
528
+ text_mask = torch.ones(B, L, device=device, dtype=torch.bool)
529
+
530
+ ref_image_hidden_states = None
531
+ H_lat: int
532
+ W_lat: int
533
+
534
+ if ref_img is not None:
535
+ if isinstance(ref_img, torch.Tensor) and ref_img.ndim == 4 and ref_img.shape[0] == B:
536
+ z_ref, H_lat, W_lat = self._encode_image_to_latents(ref_img, device=device, dtype=weight_dtype)
537
+ ref_image_hidden_states = [[z_ref[b]] for b in range(B)]
538
+ else:
539
+ z_ref, H_lat, W_lat = self._encode_image_to_latents(ref_img, device=device, dtype=weight_dtype)
540
+ z_single = z_ref[0]
541
+ ref_image_hidden_states = [[z_single] for _ in range(B)]
542
+ else:
543
+ H_lat = W_lat = 128 // self.vae_scale_factor
544
+
545
+ C_lat = getattr(self.transformer.config, "in_channels", None)
546
+ if C_lat is None:
547
+ if ref_image_hidden_states is not None:
548
+ C_lat = ref_image_hidden_states[0][0].shape[0]
549
+ else:
550
+ raise ValueError("transformer.config.in_channels is None and no ref_img was provided.")
551
+ latents_shape = (B, C_lat, H_lat, W_lat)
552
+
553
+ if isinstance(generator, list):
554
+ if len(generator) != B:
555
+ raise ValueError(
556
+ f"len(generator)={len(generator)} must equal B={B} when passing list of generators."
557
+ )
558
+ latents = torch.stack(
559
+ [
560
+ torch.randn(
561
+ (1, C_lat, H_lat, W_lat),
562
+ generator=generator[i],
563
+ device=device,
564
+ dtype=weight_dtype,
565
+ ).squeeze(0)
566
+ for i in range(B)
567
+ ],
568
+ dim=0,
569
+ )
570
+ else:
571
+ latents = torch.randn(latents_shape, generator=generator, device=device, dtype=weight_dtype)
572
+
573
+ num_tokens = H_lat * W_lat
574
+ timesteps_sched, num_inference_steps = _retrieve_timesteps(
575
+ self.scheduler,
576
+ num_inference_steps=num_inference_steps,
577
+ device=device,
578
+ timesteps=timesteps,
579
+ num_tokens=num_tokens,
580
+ )
581
+ num_warmup_steps = max(len(timesteps_sched) - num_inference_steps * self.scheduler.order, 0)
582
+ self._num_timesteps = len(timesteps_sched)
583
+
584
+ freqs_cis = self._get_freqs_cis(device=device, dtype=weight_dtype)
585
+
586
+ progress_bar = self.progress_bar(total=num_inference_steps) if enable_processor_bar else None
587
+ for i, t in enumerate(timesteps_sched):
588
+ if self._interrupt:
589
+ continue
590
+ self._current_timestep = t
591
+
592
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
593
+
594
+ step_frac = i / max(len(timesteps_sched) - 1, 1)
595
+ use_cfg = (cfg_range[0] <= step_frac <= cfg_range[1]) and (
596
+ text_guidance_scale > 1.0 or image_guidance_scale > 1.0
597
+ )
598
+
599
+ if not use_cfg:
600
+ optional_kwargs: Dict[str, Any] = {}
601
+ if "ref_image_hidden_states" in inspect.signature(self.transformer.forward).parameters:
602
+ optional_kwargs["ref_image_hidden_states"] = ref_image_hidden_states
603
+ model_pred = self.transformer(
604
+ latents, timestep, text_feats, freqs_cis, text_mask, **optional_kwargs
605
+ )
606
+ else:
607
+ text_uncond = torch.zeros_like(text_feats)
608
+
609
+ opt_kwargs_text: Dict[str, Any] = {}
610
+ if "ref_image_hidden_states" in inspect.signature(self.transformer.forward).parameters:
611
+ opt_kwargs_text["ref_image_hidden_states"] = ref_image_hidden_states
612
+
613
+ model_pred_text = self.transformer(
614
+ latents, timestep, text_feats, freqs_cis, text_mask, **opt_kwargs_text
615
+ )
616
+
617
+ opt_kwargs_ref: Dict[str, Any] = {}
618
+ if "ref_image_hidden_states" in inspect.signature(self.transformer.forward).parameters:
619
+ opt_kwargs_ref["ref_image_hidden_states"] = ref_image_hidden_states
620
+
621
+ model_pred_ref = self.transformer(
622
+ latents, timestep, text_uncond, freqs_cis, text_mask, **opt_kwargs_ref
623
+ )
624
+
625
+ opt_kwargs_uncond: Dict[str, Any] = {}
626
+ if "ref_image_hidden_states" in inspect.signature(self.transformer.forward).parameters:
627
+ opt_kwargs_uncond["ref_image_hidden_states"] = None
628
+
629
+ model_pred_uncond = self.transformer(
630
+ latents, timestep, text_uncond, freqs_cis, text_mask, **opt_kwargs_uncond
631
+ )
632
+
633
+ if text_guidance_scale > 1.0 and image_guidance_scale > 1.0:
634
+ model_pred = (
635
+ model_pred_uncond
636
+ + image_guidance_scale * (model_pred_ref - model_pred_uncond)
637
+ + text_guidance_scale * (model_pred_text - model_pred_ref)
638
+ )
639
+ elif text_guidance_scale > 1.0:
640
+ model_pred = model_pred_uncond + text_guidance_scale * (model_pred_text - model_pred_uncond)
641
+ elif image_guidance_scale > 1.0:
642
+ model_pred = model_pred_uncond + image_guidance_scale * (model_pred_ref - model_pred_uncond)
643
+ else:
644
+ model_pred = model_pred_text
645
+
646
+ latents = self.scheduler.step(model_pred, t, latents, return_dict=False)[0]
647
+ latents = latents.to(dtype=weight_dtype)
648
+
649
+ if progress_bar is not None:
650
+ if i == len(timesteps_sched) - 1 or (
651
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
652
+ ):
653
+ progress_bar.update()
654
+
655
+ if progress_bar is not None:
656
+ progress_bar.close()
657
+
658
+ self._current_timestep = None
659
+
660
+ latents = latents.to(dtype=self.vae.dtype)
661
+ if getattr(self.vae.config, "scaling_factor", None) is not None:
662
+ latents = latents / self.vae.config.scaling_factor
663
+ if getattr(self.vae.config, "shift_factor", None) is not None:
664
+ latents = latents + self.vae.config.shift_factor
665
+ image = self.vae.decode(latents, return_dict=False)[0]
666
+
667
+ images = self.image_processor.postprocess(image, output_type=output_type)
668
+ return images
669
+
670
+ @torch.no_grad()
671
+ def __call__(
672
+ self,
673
+ *,
674
+ encoder_hidden_states: torch.Tensor,
675
+ grid_thw_list: List[Tuple[int, int, int]],
676
+ image: Union[Image.Image, torch.Tensor, List[Union[Image.Image, torch.Tensor]], None] = None,
677
+ num_inference_steps: int = 28,
678
+ timesteps: Optional[List[int]] = None,
679
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
680
+ output_type: str = "pil",
681
+ return_dict: bool = True,
682
+ text_guidance_scale: float = 1.5,
683
+ image_guidance_scale: float = 1.5,
684
+ cfg_range: Tuple[float, float] = (0.0, 1.0),
685
+ enable_processor_bar: bool = True,
686
+ **kwargs,
687
+ ) -> Union[RefinerOutput, List[Image.Image], torch.Tensor]:
688
+ self._interrupt = False
689
+
690
+ token_chunks = self._split_tokens(encoder_hidden_states, grid_thw_list)
691
+ ref_list = self._expand_to_list(image, len(token_chunks))
692
+
693
+ results_pil: List[Image.Image] = []
694
+ results_pt: Optional[torch.Tensor] = None
695
+
696
+ for tok, _, img_any in zip(token_chunks, grid_thw_list, ref_list):
697
+ imgs = self._denoise_once(
698
+ cond_tokens=tok,
699
+ ref_img=img_any,
700
+ num_inference_steps=num_inference_steps,
701
+ timesteps=timesteps,
702
+ generator=generator,
703
+ output_type=output_type,
704
+ text_guidance_scale=text_guidance_scale,
705
+ image_guidance_scale=image_guidance_scale,
706
+ cfg_range=cfg_range,
707
+ enable_processor_bar=enable_processor_bar,
708
+ )
709
+
710
+ if output_type == "pil":
711
+ results_pil += imgs
712
+ else:
713
+ results_pt = imgs if results_pt is None else torch.cat([results_pt, imgs], dim=0)
714
+
715
+ if not return_dict:
716
+ return results_pil if output_type == "pil" else results_pt
717
+ return RefinerOutput(images=results_pil if output_type == "pil" else results_pt)
718
+
719
+
720
+ def de_transform(
721
+ tensor: torch.Tensor,
722
+ mean=(0.48145466, 0.4578275, 0.40821073),
723
+ std=(0.26862954, 0.26130258, 0.27577711),
724
+ rescale_factor: float = 1 / 255,
725
+ ) -> torch.Tensor:
726
+ """De-normalize and de-rescale, suitable for images processed by Qwen2VLImageProcessor."""
727
+ if tensor.ndim == 3:
728
+ tensor = tensor.unsqueeze(0)
729
+ mean_t = torch.tensor(mean).view(1, -1, 1, 1).to(tensor.device)
730
+ std_t = torch.tensor(std).view(1, -1, 1, 1).to(tensor.device)
731
+ tensor = tensor * std_t + mean_t
732
+ tensor = tensor / rescale_factor
733
+ tensor = torch.clamp(tensor / 255.0, 0, 1)
734
+ return tensor
735
+
736
+
737
+ def tensor2pil(image_t: torch.Tensor, image_mean, image_std) -> Image.Image:
738
+ """Convert a tensor to a PIL Image."""
739
+ image_t = image_t.detach().cpu()
740
+ rescale_factor = 1 / 255
741
+ sample = de_transform(
742
+ image_t,
743
+ mean=image_mean,
744
+ std=image_std,
745
+ rescale_factor=rescale_factor,
746
+ )[0]
747
+ ndarr = sample.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
748
+ return Image.fromarray(ndarr)
modeling_longcat_next.py ADDED
@@ -0,0 +1,824 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2026 Meituan
3
+ # This code is licensed under the MIT License, for details, see the ./LICENSE file.
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+ from tqdm import tqdm
8
+ from typing import Optional, Union
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch import nn
13
+
14
+ from transformers.cache_utils import Cache
15
+ from transformers.generation.configuration_utils import GenerationConfig
16
+ from transformers.generation.logits_process import LogitsProcessorList
17
+ from transformers.generation.stopping_criteria import StoppingCriteriaList
18
+ from transformers.generation.utils import GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput, GenerateNonBeamOutput
19
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
20
+ from transformers.models.longcat_flash.modeling_longcat_flash import LongcatFlashForCausalLM
21
+ from transformers.processing_utils import Unpack
22
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
23
+
24
+ from .configuration_longcat_next import LongcatNextConfig
25
+ from .modeling_longcat_ngram import LongcatFlashNgramModel, NgramCache
26
+ from .modular_longcat_next import CasualDepthTransformerHead
27
+ from .modular_longcat_next_audio import LongcatNextAudioTokenizer
28
+ from .modular_longcat_next_visual import LongcatNextVisualTokenizer
29
+
30
+ from .cosy24k_vocoder import Cosy24kVocoder
31
+ from .image_refiner import ImageRefinerContainer
32
+ from .refiner_modules import FlowMatchEulerDiscreteScheduler
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+ @dataclass
37
+ class LongcatNextForCausalLMOutputWithPast(CausalLMOutputWithPast):
38
+ visual_loss: Optional[torch.FloatTensor] = None
39
+ visual_logits: Optional[torch.FloatTensor] = None
40
+ visual_ids: Optional[torch.LongTensor] = None
41
+ audio_loss: Optional[torch.FloatTensor] = None
42
+ audio_logits: Optional[torch.FloatTensor] = None
43
+ audio_ids: Optional[torch.LongTensor] = None
44
+
45
+ @dataclass
46
+ class LongcatNextForCausalLMGenerateDecoderOnlyOutput(GenerateDecoderOnlyOutput):
47
+ visual_ids: Optional[torch.LongTensor] = None
48
+ audio_ids: Optional[torch.LongTensor] = None
49
+ audio_text_ids: Optional[torch.LongTensor] = None
50
+
51
+ @dataclass
52
+ class LongcatNextForCausalLMGenerateEncoderDecoderOutput(GenerateEncoderDecoderOutput):
53
+ visual_ids: Optional[torch.LongTensor] = None
54
+ audio_ids: Optional[torch.LongTensor] = None
55
+ audio_text_ids: Optional[torch.LongTensor] = None
56
+
57
+ @dataclass
58
+ class LongcatNextForCausalLMGenerationStatus:
59
+ mode: str = "text"
60
+ current_image_token_num: int = -1
61
+ audio_parallel_decoding: bool = False
62
+ is_audio_text_end: bool = False
63
+ is_audio_start: bool = False
64
+ last_step_mode: str = None
65
+
66
+ def __init__(self, visual_generation_config, audio_generation_config):
67
+ self.visual_generation_config = visual_generation_config
68
+ self.h = self.visual_generation_config.custom_params["token_h"]
69
+ self.w = self.visual_generation_config.custom_params["token_w"]
70
+ self.anyres_prefix = self.visual_generation_config.custom_params["anyres_prefix"].format(h=self.h, w=self.w)
71
+ self.audio_generation_config = audio_generation_config
72
+ self.audio_parallel_decoding = audio_generation_config.audio_parallel_decoding
73
+
74
+ def switch_to(self, modal):
75
+ assert modal in ["text", "visual", "audio"]
76
+ self.mode = modal
77
+ self.current_image_token_num = 0 if modal == "visual" else -1
78
+ self.is_audio_text_end = False
79
+ self.is_audio_start = False
80
+
81
+ @property
82
+ def is_img_newline(self):
83
+ return ((self.current_image_token_num + 1) % (self.w + 1)) == 0 and not self.is_img_end
84
+
85
+ @property
86
+ def is_img_end(self):
87
+ return (self.current_image_token_num + 1) / (self.w + 1) == self.h
88
+
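For reference, worked numbers using the token_h = token_w = 37 defaults from generation_config.json: each generated image row consists of w = 37 visual tokens followed by one image-newline token, so is_img_newline fires at every 38th image position and is_img_end becomes true after 37 * 38 = 1406 positions.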
89
+
90
+ class LongcatNextModel(LongcatFlashNgramModel):
91
+ _keys_to_ignore_on_load_unexpected = [r"model\.mtp.*"]
92
+ config_class = LongcatNextConfig
93
+
94
+ def __init__(self, config):
95
+ super().__init__(config)
96
+ self.visual_tokenizer = LongcatNextVisualTokenizer(config)
97
+ self.audio_tokenizer = LongcatNextAudioTokenizer(config)
98
+
99
+ self._init_multimodal_constants(config)
100
+ self.post_init()
101
+
102
+ def _init_multimodal_constants(self, config):
103
+ name2id_dict = {
104
+ "image_newline_token_id": self.config.visual_config.image_newline_token_id,
105
+ "image_end_token_id": self.config.visual_config.image_end_token_id,
106
+ "image_pad_token_id": self.config.visual_config.image_pad_token_id,
107
+ "audiotext_start_token_id": config.audio_config.audiotext_start_token_id,
108
+ "audiotext_pad_token_id": self.config.audio_config.audiotext_pad_token_id,
109
+ "audiogen_end_token_id": config.audio_config.audiogen_end_token_id,
110
+ "audio_pad_token_id": self.config.audio_config.audio_pad_token_id,
111
+ }
112
+ for k, v in name2id_dict.items():
113
+ self.register_buffer(k, torch.tensor([v], dtype=torch.long), persistent=False)
114
+ visual_offset_list = [config.visual_offset] + config.visual_config.vq_config.codebook_sizes[:-1]
115
+ visual_offset_vals = torch.cumsum(torch.tensor(visual_offset_list, dtype=torch.long), dim=0)
116
+ self.register_buffer("visual_offset_vals", visual_offset_vals, persistent=False)
117
+ audio_offset_list = [config.audio_offset] + config.audio_config.vq_config.codebook_sizes[:-1]
118
+ audio_offset_vals = torch.cumsum(torch.tensor(audio_offset_list, dtype=torch.long), dim=0)
119
+ self.register_buffer("audio_offset_vals", audio_offset_vals, persistent=False)
120
+ print(f"{self.visual_offset_vals=}")
121
+ print(f"{self.audio_offset_vals=}")
122
+
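A quick illustration of the offset bookkeeping above (hypothetical numbers): with visual_offset = 131072 and codebook_sizes = [8192, 8192, 8192], visual_offset_vals becomes [131072, 139264, 147456], so the ids of each residual codebook occupy disjoint slices of the shared token-embedding table and their embeddings can simply be summed in get_visual_embeddings.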
123
+ def forward(
124
+ self,
125
+ input_ids: Optional[torch.LongTensor] = None,
126
+ attention_mask: Optional[torch.Tensor] = None,
127
+ position_ids: Optional[torch.LongTensor] = None,
128
+ past_key_values: Optional[Cache] = None,
129
+ inputs_embeds: Optional[torch.FloatTensor] = None,
130
+ cache_position: Optional[torch.LongTensor] = None,
131
+ use_cache: Optional[bool] = None,
132
+ visual_inputs=None,
133
+ visual_ids=None,
134
+ audio_inputs=None,
135
+ audio_ids=None,
136
+ audio_text_ids=None,
137
+ multimodal_generation_status=None,
138
+ **kwargs
139
+ ) -> BaseModelOutputWithPast:
140
+
141
+ if input_ids is None:
142
+ raise ValueError("You must specify input_ids")
143
+
144
+ # Extract N-gram context if available
145
+ ngram_context = None
146
+ if isinstance(past_key_values, NgramCache) and past_key_values.ngram_context is not None:
147
+ ngram_context = past_key_values.ngram_context
148
+
149
+ # assert input_ids.size(0) == 1, "only support bs=1 for now" # but when bs=2, idx=1 is for uncond_image_generation
150
+ special_visual_mask, special_audio_mask, special_audio_text_start_mask, special_audio_text_pad_mask = self.get_placeholder_mask(input_ids[:1]) # seq-dim
151
+
152
+ if inputs_embeds is None:
153
+ input_ids[:, special_visual_mask | special_audio_mask | special_audio_text_pad_mask | special_audio_text_start_mask] = 0
154
+ filled_text_pad_mask = torch.ones_like(special_audio_mask)
155
+ audio_text_position_mask = (special_audio_text_pad_mask | special_audio_text_start_mask | special_audio_mask)
156
+
157
+ if audio_text_ids is not None and audio_text_ids.size(1) > 0 and audio_text_position_mask.sum() > 0:
158
+ filled_text = audio_text_ids[:, -audio_text_position_mask.sum():]
159
+ filled_text_pad_mask = (filled_text==self.config.audio_config.audiotext_pad_token_id)[0]
160
+ input_ids[:, audio_text_position_mask] = filled_text
161
+ input_ids[input_ids == self.config.audio_config.audiotext_pad_token_id] = 0
162
+
163
+ inputs_embeds = self.ngram_embeddings(input_ids, ngram_context=ngram_context)
164
+ inputs_embeds[:, (special_visual_mask | (special_audio_mask & filled_text_pad_mask))] = 0
165
+
166
+ if special_audio_text_start_mask.sum() > 0:
167
+ audio_text_start_embedding = self.embed_tokens(self.audiotext_start_token_id)
168
+ if multimodal_generation_status.last_step_mode is None: # prefill
169
+ inputs_embeds[:1, special_audio_text_start_mask] += audio_text_start_embedding
170
+ else:
171
+ inputs_embeds[:, special_audio_text_start_mask] += audio_text_start_embedding
172
+
173
+ if visual_inputs is not None:
174
+ visual_ids = self.get_visual_ids(**visual_inputs) # [<bs=1>*seq, lev]
175
+
176
+ if visual_ids is not None and special_visual_mask.sum() > 0:
177
+ visual_embeddings = self.get_visual_embeddings(visual_ids[-special_visual_mask.sum():]) # -> [seq, dim]
178
+ if multimodal_generation_status.last_step_mode is None: # prefill
179
+ inputs_embeds[:1, special_visual_mask] = visual_embeddings.to(inputs_embeds.device)
180
+ else:
181
+ inputs_embeds[:, special_visual_mask] = visual_embeddings.to(inputs_embeds.device)
182
+
183
+ if audio_inputs is not None:
184
+ audio_ids = self.get_audio_ids(**audio_inputs) # -> [<bs=1>*seq, lev]
185
+
186
+ if audio_ids is not None and special_audio_mask.sum() > 0:
187
+ audio_embeddings = self.get_audio_embeddings(audio_ids[-special_audio_mask.sum():]) # -> [seq, dim]
188
+ if multimodal_generation_status.last_step_mode is None: # prefill
189
+ inputs_embeds[:1, special_audio_mask] += audio_embeddings.to(inputs_embeds.device)
190
+ else:
191
+ inputs_embeds[:, special_audio_mask] += audio_embeddings.to(inputs_embeds.device)
192
+
193
+ # Initialize NgramCache if needed
194
+ if use_cache and past_key_values is None:
195
+ past_key_values = NgramCache(config=self.config)
196
+
197
+ # Update N-gram context
198
+ if use_cache and isinstance(past_key_values, NgramCache):
199
+ past_key_values.update_ngram_context(input_ids)
200
+
201
+ return super().forward(
202
+ input_ids=None,
203
+ attention_mask=attention_mask,
204
+ position_ids=position_ids,
205
+ past_key_values=past_key_values,
206
+ inputs_embeds=inputs_embeds,
207
+ cache_position=cache_position,
208
+ use_cache=use_cache,
209
+ **kwargs
210
+ )
211
+
212
+ def get_visual_ids(self, pixel_values, visual_grid_thw, offset=True):
213
+ visual_ids = self.visual_tokenizer.encode(pixel_values, visual_grid_thw)
214
+ if offset:
215
+ visual_ids += self.visual_offset_vals.to(visual_ids.device)
216
+ return visual_ids
217
+
218
+ def get_audio_ids(self, audio, encoder_length, bridge_length, offset=True):
219
+ audio_ids = self.audio_tokenizer.encode(audio, encoder_length, bridge_length)
220
+ if offset:
221
+ audio_ids += self.audio_offset_vals.to(audio_ids.device)
222
+ return audio_ids
223
+
224
+ @torch.no_grad()
225
+ def decode_visual_ids_and_save(
226
+ self,
227
+ visual_ids,
228
+ save_prefix,
229
+ token_h,
230
+ token_w,
231
+ **kwargs,
232
+ ):
233
+ visual_ids -= self.visual_offset_vals.to(visual_ids.device)
234
+
235
+ if not (save_prefix.startswith("./") or save_prefix.startswith("/")):
236
+ save_prefix = f"./{save_prefix}"
237
+ os.makedirs(os.path.dirname(save_prefix), exist_ok=True)
238
+ return self.visual_tokenizer.lazy_decode_and_save(visual_ids, token_h, token_w, f"{save_prefix}_{0}.png")
239
+
240
+ @torch.no_grad()
241
+ def decode_audio_ids_and_save(
242
+ self,
243
+ audio_ids,
244
+ save_prefix,
245
+ sampling_rate,
246
+ wave_concat_overlap,
247
+ **kwargs,
248
+ ):
249
+ audio_ids -= self.audio_offset_vals.to(audio_ids.device)
250
+
251
+ if not (save_prefix.startswith("./") or save_prefix.startswith("/")):
252
+ save_prefix = f"./{save_prefix}"
253
+ os.makedirs(os.path.dirname(save_prefix), exist_ok=True)
254
+ save_path = f"{save_prefix}_{0}.wav"
255
+ self.audio_tokenizer.lazy_decode_and_save(audio_ids, sampling_rate, wave_concat_overlap, save_path)
256
+ return [save_path]
257
+
258
+ def get_visual_embeddings(self, visual_ids):
259
+ visual_embeddings = self.embed_tokens(visual_ids).sum(dim=1) # [seq, lev] -> [seq, lev, dim] -> [seq, dim]
260
+ visual_embeddings = self.visual_tokenizer.visual_embedding_layer(visual_embeddings)
261
+ return visual_embeddings
262
+
263
+ def get_audio_embeddings(self, audio_ids):
264
+ audio_embeddings = self.embed_tokens(audio_ids).sum(dim=1)
265
+ return audio_embeddings
266
+
267
+ def get_placeholder_mask(self, input_ids: torch.LongTensor):
268
+ special_image_mask = (input_ids == self.config.visual_config.image_pad_token_id).squeeze(0)
269
+ special_audio_mask = (input_ids == self.config.audio_config.audio_pad_token_id).squeeze(0)
270
+ special_audio_text_start_mask = (input_ids == self.config.audio_config.audiotext_start_token_id).squeeze(0)
271
+ special_audio_text_pad_mask = (input_ids == self.config.audio_config.audiotext_pad_token_id).squeeze(0)
272
+ return special_image_mask, special_audio_mask, special_audio_text_start_mask, special_audio_text_pad_mask
273
+
274
+
275
+ class LongcatNextForCausalLM(LongcatFlashForCausalLM):
276
+ _keys_to_ignore_on_load_unexpected = [r"model\.mtp.*"]
277
+ _no_split_modules = [
278
+ "LongcatFlashDecoderLayer",
279
+ "CasualDepthTransformerHead",
280
+ ]
281
+ config_class = LongcatNextConfig
282
+
283
+ def __init__(self, config):
284
+ super().__init__(config)
285
+ self.config = config
286
+ self.model = LongcatNextModel(config)
287
+ self.lm_head = nn.Linear(config.hidden_size, config.text_vocab_plus_multimodal_special_token_size, bias=False)
288
+
289
+ self.visual_head = CasualDepthTransformerHead(
290
+ hidden_size=config.hidden_size,
291
+ codebook_sizes=config.visual_config.vq_config.codebook_sizes,
292
+ transformer_layer_num=config.visual_config.image_head_transformer_layers,
293
+ transformer_dim=config.visual_config.image_head_transformer_dims,
294
+ transformer_ffn_scale=config.visual_config.image_head_transformer_ffn_scale,
295
+ )
296
+ self.audio_head = CasualDepthTransformerHead(
297
+ hidden_size=config.hidden_size,
298
+ codebook_sizes=config.audio_config.vq_config.codebook_sizes,
299
+ transformer_layer_num=config.audio_config.audio_head_transformer_layers,
300
+ transformer_dim=config.audio_config.audio_head_transformer_dims,
301
+ transformer_ffn_scale=config.audio_config.audio_head_transformer_ffn_scale,
302
+ )
303
+
304
+ self.post_init()
305
+
306
+ @can_return_tuple
307
+ @auto_docstring
308
+ def forward(
309
+ self,
310
+ input_ids: Optional[torch.LongTensor] = None,
311
+ attention_mask: Optional[torch.Tensor] = None,
312
+ position_ids: Optional[torch.LongTensor] = None,
313
+ past_key_values: Optional[Cache] = None,
314
+ inputs_embeds: Optional[torch.FloatTensor] = None,
315
+ labels: Optional[torch.LongTensor] = None,
316
+ use_cache: Optional[bool] = None,
317
+ cache_position: Optional[torch.LongTensor] = None,
318
+ logits_to_keep: Union[int, torch.Tensor] = 0,
319
+ visual_inputs=None,
320
+ visual_ids=None,
321
+ audio_inputs=None,
322
+ audio_ids=None,
323
+ audio_text_ids=None,
324
+ multimodal_generation_status: LongcatNextForCausalLMGenerationStatus = None,
325
+ visual_generation_config: GenerationConfig = None,
326
+ audio_generation_config: GenerationConfig = None,
327
+ **kwargs: Unpack[TransformersKwargs],
328
+ ) -> CausalLMOutputWithPast:
329
+ r"""
330
+ visual_inputs (`BatchFeature`, *optional*):
331
+ Visual inputs returned by the processor, containing pixel values and grid metadata for image encoding.
332
+ visual_ids (`torch.LongTensor` of shape `(num_visual_tokens, num_codebooks)`, *optional*):
333
+ Quantized visual token ids from the visual tokenizer, used to build visual embeddings during generation.
334
+ audio_inputs (`BatchFeature`, *optional*):
335
+ Audio inputs returned by the processor, containing mel-spectrogram features and length metadata.
336
+ audio_ids (`torch.LongTensor` of shape `(num_audio_tokens, num_codebooks)`, *optional*):
337
+ Quantized audio token ids from the audio tokenizer, used to build audio embeddings during generation.
338
+ audio_text_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
339
+ Token ids for the audio text transcript generated alongside audio tokens.
340
+ multimodal_generation_status (`LongcatNextForCausalLMGenerationStatus`, *optional*):
341
+ Stateful object tracking the current multimodal generation mode (text / visual / audio) and
342
+ associated counters used to route logits to the correct head during auto-regressive decoding.
343
+ visual_generation_config (`GenerationConfig`, *optional*):
344
+ Generation configuration for the visual head, controlling sampling parameters such as
345
+ `temperature`, `top_k`, `top_p`, and custom parameters like `cfg_scale` and `anyres_config`.
346
+ audio_generation_config (`GenerationConfig`, *optional*):
347
+ Generation configuration for the audio head, controlling sampling parameters such as
348
+ `temperature`, `top_k`, `top_p`, `repetition_penalty`, and `audio_parallel_decoding`.
349
+ """
350
+
351
+ if multimodal_generation_status.mode == "visual" and visual_generation_config.custom_params["cfg_scale"] != 1.0 and input_ids.size(0) == 1:
352
+ input_ids = input_ids.repeat((2, 1))
353
+
354
+ outputs: BaseModelOutputWithPast = self.model(
355
+ input_ids=input_ids,
356
+ attention_mask=attention_mask,
357
+ position_ids=position_ids,
358
+ past_key_values=past_key_values,
359
+ inputs_embeds=inputs_embeds,
360
+ use_cache=use_cache,
361
+ cache_position=cache_position,
362
+ visual_inputs=visual_inputs,
363
+ visual_ids=visual_ids,
364
+ audio_inputs=audio_inputs,
365
+ audio_ids=audio_ids,
366
+ audio_text_ids=audio_text_ids,
367
+ multimodal_generation_status=multimodal_generation_status,
368
+ **kwargs,
369
+ )
370
+
371
+ hidden_states = outputs.last_hidden_state
372
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
373
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
374
+ slice_hidden_states = hidden_states[:, slice_indices, :]
375
+
376
+ loss, logits = None, None
377
+ if multimodal_generation_status.mode == "visual" and \
378
+ (not multimodal_generation_status.is_img_newline) and (not multimodal_generation_status.is_img_end):
379
+ visual_ids = self.get_multimodal_logits_and_ids(
380
+ self.visual_head,
381
+ visual_ids,
382
+ slice_hidden_states,
383
+ self.model.embed_tokens,
384
+ self.config.visual_config.vq_config.codebook_sizes,
385
+ self.model.visual_offset_vals,
386
+ visual_generation_config,
387
+ )
388
+ else:
389
+ logits = self.lm_head(slice_hidden_states)
390
+
391
+ if multimodal_generation_status.mode == "audio" and multimodal_generation_status.is_audio_start:
392
+ audio_ids = self.get_multimodal_logits_and_ids(
393
+ self.audio_head,
394
+ audio_ids,
395
+ slice_hidden_states,
396
+ self.model.embed_tokens,
397
+ self.config.audio_config.vq_config.codebook_sizes,
398
+ self.model.audio_offset_vals,
399
+ audio_generation_config,
400
+ )
401
+
402
+ return LongcatNextForCausalLMOutputWithPast(
403
+ loss=loss,
404
+ logits=logits,
405
+ past_key_values=outputs.past_key_values,
406
+ hidden_states=outputs.hidden_states,
407
+ attentions=outputs.attentions,
408
+ visual_ids=visual_ids,
409
+ audio_ids=audio_ids,
410
+ )
411
+
412
+ def get_multimodal_logits_and_ids(
413
+ self,
414
+ head_model,
415
+ multimodal_ids,
416
+ hidden_states,
417
+ multimodal_embedding_layer,
418
+ codebook_sizes,
419
+ offset_vals,
420
+ multimodal_generation_config,
421
+ ):
422
+ next_token_ids = torch.zeros(hidden_states.size(0), len(codebook_sizes), dtype=torch.long, device=hidden_states.device)
423
+ multimodal_embedding_layer = multimodal_embedding_layer.to(hidden_states.device)
424
+
425
+ for level, _ in enumerate(codebook_sizes):
426
+ logits = head_model(hidden_states, next_token_ids, multimodal_embedding_layer, level) # -> (bs, 1, dim)
427
+ next_token_id = self.inner_sample(logits, multimodal_ids[None, :, level]-offset_vals[level], multimodal_generation_config) # (bs, 1)
428
+ next_token_id += offset_vals[level]
429
+ next_token_ids[:, level] = next_token_id
430
+
431
+ return next_token_ids[:1]
432
+
433
+ def inner_sample(
434
+ self,
435
+ next_token_logits: torch.Tensor,
436
+ multimodal_ids: torch.LongTensor,
437
+ generation_config: GenerationConfig,
438
+ ) -> torch.Tensor:
439
+ logits_processor = self._get_logits_processor(generation_config)
440
+
441
+ if "cfg_scale" in generation_config.custom_params and generation_config.custom_params["cfg_scale"] != 1.0:
442
+ cond_logits, uncond_logits = next_token_logits.chunk(2, dim=0)
443
+ next_token_logits = generation_config.custom_params["cfg_scale"] * (cond_logits - uncond_logits) + uncond_logits
444
+
445
+ next_token_scores = logits_processor(multimodal_ids, next_token_logits.to(multimodal_ids.device))
446
+ if generation_config.do_sample:
447
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
448
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
449
+ else:
450
+ next_tokens = torch.argmax(next_token_scores, dim=-1)
451
+ return next_tokens
452
+
453
+ @torch.no_grad()
454
+ def generate(self, inputs=None, **kwargs):
455
+ """Override to ensure NgramCache is used."""
456
+
457
+ if "past_key_values" not in kwargs or kwargs["past_key_values"] is None:
458
+ kwargs["past_key_values"] = NgramCache(config=self.config)
459
+
460
+ return super().generate(
461
+ inputs=inputs,
462
+ **kwargs,
463
+ )
464
+
465
+ def prepare_inputs_for_generation(
466
+ self,
467
+ input_ids,
468
+ visual_ids,
469
+ audio_ids,
470
+ audio_text_ids,
471
+ multimodal_generation_status,
472
+ generation_config,
473
+ attention_mask,
474
+ cache_position,
475
+ **kwargs,
476
+ ):
477
+ extra_new_tokens = torch.empty(input_ids.size(0), 0, dtype=torch.long, device=input_ids.device)
478
+ if visual_ids is None:
479
+ visual_ids = torch.empty(0, len(self.config.visual_config.vq_config.codebook_sizes), dtype=torch.long, device=input_ids.device)
480
+ if audio_ids is None:
481
+ audio_ids = torch.empty(0, len(self.config.audio_config.vq_config.codebook_sizes), dtype=torch.long, device=input_ids.device)
482
+ if audio_text_ids is None:
483
+ audio_text_ids = torch.empty(input_ids.size(0), 0, dtype=torch.long, device=input_ids.device)
484
+
485
+ def insert_ids(new_ids, _input_ids, _attention_mask, _cache_position, position=0):
486
+ if position < 0:
487
+ parts = [_input_ids[:, :position], new_ids, _input_ids[:, position:]]
488
+ else:
489
+ parts = [_input_ids, new_ids]
490
+ _input_ids = torch.cat(parts, dim=1)
491
+ insert_len = new_ids.size(1)
492
+ _attention_mask = F.pad(_attention_mask, (0, insert_len), value=1)
493
+ insert_position = _cache_position[-1] + 1 + torch.arange(insert_len, device=_cache_position.device)
494
+ _cache_position = torch.cat([_cache_position, insert_position])
495
+ return _input_ids, _attention_mask, _cache_position
496
+
497
+ # multimodal generation status change
498
+ if cache_position[0] != 0:
499
+ multimodal_generation_status.last_step_mode = multimodal_generation_status.mode
500
+
501
+ if multimodal_generation_status.mode == "visual":
502
+ multimodal_generation_status.current_image_token_num += 1
503
+
504
+ if (input_ids[:, -1] == self.config.visual_config.image_start_token_id).all():
505
+ multimodal_generation_status.switch_to("visual")
506
+ anyres_prefix_ids = self.text_tokenizer.encode(multimodal_generation_status.anyres_prefix, return_tensors="pt")
507
+ anyres_prefix_ids = anyres_prefix_ids.to(input_ids.device)
508
+ extra_new_tokens = torch.cat([extra_new_tokens, anyres_prefix_ids], dim=1)
509
+ input_ids, attention_mask, cache_position = insert_ids(anyres_prefix_ids, input_ids, attention_mask, cache_position, position=-1)
510
+ if input_ids.size(0) == 1: # cfg, change bs=1 -> 2
511
+ input_ids = input_ids.repeat((2, 1))
512
+ input_ids[1, :-(anyres_prefix_ids.size(-1)+1)] = 0
513
+ print(f"change to cfg, input_ids: {input_ids}")
514
+ attention_mask = attention_mask.repeat((2, 1))
515
+
516
+ elif (input_ids[:, -1] == self.config.audio_config.audiogen_start_token_id).all():
517
+ multimodal_generation_status.switch_to("audio")
518
+
519
+ elif (input_ids[:, -1] == self.config.audio_config.audiotext_start_token_id).all():
520
+ multimodal_generation_status.is_audio_start = True
521
+
522
+ elif ((input_ids[:, -1] == self.config.visual_config.image_end_token_id) | (input_ids[:, -1] == self.config.audio_config.audiogen_end_token_id)).all():
523
+ multimodal_generation_status.switch_to("text")
524
+
525
+ model_inputs = super().prepare_inputs_for_generation(
526
+ input_ids=input_ids,
527
+ visual_ids=visual_ids,
528
+ audio_ids=audio_ids,
529
+ audio_text_ids=audio_text_ids,
530
+ attention_mask=attention_mask,
531
+ cache_position=cache_position,
532
+ **kwargs,
533
+ )
534
+
535
+ if model_inputs["cache_position"][0] != 0:
536
+ model_inputs["visual_inputs"] = None
537
+ model_inputs["audio_inputs"] = None
538
+
539
+ return model_inputs, multimodal_generation_status, extra_new_tokens
540
+
541
+ def _sample(
542
+ self,
543
+ input_ids: torch.LongTensor,
544
+ logits_processor: LogitsProcessorList,
545
+ stopping_criteria: StoppingCriteriaList,
546
+ generation_config: GenerationConfig,
547
+ synced_gpus: bool = False,
548
+ streamer: Optional["BaseStreamer"] = None,
549
+ visual_ids=None,
550
+ audio_ids=None,
551
+ audio_text_ids=None,
552
+ **model_kwargs,
553
+ ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
554
+ r"""
555
+ Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
556
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
557
+
558
+ Parameters:
559
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
560
+ The sequence used as a prompt for the generation.
561
+ logits_processor (`LogitsProcessorList`):
562
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
563
+ used to modify the prediction scores of the language modeling head applied at each generation step.
564
+ stopping_criteria (`StoppingCriteriaList`):
565
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
566
+ used to tell if the generation loop should stop.
567
+ generation_config ([`~generation.GenerationConfig`]):
568
+ The generation configuration to be used as parametrization of the decoding method.
569
+ synced_gpus (`bool`):
570
+ Whether to continue running the while loop until max_length (needed to avoid deadlocking with
571
+ `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
572
+ streamer (`BaseStreamer`, *optional*):
573
+ Streamer object that will be used to stream the generated sequences. Generated tokens are passed
574
+ through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
575
+ model_kwargs:
576
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
577
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
578
+
579
+ Return:
580
+ [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
581
+ A `torch.LongTensor` containing the generated tokens (default behaviour) or a
582
+ [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
583
+ `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
584
+ `model.config.is_encoder_decoder=True`.
585
+ """
586
+ # init values
587
+ pad_token_id = generation_config._pad_token_tensor
588
+ output_attentions = generation_config.output_attentions
589
+ output_hidden_states = generation_config.output_hidden_states
590
+ output_scores = generation_config.output_scores
591
+ output_logits = generation_config.output_logits
592
+ return_dict_in_generate = generation_config.return_dict_in_generate
593
+ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
594
+ do_sample = generation_config.do_sample
595
+
596
+ # init attention / hidden states / scores tuples
597
+ scores = () if (return_dict_in_generate and output_scores) else None
598
+ raw_logits = () if (return_dict_in_generate and output_logits) else None
599
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
600
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
601
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
602
+
603
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
604
+ if return_dict_in_generate and self.config.is_encoder_decoder:
605
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
606
+ encoder_hidden_states = (
607
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
608
+ )
609
+
610
+ # keep track of which sequences are already finished
611
+ batch_size, cur_len = input_ids.shape[:2]
612
+ this_peer_finished = False
613
+ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
614
+ model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
615
+
616
+ model_forward = self.__call__
617
+ compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
618
+ if compile_forward:
619
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
620
+ # If we use FA2 and a static cache, we cannot compile with fullgraph
621
+ if self.config._attn_implementation == "flash_attention_2":
622
+ # only raise warning if the user passed an explicit compile-config
623
+ if generation_config.compile_config is not None and generation_config.compile_config.fullgraph:
624
+ logger.warning_once(
625
+ "When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
626
+ "FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
627
+ )
628
+ generation_config.compile_config.fullgraph = False
629
+ model_forward = self.get_compiled_call(generation_config.compile_config)
630
+
631
+ if generation_config.prefill_chunk_size is not None:
632
+ model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
633
+ is_prefill = False
634
+ else:
635
+ is_prefill = True
636
+
637
+ visual_generation_config = GenerationConfig(**generation_config.visual_generation_config)
638
+ audio_generation_config = GenerationConfig(**generation_config.audio_generation_config)
639
+ multimodal_generation_status = LongcatNextForCausalLMGenerationStatus(visual_generation_config, audio_generation_config)
640
+
641
+ pbar = tqdm(iter(int, 1), desc="Generating", unit="tok")
642
+ while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
643
+ # prepare model inputs
644
+ model_inputs, multimodal_generation_status, extra_new_tokens = self.prepare_inputs_for_generation(
645
+ input_ids,
646
+ visual_ids,
647
+ audio_ids,
648
+ audio_text_ids,
649
+ multimodal_generation_status,
650
+ generation_config,
651
+ **model_kwargs,
652
+ )
653
+ if extra_new_tokens.size(1) > 0:
654
+ input_ids = torch.cat([input_ids[:, :-1], extra_new_tokens, input_ids[:, -1:]], dim=1)
655
+ model_kwargs["attention_mask"] = model_inputs["attention_mask"]
656
+ model_kwargs["cache_position"] = model_inputs["cache_position"]
657
+
658
+ if multimodal_generation_status.mode == "text" and multimodal_generation_status.last_step_mode == "visual":
659
+ next_tokens = generation_config._eos_token_tensor
660
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
661
+ if streamer is not None:
662
+ streamer.put(next_tokens.cpu())
663
+ break
664
+
665
+ visual_ids = model_inputs["visual_ids"]
666
+ audio_ids = model_inputs["audio_ids"]
667
+ audio_text_ids = model_inputs["audio_text_ids"]
668
+
669
+ if is_prefill:
670
+ outputs = self(**model_inputs, return_dict=True, multimodal_generation_status=multimodal_generation_status, visual_generation_config=visual_generation_config, audio_generation_config=audio_generation_config)
671
+ is_prefill = False
672
+ else:
673
+ outputs = model_forward(**model_inputs, return_dict=True, multimodal_generation_status=multimodal_generation_status, visual_generation_config=visual_generation_config, audio_generation_config=audio_generation_config)
674
+
675
+ # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
676
+ model_kwargs = self._update_model_kwargs_for_generation(
677
+ outputs,
678
+ model_kwargs,
679
+ is_encoder_decoder=self.config.is_encoder_decoder,
680
+ num_new_tokens=1,
681
+ )
682
+ if synced_gpus and this_peer_finished:
683
+ continue
684
+
685
+
686
+ # multimodal generation
687
+ if multimodal_generation_status.mode == "text" or \
688
+ (multimodal_generation_status.mode == "audio" and not multimodal_generation_status.is_audio_text_end):
689
+ # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
690
+ # (the clone itself is always small)
691
+ next_token_logits = outputs.logits[:, -1, :].to(copy=True, dtype=torch.float32, device=input_ids.device)
692
+
693
+ # pre-process distribution
694
+ next_token_scores = logits_processor(input_ids, next_token_logits)
695
+
696
+ # Store scores, attentions and hidden_states when required
697
+ if return_dict_in_generate:
698
+ if output_scores:
699
+ scores += (next_token_scores,)
700
+ if output_logits:
701
+ raw_logits += (next_token_logits,)
702
+ if output_attentions:
703
+ decoder_attentions += (
704
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
705
+ )
706
+ if self.config.is_encoder_decoder:
707
+ cross_attentions += (outputs.cross_attentions,)
708
+
709
+ if output_hidden_states:
710
+ decoder_hidden_states += (
711
+ (outputs.decoder_hidden_states,)
712
+ if self.config.is_encoder_decoder
713
+ else (outputs.hidden_states,)
714
+ )
715
+
716
+ # token selection
717
+ if do_sample:
718
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
719
+ # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
720
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
721
+ else:
722
+ next_tokens = torch.argmax(next_token_scores, dim=-1)
723
+
724
+ # audio_text_ids done
725
+ if multimodal_generation_status.mode == "audio" and (next_tokens == self.config.audio_config.audiotext_pad_token_id).all():
726
+ multimodal_generation_status.is_audio_text_end = True
727
+
728
+ elif multimodal_generation_status.mode == "visual":
729
+ if multimodal_generation_status.is_img_end:
730
+ next_tokens = self.model.image_end_token_id.to(input_ids.device)
731
+
732
+ elif multimodal_generation_status.is_img_newline:
733
+ next_tokens = self.model.image_newline_token_id.to(input_ids.device)
734
+
735
+ else:
736
+ visual_ids = torch.cat([visual_ids, outputs.visual_ids], dim=0) # [seq, lev]
737
+ next_tokens = self.model.image_pad_token_id.to(input_ids.device)
738
+
739
+ else: # mode == "audio" and multimodal_generation_status.is_audio_text_end
740
+ next_tokens = self.model.audio_pad_token_id.to(input_ids.device)
741
+
742
+
743
+ if multimodal_generation_status.mode == "audio":
744
+ # audio_text_ids update
745
+ audio_text_next_tokens = self.model.audiotext_pad_token_id.to(input_ids.device)
746
+ if not multimodal_generation_status.is_audio_text_end:
747
+ audio_text_next_tokens, next_tokens = next_tokens, audio_text_next_tokens
748
+ audio_text_ids = torch.cat((audio_text_ids, audio_text_next_tokens[:, None]), dim=1)
749
+
750
+ # audio_ids update
751
+ if multimodal_generation_status.is_audio_start:
752
+ if outputs.audio_ids[-1, 0] == (self.model.audio_offset_vals[1]): # offset + (level_1_len)
753
+ next_tokens = self.model.audiogen_end_token_id.to(input_ids.device)
754
+ else:
755
+ next_tokens = self.model.audio_pad_token_id.to(input_ids.device)
756
+ audio_ids = torch.cat([audio_ids, outputs.audio_ids], dim=0)
757
+
758
+ elif (multimodal_generation_status.audio_parallel_decoding) or \
759
+ (not multimodal_generation_status.audio_parallel_decoding and multimodal_generation_status.is_audio_text_end):
760
+ next_tokens = self.model.audiotext_start_token_id.to(input_ids.device)
761
+
762
+
763
+ # finished sentences should have their next token be a padding token
764
+ if has_eos_stopping_criteria:
765
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
766
+
767
+ # update generated ids, model inputs, and length for next step
768
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
769
+
770
+ # TODO: streaming mm ids
771
+ if streamer is not None:
772
+ streamer.put(next_tokens.cpu())
773
+
774
+ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
775
+ this_peer_finished = unfinished_sequences.max() == 0
776
+ cur_len += 1
777
+
778
+ # This is needed to properly delete outputs.logits which may be very large for first iteration
779
+ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
780
+ del outputs
781
+
782
+ pbar.update(1)
783
+ pbar.set_postfix({
784
+ "recent_5toks": f"{input_ids[:, -5:].tolist()}",
785
+ })
786
+
787
+ pbar.close()
788
+
789
+ if streamer is not None:
790
+ streamer.end()
791
+
792
+ if return_dict_in_generate:
793
+ if self.config.is_encoder_decoder:
794
+ return LongcatNextForCausalLMGenerateEncoderDecoderOutput(
795
+ sequences=input_ids,
796
+ scores=scores,
797
+ logits=raw_logits,
798
+ encoder_attentions=encoder_attentions,
799
+ encoder_hidden_states=encoder_hidden_states,
800
+ decoder_attentions=decoder_attentions,
801
+ cross_attentions=cross_attentions,
802
+ decoder_hidden_states=decoder_hidden_states,
803
+ past_key_values=model_kwargs.get("past_key_values"),
804
+ visual_ids=visual_ids,
805
+ audio_ids=audio_ids,
806
+ audio_text_ids=audio_text_ids,
807
+ )
808
+ else:
809
+ return LongcatNextForCausalLMGenerateDecoderOnlyOutput(
810
+ sequences=input_ids,
811
+ scores=scores,
812
+ logits=raw_logits,
813
+ attentions=decoder_attentions,
814
+ hidden_states=decoder_hidden_states,
815
+ past_key_values=model_kwargs.get("past_key_values"),
816
+ visual_ids=visual_ids,
817
+ audio_ids=audio_ids,
818
+ audio_text_ids=audio_text_ids,
819
+ )
820
+ else:
821
+ return input_ids, visual_ids, audio_ids, audio_text_ids
822
+
823
+
824
+ __all__ = ["LongcatNextModel", "LongcatNextForCausalLM"]
modeling_longcat_ngram.py ADDED
@@ -0,0 +1,426 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2026 Meituan
3
+ # This code is licensed under the MIT License, for details, see the ./LICENSE file.
4
+
5
+ from typing import Optional, Tuple, Dict, List
6
+
7
+ import torch
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+
11
+ from transformers.cache_utils import Cache, DynamicCache
12
+ from transformers.masking_utils import create_causal_mask
13
+ from transformers.modeling_outputs import BaseModelOutputWithPast
14
+ from transformers.processing_utils import Unpack
15
+ from transformers.utils import auto_docstring, logging
16
+ from transformers.models.longcat_flash.modeling_longcat_flash import (
17
+ LongcatFlashForCausalLM,
18
+ LongcatFlashModel,
19
+ LongcatFlashRMSNorm,
20
+ LongcatFlashRotaryEmbedding,
21
+ LongcatFlashDecoderLayer,
22
+ LongcatFlashPreTrainedModel,
23
+ )
24
+ from .configuration_longcat_ngram import LongcatFlashNgramConfig
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
+ @auto_docstring
30
+ class LongcatFlashNgramPreTrainedModel(LongcatFlashPreTrainedModel):
31
+ pass
32
+
33
+
34
+ class NgramCache(DynamicCache):
35
+ """
36
+ Extended DynamicCache for storing N-gram context alongside KV cache.
37
+ """
38
+ def __init__(self, config=None):
39
+ super().__init__()
40
+ self.ngram_context = None
41
+ # Keep only n-1 tokens (minimum needed for N-gram computation)
42
+ self.max_context_len = config.emb_neighbor_num - 1
43
+ self.oe_ignored_token_ids = torch.tensor(config.oe_ignored_token_ids, dtype=torch.long)
44
+
45
+
46
+ def update_ngram_context(self, new_tokens: torch.Tensor) -> None:
47
+ """
48
+ Update N-gram context with window management.
49
+
50
+ Args:
51
+ new_tokens: New tokens to append, shape (batch_size, seq_len)
52
+ """
53
+ new_tokens = new_tokens.clone()
54
+ new_tokens[torch.isin(new_tokens, self.oe_ignored_token_ids.to(new_tokens.device))] = 0
55
+
56
+ if self.ngram_context is None:
57
+ self.ngram_context = new_tokens
58
+ else:
59
+ self.ngram_context = torch.cat([self.ngram_context, new_tokens], dim=-1)
60
+
61
+ # Truncate to maintain constant memory footprint
62
+ if self.ngram_context.size(-1) > self.max_context_len:
63
+ self.ngram_context = self.ngram_context[..., -self.max_context_len:]
64
+
65
+ def reorder_cache(self, beam_idx: torch.LongTensor) -> "Cache":
66
+ """Reorder cache for beam search."""
67
+ # Reorder parent's KV cache
68
+ super().reorder_cache(beam_idx)
69
+
70
+ # Reorder N-gram context
71
+ if self.ngram_context is not None:
72
+ self.ngram_context = self.ngram_context.index_select(0, beam_idx.to(self.ngram_context.device))
73
+
74
+ return self
75
+
76
+
77
+ class EmbeddingWithMask(nn.Embedding):
78
+ def forward(self, input: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
79
+ """
80
+ Args:
81
+ input (torch.Tensor): Input indices of shape (batch_size, seq_len)
82
+ mask (torch.Tensor): Boolean mask of shape (batch_size, seq_len).
83
+ True means compute, False means skip and return 0.
84
+ Returns:
85
+ torch.Tensor: Embeddings of shape (batch_size, seq_len, embedding_dim)
86
+ """
87
+ if mask is not None:
88
+ # Ensure mask is boolean
89
+ mask = mask.bool()
90
+ else:
91
+ mask = torch.ones_like(input, dtype=torch.bool)
92
+
93
+ batch_size, seq_len = input.shape
94
+ embedding_dim = self.embedding_dim
95
+
96
+ # 1. Initialize the output tensor with zeros on the correct device
97
+ output = torch.zeros(
98
+ (batch_size, seq_len, embedding_dim),
99
+ device=input.device,
100
+ dtype=self.weight.dtype
101
+ )
102
+
103
+ # 2. Filter out the valid indices using the mask
104
+ # valid_indices is a 1D tensor containing only the elements where mask is True
105
+ valid_indices = input[mask]
106
+
107
+ # 3. Only perform the embedding lookup if there is at least one valid index
108
+ if valid_indices.numel() > 0:
109
+ # Look up only the necessary embeddings (saves compute/memory bandwidth)
110
+ valid_embeddings = F.embedding(
111
+ valid_indices, self.weight, self.padding_idx, self.max_norm,
112
+ self.norm_type, self.scale_grad_by_freq, self.sparse)
113
+
114
+ # 4. Scatter the valid embeddings back to their original positions in the output tensor
115
+ output[mask] = valid_embeddings
116
+
117
+ return output
118
+
119
+
120
+ class NgramEmbedding(nn.Module):
121
+ """
122
+ Computes embeddings enriched with N-gram features without maintaining internal state.
123
+ """
124
+ def __init__(self, config, base_embeddings):
125
+ super().__init__()
126
+ self.config = config
127
+ self.word_embeddings = base_embeddings
128
+
129
+ # self.m = config.ngram_vocab_size_ratio * config.vocab_size
130
+ self.m = config.ngram_vocab_size_ratio * config.text_vocab_size
131
+ self.k = config.emb_split_num
132
+ self.n = config.emb_neighbor_num
133
+ self.oe_ignored_token_ids = torch.tensor(config.oe_ignored_token_ids)
134
+
135
+ self._init_ngram_embeddings()
136
+ self._vocab_mods_cache = None
137
+
138
+ def _init_ngram_embeddings(self) -> None:
139
+ """Initialize N-gram embedding and projection layers."""
140
+ num_embedders = self.k * (self.n - 1)
141
+ emb_dim = self.config.hidden_size // num_embedders
142
+
143
+ embedders = []
144
+ post_projs = []
145
+
146
+ for i in range(num_embedders):
147
+ vocab_size = int(self.m + i * 2 + 1)
148
+ emb = EmbeddingWithMask(vocab_size, emb_dim, padding_idx=self.config.pad_token_id)
149
+ proj = nn.Linear(emb_dim, self.config.hidden_size, bias=False)
150
+ embedders.append(emb)
151
+ post_projs.append(proj)
152
+
153
+ self.embedders = nn.ModuleList(embedders)
154
+ self.post_projs = nn.ModuleList(post_projs)
155
+
156
+ def _shift_right_ignore_eos(self, tensor: torch.Tensor, n: int, eos_token_id: int = 2) -> torch.Tensor:
157
+ p, q = tensor.shape
158
+ # special_token / modal positions are set to 0
159
+ special_tokens = 0
160
+
161
+ if n == 0:
162
+ return tensor.clone()
163
+
164
+ if n >= q:
165
+ return torch.zeros_like(tensor)
166
+
167
+ result = torch.zeros_like(tensor)
168
+
169
+ # Find all special_token/modal/EOS locations
170
+ special_mask = (tensor == special_tokens)
171
+ total_mask = (tensor == eos_token_id) | special_mask
172
+
173
+ # Calculate the segment ID to which each position belongs
174
+ eos_cumsum = total_mask.long().cumsum(dim=1)
175
+ # Shift right by 1, so that the first EOS position still belongs to segment 0, and the second EOS position belongs to segment 1
176
+ segment_ids = torch.cat([
177
+ torch.zeros(p, 1, dtype=torch.long, device=tensor.device),
178
+ eos_cumsum[:, :-1]
179
+ ], dim=1)
180
+
181
+ col_indices = torch.arange(q, device=tensor.device).unsqueeze(0).expand(p, q)
182
+ # Number of segments
183
+ max_segments = segment_ids.max().item() + 1
184
+ segment_starts = torch.full((p, max_segments), q, dtype=torch.long, device=tensor.device)
185
+ # Calculate the starting position of each segment
186
+ segment_starts.scatter_reduce_(1, segment_ids, col_indices, reduce='amin', include_self=False)
187
+
188
+ # Get the start position of the segment to which each position belongs
189
+ segment_start_per_pos = torch.gather(segment_starts, 1, segment_ids)
190
+
191
+ # Calculate the offset of each position within the segment
192
+ offset_in_segment = col_indices - segment_start_per_pos
193
+
194
+ # Data for each position should be taken from the position offset -n within the segment
195
+ source_offset = offset_in_segment - n
196
+ valid_mask = source_offset >= 0
197
+
198
+ # Calculate the actual source index
199
+ source_indices = segment_start_per_pos + torch.clamp(source_offset, min=0)
200
+
201
+ # Data is collected by source_indices
202
+ result = torch.gather(tensor, 1, source_indices)
203
+
204
+ # Set invalid position to zero
205
+ result = result * valid_mask * (~special_mask)
206
+
207
+ return result
208
+
209
+ def _precompute_vocab_mods(self) -> Dict[Tuple[int, int], List[int]]:
210
+ """Precompute modular arithmetic values for vocabulary."""
211
+ if self._vocab_mods_cache is not None:
212
+ return self._vocab_mods_cache
213
+
214
+ vocab_mods = {}
215
+ vocab_size = self.config.text_vocab_size
216
+
217
+ for i in range(2, self.n + 1):
218
+ for j in range(self.k):
219
+ index = (i - 2) * self.k + j
220
+ emb_vocab_dim = int(self.m + index * 2 + 1)
221
+
222
+ mods = []
223
+ power_mod = 1
224
+ for _ in range(i - 1):
225
+ power_mod = (power_mod * vocab_size) % emb_vocab_dim
226
+ mods.append(power_mod)
227
+
228
+ vocab_mods[(i, j)] = mods
229
+
230
+ self._vocab_mods_cache = vocab_mods
231
+ return vocab_mods
232
+
233
+ def _get_ngram_ids(
234
+ self,
235
+ input_ids: torch.Tensor,
236
+ shifted_ids: Dict[int, torch.Tensor],
237
+ vocab_mods: List[int],
238
+ ngram: int
239
+ ) -> torch.Tensor:
240
+ """Compute N-gram hash IDs using polynomial rolling hash."""
241
+ ngram_ids = input_ids.clone()
242
+ for k in range(2, ngram + 1):
243
+ ngram_ids = ngram_ids + shifted_ids[k] * vocab_mods[k - 2]
244
+ return ngram_ids
245
+
246
+ def forward(
247
+ self,
248
+ input_ids: torch.Tensor,
249
+ ngram_context: Optional[torch.Tensor] = None
250
+ ) -> torch.Tensor:
251
+ """
252
+ Stateless forward pass.
253
+
254
+ Args:
255
+ input_ids: Current input token IDs of shape (batch_size, seq_len)
256
+ ngram_context: Optional historical context of shape (batch_size, context_len)
257
+
258
+ Returns:
259
+ Embedding tensor of shape (batch_size, seq_len, hidden_size)
260
+ """
261
+ seq_len = input_ids.size(-1)
262
+
263
+ # Determine complete context
264
+ if ngram_context is not None:
265
+ context = torch.cat([ngram_context[..., -(self.n-1):], input_ids], dim=-1)
266
+ else:
267
+ context = input_ids.clone()
268
+
269
+ # Skip N-gram look-up for oe_ignored_token_ids
270
+ oe_ignored_mask = torch.isin(input_ids, self.oe_ignored_token_ids.to(device=input_ids.device))
271
+ context[torch.isin(context, self.oe_ignored_token_ids.to(device=context.device))] = 0
272
+
273
+ # Base word embeddings
274
+ device = self.word_embeddings.weight.device
275
+ x = self.word_embeddings(input_ids.to(device)).clone()
276
+
277
+ # Precompute modular values
278
+ vocab_mods = self._precompute_vocab_mods()
279
+
280
+ # Compute shifted IDs
281
+ shifted_ids = {}
282
+ for i in range(2, self.n + 1):
283
+ shifted_ids[i] = self._shift_right_ignore_eos(
284
+ context, i - 1, eos_token_id=self.config.eos_token_id
285
+ )
286
+
287
+ # Add N-gram embeddings
288
+ for i in range(2, self.n + 1):
289
+ for j in range(self.k):
290
+ index = (i - 2) * self.k + j
291
+ emb_vocab_dim = int(self.m + index * 2 + 1)
292
+
293
+ ngram_ids = self._get_ngram_ids(context, shifted_ids, vocab_mods[(i, j)], ngram=i)
294
+ new_ids = (ngram_ids % emb_vocab_dim)[..., -seq_len:]
295
+ text_mask = new_ids > 0
296
+
297
+ embedder_device = self.embedders[index].weight.device
298
+ x_ngram = self.embedders[index](new_ids.to(embedder_device), text_mask)
299
+
300
+ proj_device = self.post_projs[index].weight.device
301
+ x_proj = self.post_projs[index](x_ngram.to(proj_device))
302
+ x = x + x_proj.to(x.device)
303
+
304
+ # Normalize
305
+ x[~oe_ignored_mask] /= (1 + self.k * (self.n - 1))
306
+
307
+ return x
308
+
309
+
310
+ class LongcatFlashNgramModel(LongcatFlashModel):
311
+ """LongcatFlash model with N-gram enhanced embeddings."""
312
+ _keys_to_ignore_on_load_unexpected = [r"model\.mtp.*"]
313
+ config_class = LongcatFlashNgramConfig
314
+
315
+ def __init__(self, config):
316
+ super().__init__(config)
317
+
318
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
319
+ self.ngram_embeddings = NgramEmbedding(config, self.embed_tokens)
320
+
321
+ self.layers = nn.ModuleList(
322
+ [LongcatFlashDecoderLayer(config, layer_idx) for layer_idx in range(config.num_layers)]
323
+ )
324
+
325
+ self.head_dim = config.head_dim
326
+ self.config.num_hidden_layers = 2 * config.num_layers
327
+ self.norm = LongcatFlashRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
328
+ self.rotary_emb = LongcatFlashRotaryEmbedding(config=config)
329
+ self.gradient_checkpointing = False
330
+
331
+ self.post_init()
332
+
333
+ def forward(
334
+ self,
335
+ input_ids: Optional[torch.LongTensor] = None,
336
+ attention_mask: Optional[torch.Tensor] = None,
337
+ position_ids: Optional[torch.LongTensor] = None,
338
+ past_key_values: Optional[Cache] = None,
339
+ inputs_embeds: Optional[torch.FloatTensor] = None,
340
+ cache_position: Optional[torch.LongTensor] = None,
341
+ use_cache: Optional[bool] = None,
342
+ **kwargs
343
+ ) -> BaseModelOutputWithPast:
344
+ if (input_ids is None) ^ (inputs_embeds is not None):
345
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
346
+
347
+ # Extract N-gram context if available
348
+ ngram_context = None
349
+ if isinstance(past_key_values, NgramCache) and past_key_values.ngram_context is not None:
350
+ ngram_context = past_key_values.ngram_context
351
+
352
+ if inputs_embeds is None:
353
+ inputs_embeds = self.ngram_embeddings(input_ids, ngram_context=ngram_context)
354
+
355
+ # Initialize NgramCache if needed
356
+ if use_cache and past_key_values is None:
357
+ past_key_values = NgramCache(config=self.config)
358
+
359
+ # Update N-gram context
360
+ if use_cache and isinstance(past_key_values, NgramCache) and input_ids is not None:
361
+ past_key_values.update_ngram_context(input_ids)
362
+
363
+ # Prepare cache position
364
+ if cache_position is None:
365
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
366
+ cache_position = torch.arange(
367
+ inputs_embeds.shape[1], device=inputs_embeds.device
368
+ ) + past_seen_tokens
369
+
370
+ if position_ids is None:
371
+ position_ids = cache_position.unsqueeze(0)
372
+
373
+ # Create causal mask
374
+ causal_mask = create_causal_mask(
375
+ config=self.config,
376
+ input_embeds=inputs_embeds,
377
+ attention_mask=attention_mask,
378
+ cache_position=cache_position,
379
+ past_key_values=past_key_values,
380
+ position_ids=position_ids,
381
+ )
382
+
383
+ # Forward through decoder layers
384
+ hidden_states = inputs_embeds
385
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
386
+
387
+ for decoder_layer in self.layers[: self.config.num_layers]:
388
+ hidden_states = decoder_layer(
389
+ hidden_states,
390
+ attention_mask=causal_mask,
391
+ position_ids=position_ids,
392
+ past_key_values=past_key_values,
393
+ cache_position=cache_position,
394
+ position_embeddings=position_embeddings,
395
+ **kwargs,
396
+ )
397
+
398
+ hidden_states = self.norm(hidden_states)
399
+
400
+ return BaseModelOutputWithPast(
401
+ last_hidden_state=hidden_states,
402
+ past_key_values=past_key_values,
403
+ hidden_states=None,
404
+ attentions=None,
405
+ )
406
+
407
+
408
+ class LongcatFlashNgramForCausalLM(LongcatFlashForCausalLM):
409
+ """LongcatFlash model for causal language modeling with N-gram embeddings."""
410
+ _keys_to_ignore_on_load_unexpected = [r"model\.mtp.*"]
411
+ config_class = LongcatFlashNgramConfig
412
+
413
+ def __init__(self, config):
414
+ super().__init__(config)
415
+ self.model = LongcatFlashNgramModel(config)
416
+
417
+ @torch.no_grad()
418
+ def generate(self, inputs=None, generation_config=None, **kwargs):
419
+ """Override to ensure NgramCache is used."""
420
+
421
+ if "past_key_values" not in kwargs or kwargs["past_key_values"] is None:
422
+ kwargs["past_key_values"] = NgramCache(config=self.config)
423
+
424
+ return super().generate(inputs=inputs, generation_config=generation_config, **kwargs)
425
+
426
+ __all__ = ["LongcatFlashNgramPreTrainedModel", "LongcatFlashNgramModel", "LongcatFlashNgramForCausalLM"]
modular_longcat_next.py ADDED
@@ -0,0 +1,157 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+ from flash_attn import flash_attn_varlen_func
6
+
7
+ from transformers.models.t5.modeling_t5 import T5LayerNorm as RMSNorm
8
+
9
+
10
+ class FlashVarLenAttention(nn.Module):
11
+ def __init__(self, embed_dim, num_heads, causal=False, window_size=(-1,-1)):
12
+ super().__init__()
13
+ self.embed_dim = embed_dim
14
+ self.num_heads = num_heads
15
+ self.head_dim = embed_dim // num_heads
16
+
17
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
18
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
19
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
20
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
21
+
22
+ self.causal = causal
23
+ self.window_size = window_size
24
+
25
+ def forward(self, hidden_states: torch.Tensor, seq_len: torch.Tensor):
26
+ bsz, _ = hidden_states.size()
27
+
28
+ query_states = self.q_proj(hidden_states)
29
+ query_states = query_states.view(bsz, self.num_heads, self.head_dim).contiguous()
30
+ key_states = self.k_proj(hidden_states)
31
+ key_states = key_states.view(bsz, self.num_heads, self.head_dim).contiguous()
32
+ value_states = self.v_proj(hidden_states)
33
+ value_states = value_states.view(bsz, self.num_heads, self.head_dim).contiguous()
34
+
35
+ cu_len = F.pad(torch.cumsum(seq_len, dim=0), (1, 0), "constant", 0).to(torch.int32)
36
+ max_seqlen = torch.max(seq_len).to(torch.int32).detach()
37
+
38
+ attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_len, cu_len, max_seqlen,
39
+ max_seqlen, causal=self.causal, window_size=self.window_size) # (bsz * qlen, nheads, headdim)
40
+ attn_output = attn_output.reshape(bsz, self.embed_dim)
41
+ attn_output = self.out_proj(attn_output)
42
+ return attn_output
43
+
44
+
45
+
46
+ class CasualDepthTransformerLayer(nn.Module):
47
+ def __init__(self, depth, transformer_dim, transformer_ffn_scale):
48
+ super().__init__()
49
+ self.depth = depth
50
+ self.transformer_dim = transformer_dim
51
+ self.transformer_ffn_scale = transformer_ffn_scale
52
+ self.num_heads = self.transformer_dim // 128
53
+
54
+ assert self.transformer_dim % 128 == 0
55
+ assert self.transformer_dim % depth == 0
56
+
57
+ self.self_attention = FlashVarLenAttention(embed_dim=self.transformer_dim, num_heads=self.num_heads, causal=True)
58
+
59
+ self.layernorm1 = RMSNorm(self.transformer_dim)
60
+ self.layernorm2 = RMSNorm(self.transformer_dim)
61
+
62
+ self.linear1 = nn.Linear(self.transformer_dim, self.transformer_ffn_scale * self.transformer_dim)
63
+ self.linear2 = nn.Linear(self.transformer_ffn_scale * self.transformer_dim, self.transformer_dim)
64
+
65
+ def forward(self, x):
66
+ bsz = x.shape[0]
67
+ res = x
68
+ x = self.layernorm1(x)
69
+ seqlens = torch.tensor([self.depth] * bsz, dtype=torch.int32, device=x.device)
70
+ _x = self.self_attention(x.view(-1, self.transformer_dim), seqlens)
71
+ _x = _x.view(bsz, self.depth, self.transformer_dim).contiguous()
72
+
73
+ _res = _x + res # (bs, sl, d)
74
+ res = self.layernorm2(_res)
75
+ x = torch.einsum('bld,tld->blt', res, torch.reshape(self.linear1.weight, (self.transformer_ffn_scale * self.transformer_dim // self.depth, self.depth, self.transformer_dim)))
76
+ x = torch.nn.functional.gelu(x)
77
+ x = torch.einsum('blt,dlt->bld',x, torch.reshape(self.linear2.weight, (self.transformer_dim, self.depth, self.transformer_ffn_scale * self.transformer_dim // self.depth)))
78
+ return _res + x
79
+
80
+
81
+ class CasualDepthTransformerHead(nn.Module):
82
+ """
83
+ Depth-wise causal transformer head shared by image/audio heads.
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ hidden_size,
89
+ codebook_sizes,
90
+ transformer_layer_num,
91
+ transformer_dim,
92
+ transformer_ffn_scale,
93
+ gradient_checkpointing=False,
94
+ ):
95
+ super().__init__()
96
+ self.hidden_size = hidden_size
97
+ self.codebook_sizes = codebook_sizes
98
+ self.transformer_ffn_scale = transformer_ffn_scale
99
+ self.gradient_checkpointing = gradient_checkpointing
100
+
101
+ if self.transformer_ffn_scale > 0:
102
+ self.hidden_norm = RMSNorm(self.hidden_size)
103
+ self.hidden_proj = nn.Linear(self.hidden_size, transformer_dim, bias=False)
104
+
105
+ self.transformer_layers = nn.ModuleList(
106
+ [
107
+ CasualDepthTransformerLayer(len(codebook_sizes), transformer_dim, transformer_ffn_scale)
108
+ for _ in range(transformer_layer_num)
109
+ ]
110
+ )
111
+ self.headnorm = RMSNorm(transformer_dim)
112
+ self.heads = nn.ModuleList(
113
+ [nn.Linear(transformer_dim, vq_size + 1) for vq_size in codebook_sizes]
114
+ )
115
+
116
+ for param in self.parameters():
117
+ param.requires_grad = False
118
+
119
+ def forward(self, x, visual_tokens, visual_emb_layers, level):
120
+ main_device = "cuda:0"
121
+ visual_tokens = visual_tokens.to(main_device)
122
+ visual_emb_layers = visual_emb_layers.to(main_device)
123
+
124
+ cumsum_visual_embed = torch.stack([
125
+ visual_emb_layers(visual_tokens[..., i])
126
+ for i, vq_size in enumerate(self.codebook_sizes[:-1])
127
+ ], dim=1).to(x.device)
128
+
129
+ cumsum_visual_embed = torch.cumsum(cumsum_visual_embed, dim=1) # (bs, depth-1, d)
130
+
131
+ hidden_states = torch.concat([x.reshape(-1, 1, self.hidden_size), cumsum_visual_embed], dim=1) # (bs, depth, d)
132
+ assert hidden_states.size(1) == len(self.codebook_sizes)
133
+
134
+ if self.transformer_ffn_scale > 0:
135
+ hidden_states = self.hidden_norm(hidden_states)
136
+ hidden_states = self.hidden_proj(hidden_states)
137
+
138
+ for i, tlayer in enumerate(self.transformer_layers):
139
+ if self.gradient_checkpointing and self.training:
140
+
141
+ def create_custom_forward(module):
142
+ def custom_forward(*inputs):
143
+ # None for past_key_value
144
+ return module(*inputs)
145
+
146
+ return custom_forward
147
+
148
+ hidden_states = torch.utils.checkpoint.checkpoint(
149
+ create_custom_forward(tlayer), hidden_states,
150
+ )
151
+ else:
152
+ hidden_states = tlayer(
153
+ hidden_states,
154
+ )
155
+ hidden_states = self.headnorm(hidden_states)
156
+ logits = self.heads[level](hidden_states[:, level])
157
+ return logits
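The head above is not called once per step but once per codebook level: `LongcatNextForCausalLM.get_multimodal_logits_and_ids` decodes level 0 from the LM hidden state alone, then feeds the embeddings of the already-decoded levels back in for the next level. Below is a shape-level sketch of that loop with a stand-in head so it runs on its own; the real head, embedding table, and offset values come from the model.

```python
import torch

hidden_size, codebook_sizes = 8, [16, 16, 16]       # toy sizes
bs = 1

def stub_head(hidden_states, decoded_ids, embed, level):
    # Stand-in for CasualDepthTransformerHead.forward: per-level logits of
    # size codebook_sizes[level] + 1, matching what the real heads produce.
    return torch.randn(bs, codebook_sizes[level] + 1)

embed = torch.nn.Embedding(sum(s + 1 for s in codebook_sizes), hidden_size)
lm_hidden = torch.randn(bs, 1, hidden_size)          # last LM hidden state

decoded = torch.zeros(bs, len(codebook_sizes), dtype=torch.long)
offset = 0
for level, size in enumerate(codebook_sizes):
    logits = stub_head(lm_hidden, decoded, embed, level)   # (bs, size + 1)
    token = logits.argmax(dim=-1)                          # greedy, for the sketch
    decoded[:, level] = token + offset                     # per-level offset, as in the model
    offset += size + 1

print(decoded)   # one multi-level code per generated image/audio position
```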
modular_longcat_next_audio.py ADDED
@@ -0,0 +1,2039 @@
1
+ import math
2
+ import copy
3
+ from abc import ABC
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, Optional
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+ from einops import pack, rearrange, repeat
11
+ from flash_attn import flash_attn_varlen_func
12
+ from torch import nn
13
+ from torch.cuda.amp import autocast
14
+ from torch.nn import functional as F
15
+
16
+ from diffusers.models.activations import get_activation
17
+ from diffusers.models.attention import (
18
+ GEGLU,
19
+ GELU,
20
+ AdaLayerNorm,
21
+ AdaLayerNormZero,
22
+ ApproximateGELU,
23
+ )
24
+ from diffusers.models.attention_processor import Attention
25
+ from diffusers.models.lora import LoRACompatibleLinear
26
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.modeling_outputs import ModelOutput
30
+ from transformers.utils import logging
31
+
32
+ from .cosy24k_vocoder import Cosy24kVocoder
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ def sinusoids(length, channels, max_timescale=10000):
38
+ """Returns sinusoids for positional embedding"""
39
+ assert channels % 2 == 0
40
+ log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
41
+ inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
42
+ scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
43
+ return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
44
+
45
+
46
+ def get_sequence_mask(inputs, inputs_length):
47
+ if inputs.dim() == 3:
48
+ bsz, tgt_len, _ = inputs.size()
49
+ else:
50
+ bsz, tgt_len = inputs_length.shape[0], torch.max(inputs_length)
51
+ sequence_mask = torch.arange(0, tgt_len).to(inputs.device)
52
+ sequence_mask = torch.lt(sequence_mask, inputs_length.reshape(bsz, 1)).view(bsz, tgt_len, 1)
53
+ unpacking_index = torch.cumsum(sequence_mask.to(torch.int64).view(-1), dim=0) - 1 # convert the mask to gather indices
54
+ return sequence_mask, unpacking_index
55
+
56
+
57
+ def unpack_hidden_states(hidden_states, lengths):
58
+ bsz = lengths.shape[0]
59
+ sequence_mask, unpacking_index = get_sequence_mask(hidden_states, lengths)
60
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
61
+ bsz, torch.max(lengths), hidden_states.shape[-1]
62
+ )
63
+ hidden_states = torch.where(
64
+ sequence_mask, hidden_states, 0
65
+ ) # 3d (bsz, max_input_len, d)
66
+ return hidden_states
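# For example, with the two helpers above: two sequences of lengths 3 and 1,
# packed row-wise into a (4, d) tensor, come back zero-padded to (2, 3, d):
#
#   lengths = torch.tensor([3, 1])
#   packed = torch.arange(8, dtype=torch.float32).view(4, 2)   # (sum(lengths), d)
#   padded = unpack_hidden_states(packed, lengths)             # (2, 3, 2)
#   # padded[0] equals packed[0:3]; padded[1, 0] equals packed[3]; padded[1, 1:] is 0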
67
+
68
+
69
+ def uniform_init(*shape):
70
+ t = torch.zeros(shape)
71
+ nn.init.kaiming_uniform_(t)
72
+ return t
73
+
74
+
75
+ def cdist(x, y):
76
+ x2 = torch.sum(x ** 2, dim=-1, keepdims=True) # (b, 1)
77
+ y2 = torch.sum(y ** 2, dim=-1).reshape(1, -1) # (1, c)
78
+ xy = torch.einsum('bd,cd->bc', x, y) * -2
79
+ return (x2 + y2 + xy).clamp(min=0).sqrt() # (b, c)
80
+
81
+
82
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
83
+ assert mask.dtype == torch.bool
84
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
85
+ mask = mask.to(dtype)
86
+ # attention mask bias
87
+ # NOTE(Mddct): torch.finfo jit issues
88
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
89
+ mask = (1.0 - mask) * torch.finfo(dtype).min
90
+ return mask
91
+
92
+
93
+ def subsequent_chunk_mask(
94
+ size: int,
95
+ chunk_size: int,
96
+ num_left_chunks: int = -1,
97
+ device: torch.device = torch.device("cpu"),
98
+ ) -> torch.Tensor:
99
+ """Create mask for subsequent steps (size, size) with chunk size,
100
+ this is for streaming encoder
101
+
102
+ Args:
103
+ size (int): size of mask
104
+ chunk_size (int): size of chunk
105
+ num_left_chunks (int): number of left chunks
106
+ <0: use full chunk
107
+ >=0: use num_left_chunks
108
+ device (torch.device): "cpu" or "cuda" or torch.Tensor.device
109
+
110
+ Returns:
111
+ torch.Tensor: mask
112
+
113
+ Examples:
114
+ >>> subsequent_chunk_mask(4, 2)
115
+ [[1, 1, 0, 0],
116
+ [1, 1, 0, 0],
117
+ [1, 1, 1, 1],
118
+ [1, 1, 1, 1]]
119
+ """
120
+ # NOTE this modified implementation meets onnx export requirements, but it doesn't support num_left_chunks
121
+ # actually this is not needed after we have inference cache implemented, will remove it later
122
+ pos_idx = torch.arange(size, device=device)
123
+ block_value = (torch.div(pos_idx, chunk_size, rounding_mode='trunc') + 1) * chunk_size
124
+ ret = pos_idx.unsqueeze(0) < block_value.unsqueeze(1)
125
+ return ret
126
+
127
+
128
+ def add_optional_chunk_mask(xs: torch.Tensor,
129
+ masks: torch.Tensor,
130
+ use_dynamic_chunk: bool,
131
+ use_dynamic_left_chunk: bool,
132
+ decoding_chunk_size: int,
133
+ static_chunk_size: int,
134
+ num_decoding_left_chunks: int,
135
+ enable_full_context: bool = True):
136
+ """ Apply optional mask for encoder.
137
+
138
+ Args:
139
+ xs (torch.Tensor): padded input, (B, L, D), L for max length
140
+ mask (torch.Tensor): mask for xs, (B, 1, L)
141
+ use_dynamic_chunk (bool): whether to use dynamic chunk or not
142
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
143
+ training.
144
+ decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
145
+ 0: default for training, use random dynamic chunk.
146
+ <0: for decoding, use full chunk.
147
+ >0: for decoding, use fixed chunk size as set.
148
+ static_chunk_size (int): chunk size for static chunk training/decoding
149
+ if it's greater than 0, if use_dynamic_chunk is true,
150
+ this parameter will be ignored
151
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
152
+ the chunk size is decoding_chunk_size.
153
+ >=0: use num_decoding_left_chunks
154
+ <0: use all left chunks
155
+ enable_full_context (bool):
156
+ True: chunk size is either [1, 25] or full context(max_len)
157
+ False: chunk size ~ U[1, 25]
158
+
159
+ Returns:
160
+ torch.Tensor: chunk mask of the input xs.
161
+ """
162
+ # Whether to use chunk mask or not
163
+ if use_dynamic_chunk:
164
+ max_len = xs.size(1)
165
+ if decoding_chunk_size < 0:
166
+ chunk_size = max_len
167
+ num_left_chunks = -1
168
+ elif decoding_chunk_size > 0:
169
+ chunk_size = decoding_chunk_size
170
+ num_left_chunks = num_decoding_left_chunks
171
+ else:
172
+ # chunk size is either [1, 25] or full context(max_len).
173
+ # Since we use 4 times subsampling and allow up to 1s(100 frames)
174
+ # delay, the maximum frame is 100 / 4 = 25.
175
+ chunk_size = torch.randint(1, max_len, (1, )).item()
176
+ num_left_chunks = -1
177
+ if chunk_size > max_len // 2 and enable_full_context:
178
+ chunk_size = max_len
179
+ else:
180
+ chunk_size = chunk_size % 25 + 1
181
+ if use_dynamic_left_chunk:
182
+ max_left_chunks = (max_len - 1) // chunk_size
183
+ num_left_chunks = torch.randint(0, max_left_chunks,
184
+ (1, )).item()
185
+ chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
186
+ num_left_chunks,
187
+ xs.device) # (L, L)
188
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
189
+ chunk_masks = masks & chunk_masks # (B, L, L)
190
+ elif static_chunk_size > 0:
191
+ num_left_chunks = num_decoding_left_chunks
192
+ chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
193
+ num_left_chunks,
194
+ xs.device) # (L, L)
195
+ chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L)
196
+ chunk_masks = masks & chunk_masks # (B, L, L)
197
+ else:
198
+ chunk_masks = masks
199
+ return chunk_masks
200
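+ # Note: in this file the ConditionalDecoder below always calls add_optional_chunk_mask
+ # with use_dynamic_chunk=False and a positive static_chunk_size, so only the
+ # static-chunk branch is exercised; the dynamic-chunk branches are kept from the
+ # upstream streaming-encoder implementation.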
+
201
+
202
+ class EuclideanCodebook(nn.Module):
203
+ def __init__(
204
+ self,
205
+ dim,
206
+ codebook_size,
207
+ init_std=0.02,
208
+ ):
209
+ super().__init__()
210
+ self.init_std = init_std
211
+ self.dim = dim
212
+ self.codebook_size = codebook_size
213
+
214
+ embed = uniform_init(codebook_size, dim).to(torch.float32)
215
+ self.cluster_size = nn.Parameter(torch.ones(codebook_size))
216
+ self.embed_avg = nn.Parameter(embed.clone())
217
+ self.embed = nn.Parameter(embed)
218
+ del embed
219
+
220
+ @autocast(enabled=True, dtype=torch.float32)
221
+ @torch.no_grad()
222
+ def forward(self, x):
223
+ assert(len(x.shape) == 2)
224
+ assert(x.dtype == torch.float32)
225
+ embed = self.embed.detach().to(x.device)
226
+ dist = -cdist(x, embed) # dist((bs*sl, d), (c, d)) --> (bs*sl, c)
227
+ embed_ind = dist.argmax(dim=-1)
228
+ quantize = embed[embed_ind] # (bs*sl, d)
229
+ return quantize, embed_ind, dist
230
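+ # The lookup above is a plain nearest-neighbour search: dist holds negative Euclidean
+ # distances, so argmax selects the closest code. Illustrative usage (hypothetical
+ # sizes):
+ #   cb = EuclideanCodebook(dim=256, codebook_size=1024)
+ #   q, ids, _ = cb(torch.randn(10, 256))   # q: (10, 256), ids: (10,)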
+
231
+
232
+ class VectorQuantize(nn.Module):
233
+ def __init__(self, config, *args, **kwargs):
234
+ super().__init__(*args, **kwargs)
235
+ self.config = config
236
+ self.codebook = EuclideanCodebook(dim=config.dim, codebook_size=config.codebook_size)
237
+
238
+ def forward(self, x, input_length):
239
+ batch_size, seq_len, _ = x.shape
240
+ mask, unpacking_index = get_sequence_mask(x, input_length)
241
+ if x.dtype != torch.float32:
242
+ x = x.to(torch.float32)
243
+ x = torch.masked_select(x, mask).reshape(-1, self.config.dim) # (bs*sl?, d)
244
+ quantize, embed_ind, _ = self.codebook(x)
245
+ quantize = torch.index_select(quantize, 0, unpacking_index).view(batch_size, seq_len, self.config.dim)
246
+ quantize = torch.where(mask, quantize, 0)
247
+ embed_ind = torch.index_select(embed_ind.reshape(-1, 1), 0, unpacking_index).view(batch_size, seq_len, 1)
248
+ embed_ind = torch.where(mask, embed_ind, -1).squeeze()
249
+ return quantize, embed_ind
250
+
251
+ def get_output_from_indices(self, indices):
252
+ indices = indices.to(self.codebook.embed.device)
253
+ return self.codebook.embed[indices]
254
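+ # VectorQuantize packs the valid frames (via get_sequence_mask) before the codebook
+ # lookup and unpacks afterwards, so padded positions come back as all-zero vectors
+ # with code index -1.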
+
255
+
256
+ class SnakeBeta(nn.Module):
257
+ """
258
+ A modified Snake function which uses separate parameters for the magnitude of the periodic components
259
+ Shape:
260
+ - Input: (B, C, T)
261
+ - Output: (B, C, T), same shape as the input
262
+ Parameters:
263
+ - alpha - trainable parameter that controls frequency
264
+ - beta - trainable parameter that controls magnitude
265
+ References:
266
+ - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
267
+ https://arxiv.org/abs/2006.08195
268
+ Examples:
269
+ >>> a1 = SnakeBeta(256, 256)
270
+ >>> x = torch.randn(256)
271
+ >>> x = a1(x)
272
+ """
273
+
274
+ def __init__(
275
+ self,
276
+ in_features,
277
+ out_features,
278
+ alpha=1.0,
279
+ alpha_trainable=True,
280
+ alpha_logscale=True,
281
+ ):
282
+ """
283
+ Initialization.
284
+ INPUT:
285
+ - in_features: shape of the input
286
+ - alpha - trainable parameter that controls frequency
287
+ - beta - trainable parameter that controls magnitude
288
+ alpha is initialized to 1 by default, higher values = higher-frequency.
289
+ beta is initialized to 1 by default, higher values = higher-magnitude.
290
+ alpha will be trained along with the rest of your model.
291
+ """
292
+ super().__init__()
293
+ self.in_features = (
294
+ out_features if isinstance(out_features, list) else [out_features]
295
+ )
296
+ self.proj = LoRACompatibleLinear(in_features, out_features)
297
+
298
+ # initialize alpha
299
+ self.alpha_logscale = alpha_logscale
300
+ if self.alpha_logscale: # log scale alphas initialized to zeros
301
+ self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha)
302
+ self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha)
303
+ else: # linear scale alphas initialized to ones
304
+ self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha)
305
+ self.beta = nn.Parameter(torch.ones(self.in_features) * alpha)
306
+
307
+ self.alpha.requires_grad = alpha_trainable
308
+ self.beta.requires_grad = alpha_trainable
309
+
310
+ self.no_div_by_zero = 0.000000001
311
+
312
+ def forward(self, x):
313
+ """
314
+ Forward pass of the function.
315
+ Applies the function to the input elementwise.
316
+ SnakeBeta := x + (1/b) * sin^2(a * x)
317
+ """
318
+ x = self.proj(x)
319
+ if self.alpha_logscale:
320
+ alpha = torch.exp(self.alpha)
321
+ beta = torch.exp(self.beta)
322
+ else:
323
+ alpha = self.alpha
324
+ beta = self.beta
325
+
326
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(
327
+ torch.sin(x * alpha), 2
328
+ )
329
+
330
+ return x
331
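+ # With alpha_logscale=True, alpha and beta are stored as logs and exponentiated in
+ # forward, so the activation is
+ #   y = proj(x) + (1 / exp(beta)) * sin^2(exp(alpha) * proj(x))
+ # which at initialisation (alpha = beta = 0) reduces to proj(x) + sin^2(proj(x)).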
+
332
+
333
+ class FeedForward(nn.Module):
334
+ r"""
335
+ A feed-forward layer.
336
+
337
+ Parameters:
338
+ dim (`int`): The number of channels in the input.
339
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
340
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
341
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
342
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
343
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
344
+ """
345
+
346
+ def __init__(
347
+ self,
348
+ dim: int,
349
+ dim_out: Optional[int] = None,
350
+ mult: int = 4,
351
+ dropout: float = 0.0,
352
+ activation_fn: str = "geglu",
353
+ final_dropout: bool = False,
354
+ ):
355
+ super().__init__()
356
+ inner_dim = int(dim * mult)
357
+ dim_out = dim_out if dim_out is not None else dim
358
+
359
+ if activation_fn == "gelu":
360
+ act_fn = GELU(dim, inner_dim)
361
+ if activation_fn == "gelu-approximate":
362
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
363
+ elif activation_fn == "geglu":
364
+ act_fn = GEGLU(dim, inner_dim)
365
+ elif activation_fn == "geglu-approximate":
366
+ act_fn = ApproximateGELU(dim, inner_dim)
367
+ elif activation_fn == "snakebeta":
368
+ act_fn = SnakeBeta(dim, inner_dim)
369
+
370
+ self.net = nn.ModuleList([])
371
+ # project in
372
+ self.net.append(act_fn)
373
+ # project dropout
374
+ self.net.append(nn.Dropout(dropout))
375
+ # project out
376
+ self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
377
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
378
+ if final_dropout:
379
+ self.net.append(nn.Dropout(dropout))
380
+
381
+ def forward(self, hidden_states):
382
+ for module in self.net:
383
+ hidden_states = module(hidden_states)
384
+ return hidden_states
385
+
386
+
387
+ @maybe_allow_in_graph
388
+ class BasicTransformerBlock(nn.Module):
389
+ r"""
390
+ A basic Transformer block.
391
+
392
+ Parameters:
393
+ dim (`int`): The number of channels in the input and output.
394
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
395
+ attention_head_dim (`int`): The number of channels in each head.
396
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
397
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
398
+ only_cross_attention (`bool`, *optional*):
399
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
400
+ double_self_attention (`bool`, *optional*):
401
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
402
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
403
+ num_embeds_ada_norm (:
404
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
405
+ attention_bias (:
406
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
407
+ """
408
+
409
+ def __init__(
410
+ self,
411
+ dim: int,
412
+ num_attention_heads: int,
413
+ attention_head_dim: int,
414
+ dropout=0.0,
415
+ cross_attention_dim: Optional[int] = None,
416
+ activation_fn: str = "geglu",
417
+ num_embeds_ada_norm: Optional[int] = None,
418
+ attention_bias: bool = False,
419
+ only_cross_attention: bool = False,
420
+ double_self_attention: bool = False,
421
+ upcast_attention: bool = False,
422
+ norm_elementwise_affine: bool = True,
423
+ norm_type: str = "layer_norm",
424
+ final_dropout: bool = False,
425
+ use_omni_attn: bool = False,
426
+ ):
427
+ super().__init__()
428
+
429
+ self.use_omni_attn = use_omni_attn
430
+ self.dim = dim
431
+
432
+ self.only_cross_attention = only_cross_attention
433
+
434
+ self.use_ada_layer_norm_zero = (
435
+ num_embeds_ada_norm is not None
436
+ ) and norm_type == "ada_norm_zero"
437
+ self.use_ada_layer_norm = (
438
+ num_embeds_ada_norm is not None
439
+ ) and norm_type == "ada_norm"
440
+
441
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
442
+ raise ValueError(
443
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
444
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
445
+ )
446
+
447
+ # Define 3 blocks. Each block has its own normalization layer.
448
+ # 1. Self-Attn
449
+ if self.use_ada_layer_norm:
450
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
451
+ elif self.use_ada_layer_norm_zero:
452
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
453
+ else:
454
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
455
+
456
+ if self.use_omni_attn:
457
+ if only_cross_attention:
458
+ raise NotImplementedError
459
+ print(
460
+ "Use OmniWhisperAttention with flash attention. Dropout is ignored."
461
+ )
462
+ self.attn1 = OmniWhisperAttention(
463
+ embed_dim=dim, num_heads=num_attention_heads, causal=False
464
+ )
465
+ else:
466
+ self.attn1 = Attention(
467
+ query_dim=dim,
468
+ heads=num_attention_heads,
469
+ dim_head=attention_head_dim,
470
+ dropout=dropout,
471
+ bias=attention_bias,
472
+ cross_attention_dim=(
473
+ cross_attention_dim if only_cross_attention else None
474
+ ),
475
+ upcast_attention=upcast_attention,
476
+ )
477
+
478
+ # 2. Cross-Attn
479
+ if cross_attention_dim is not None or double_self_attention:
480
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
481
+ # I.e. the number of returned modulation chunks from AdaLayerNormZero would not make sense if returned during
482
+ # the second cross attention block.
483
+ self.norm2 = (
484
+ AdaLayerNorm(dim, num_embeds_ada_norm)
485
+ if self.use_ada_layer_norm
486
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
487
+ )
488
+ self.attn2 = Attention(
489
+ query_dim=dim,
490
+ cross_attention_dim=(
491
+ cross_attention_dim if not double_self_attention else None
492
+ ),
493
+ heads=num_attention_heads,
494
+ dim_head=attention_head_dim,
495
+ dropout=dropout,
496
+ bias=attention_bias,
497
+ upcast_attention=upcast_attention,
498
+ # scale_qk=False, # uncomment this to not to use flash attention
499
+ ) # is self-attn if encoder_hidden_states is none
500
+ else:
501
+ self.norm2 = None
502
+ self.attn2 = None
503
+
504
+ # 3. Feed-forward
505
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
506
+ self.ff = FeedForward(
507
+ dim,
508
+ dropout=dropout,
509
+ activation_fn=activation_fn,
510
+ final_dropout=final_dropout,
511
+ )
512
+
513
+ # let chunk size default to None
514
+ self._chunk_size = None
515
+ self._chunk_dim = 0
516
+
517
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
518
+ # Sets chunk feed-forward
519
+ self._chunk_size = chunk_size
520
+ self._chunk_dim = dim
521
+
522
+ def forward(
523
+ self,
524
+ hidden_states: torch.FloatTensor,
525
+ attention_mask: Optional[torch.FloatTensor] = None,
526
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
527
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
528
+ timestep: Optional[torch.LongTensor] = None,
529
+ cross_attention_kwargs: Dict[str, Any] = None,
530
+ class_labels: Optional[torch.LongTensor] = None,
531
+ ):
532
+
533
+ bsz, tgt_len, d_model = hidden_states.shape
534
+
535
+ # Notice that normalization is always applied before the real computation in the following blocks.
536
+ # 1. Self-Attention
537
+ if self.use_ada_layer_norm:
538
+ norm_hidden_states = self.norm1(hidden_states, timestep)
539
+ elif self.use_ada_layer_norm_zero:
540
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
541
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
542
+ )
543
+ else:
544
+ norm_hidden_states = self.norm1(hidden_states)
545
+
546
+ cross_attention_kwargs = (
547
+ cross_attention_kwargs if cross_attention_kwargs is not None else {}
548
+ )
549
+
550
+ if self.use_omni_attn:
551
+ seq_len = attention_mask[:, 0, :].float().long().sum(dim=1)
552
+ var_len_attention_mask, unpacking_index = get_sequence_mask(
553
+ norm_hidden_states, seq_len
554
+ )
555
+ norm_hidden_states = torch.masked_select(
556
+ norm_hidden_states, var_len_attention_mask
557
+ )
558
+ norm_hidden_states = norm_hidden_states.view(torch.sum(seq_len), self.dim)
559
+ attn_output = self.attn1(norm_hidden_states, seq_len)
560
+ # unpacking
561
+ attn_output = torch.index_select(attn_output, 0, unpacking_index).view(
562
+ bsz, tgt_len, d_model
563
+ )
564
+ attn_output = torch.where(var_len_attention_mask, attn_output, 0)
565
+ else:
566
+ attn_output = self.attn1(
567
+ norm_hidden_states,
568
+ encoder_hidden_states=(
569
+ encoder_hidden_states if self.only_cross_attention else None
570
+ ),
571
+ attention_mask=(
572
+ encoder_attention_mask
573
+ if self.only_cross_attention
574
+ else attention_mask
575
+ ),
576
+ **cross_attention_kwargs,
577
+ )
578
+
579
+ if self.use_ada_layer_norm_zero:
580
+ attn_output = gate_msa.unsqueeze(1) * attn_output
581
+ hidden_states = attn_output + hidden_states
582
+
583
+ # 2. Cross-Attention
584
+ if self.attn2 is not None:
585
+ norm_hidden_states = (
586
+ self.norm2(hidden_states, timestep)
587
+ if self.use_ada_layer_norm
588
+ else self.norm2(hidden_states)
589
+ )
590
+
591
+ attn_output = self.attn2(
592
+ norm_hidden_states,
593
+ encoder_hidden_states=encoder_hidden_states,
594
+ attention_mask=encoder_attention_mask,
595
+ **cross_attention_kwargs,
596
+ )
597
+ hidden_states = attn_output + hidden_states
598
+
599
+ # 3. Feed-forward
600
+ norm_hidden_states = self.norm3(hidden_states)
601
+
602
+ if self.use_ada_layer_norm_zero:
603
+ norm_hidden_states = (
604
+ norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
605
+ )
606
+
607
+ if self._chunk_size is not None:
608
+ # "feed_forward_chunk_size" can be used to save memory
609
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
610
+ raise ValueError(
611
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
612
+ )
613
+
614
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
615
+ ff_output = torch.cat(
616
+ [
617
+ self.ff(hid_slice)
618
+ for hid_slice in norm_hidden_states.chunk(
619
+ num_chunks, dim=self._chunk_dim
620
+ )
621
+ ],
622
+ dim=self._chunk_dim,
623
+ )
624
+ else:
625
+ ff_output = self.ff(norm_hidden_states)
626
+
627
+ if self.use_ada_layer_norm_zero:
628
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
629
+
630
+ hidden_states = ff_output + hidden_states
631
+
632
+ return hidden_states
633
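+ # When use_omni_attn is set, the self-attention step packs the valid frames (dropping
+ # padding), runs the variable-length flash-attention path in OmniWhisperAttention and
+ # scatters the outputs back to the padded layout; the cross-attention and
+ # feed-forward parts are unchanged.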
+
634
+
635
+ class Transpose(torch.nn.Module):
636
+ def __init__(self, dim0: int, dim1: int):
637
+ super().__init__()
638
+ self.dim0 = dim0
639
+ self.dim1 = dim1
640
+
641
+ def forward(self, x: torch.Tensor):
642
+ x = torch.transpose(x, self.dim0, self.dim1)
643
+ return x
644
+
645
+
646
+ class Block1D(torch.nn.Module):
647
+ def __init__(self, dim, dim_out, groups=8):
648
+ super().__init__()
649
+ self.block = torch.nn.Sequential(
650
+ torch.nn.Conv1d(dim, dim_out, 3, padding=1),
651
+ torch.nn.GroupNorm(groups, dim_out),
652
+ nn.Mish(),
653
+ )
654
+
655
+ def forward(self, x, mask):
656
+ output = self.block(x * mask)
657
+ return output * mask
658
+
659
+
660
+ class ResnetBlock1D(torch.nn.Module):
661
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
662
+ super().__init__()
663
+ self.mlp = torch.nn.Sequential(
664
+ nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out)
665
+ )
666
+
667
+ self.block1 = Block1D(dim, dim_out, groups=groups)
668
+ self.block2 = Block1D(dim_out, dim_out, groups=groups)
669
+
670
+ self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)
671
+
672
+ def forward(self, x, mask, time_emb):
673
+ h = self.block1(x, mask)
674
+ h += self.mlp(time_emb).unsqueeze(-1)
675
+ h = self.block2(h, mask)
676
+ output = h + self.res_conv(x * mask)
677
+ return output
678
+
679
+
680
+ class CausalBlock1D(Block1D):
681
+ def __init__(self, dim: int, dim_out: int):
682
+ super(CausalBlock1D, self).__init__(dim, dim_out)
683
+ self.block = torch.nn.Sequential(
684
+ CausalConv1d(dim, dim_out, 3),
685
+ Transpose(1, 2),
686
+ nn.LayerNorm(dim_out),
687
+ Transpose(1, 2),
688
+ nn.Mish(),
689
+ )
690
+
691
+ def forward(self, x: torch.Tensor, mask: torch.Tensor):
692
+ output = self.block(x * mask)
693
+ return output * mask
694
+
695
+
696
+ class CausalResnetBlock1D(ResnetBlock1D):
697
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
698
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
699
+ self.block1 = CausalBlock1D(dim, dim_out)
700
+ self.block2 = CausalBlock1D(dim_out, dim_out)
701
+
702
+
703
+ class CausalConv1d(torch.nn.Conv1d):
704
+ def __init__(
705
+ self,
706
+ in_channels: int,
707
+ out_channels: int,
708
+ kernel_size: int,
709
+ stride: int = 1,
710
+ dilation: int = 1,
711
+ groups: int = 1,
712
+ bias: bool = True,
713
+ padding_mode: str = 'zeros',
714
+ device=None,
715
+ dtype=None
716
+ ) -> None:
717
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
718
+ kernel_size, stride,
719
+ padding=0, dilation=dilation,
720
+ groups=groups, bias=bias,
721
+ padding_mode=padding_mode,
722
+ device=device, dtype=dtype)
723
+ assert stride == 1
724
+ self.causal_padding = (kernel_size - 1, 0)
725
+
726
+ def forward(self, x: torch.Tensor):
727
+ x = F.pad(x, self.causal_padding)
728
+ x = super(CausalConv1d, self).forward(x)
729
+ return x
730
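+ # CausalConv1d left-pads by (kernel_size - 1) frames and uses no right padding, so
+ # the output at frame t only depends on inputs up to t. Illustrative check
+ # (hypothetical sizes):
+ #   conv = CausalConv1d(80, 80, kernel_size=3)
+ #   y = conv(torch.randn(1, 80, 100))   # y.shape == (1, 80, 100)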
+
731
+
732
+ class BASECFM(torch.nn.Module, ABC):
733
+ def __init__(
734
+ self,
735
+ n_feats,
736
+ cfm_params,
737
+ n_spks=1,
738
+ spk_emb_dim=128,
739
+ ):
740
+ super().__init__()
741
+ self.n_feats = n_feats
742
+ self.n_spks = n_spks
743
+ self.spk_emb_dim = spk_emb_dim
744
+ self.solver = cfm_params.solver
745
+ if hasattr(cfm_params, "sigma_min"):
746
+ self.sigma_min = cfm_params.sigma_min
747
+ else:
748
+ self.sigma_min = 1e-4
749
+
750
+ self.estimator = None
751
+
752
+ @torch.inference_mode()
753
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
754
+ """Forward diffusion
755
+
756
+ Args:
757
+ mu (torch.Tensor): output of encoder
758
+ shape: (batch_size, n_feats, mel_timesteps)
759
+ mask (torch.Tensor): output_mask
760
+ shape: (batch_size, 1, mel_timesteps)
761
+ n_timesteps (int): number of diffusion steps
762
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
763
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
764
+ shape: (batch_size, spk_emb_dim)
765
+ cond: Not used but kept for future purposes
766
+
767
+ Returns:
768
+ sample: generated mel-spectrogram
769
+ shape: (batch_size, n_feats, mel_timesteps)
770
+ """
771
+ z = torch.randn_like(mu) * temperature
772
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
773
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
774
+
775
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
776
+ """
777
+ Fixed euler solver for ODEs.
778
+ Args:
779
+ x (torch.Tensor): random noise
780
+ t_span (torch.Tensor): n_timesteps interpolated
781
+ shape: (n_timesteps + 1,)
782
+ mu (torch.Tensor): output of encoder
783
+ shape: (batch_size, n_feats, mel_timesteps)
784
+ mask (torch.Tensor): output_mask
785
+ shape: (batch_size, 1, mel_timesteps)
786
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
787
+ shape: (batch_size, spk_emb_dim)
788
+ cond: Not used but kept for future purposes
789
+ """
790
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
791
+
792
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
793
+ # Or in future might add like a return_all_steps flag
794
+ sol = []
795
+
796
+ for step in range(1, len(t_span)):
797
+ dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
798
+
799
+ x = x + dt * dphi_dt
800
+ t = t + dt
801
+ sol.append(x)
802
+ if step < len(t_span) - 1:
803
+ dt = t_span[step + 1] - t
804
+
805
+ return sol[-1]
806
+
807
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
808
+ """Computes diffusion loss
809
+
810
+ Args:
811
+ x1 (torch.Tensor): Target
812
+ shape: (batch_size, n_feats, mel_timesteps)
813
+ mask (torch.Tensor): target mask
814
+ shape: (batch_size, 1, mel_timesteps)
815
+ mu (torch.Tensor): output of encoder
816
+ shape: (batch_size, n_feats, mel_timesteps)
817
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
818
+ shape: (batch_size, spk_emb_dim)
819
+
820
+ Returns:
821
+ loss: conditional flow matching loss
822
+ y: conditional flow
823
+ shape: (batch_size, n_feats, mel_timesteps)
824
+ """
825
+ b, _, t = mu.shape
826
+
827
+ # random timestep
828
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
829
+ # sample noise p(x_0)
830
+ z = torch.randn_like(x1)
831
+
832
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
833
+ u = x1 - (1 - self.sigma_min) * z
834
+
835
+ loss = F.mse_loss(self.estimator(y, mask, mu, t.squeeze(), spks), u, reduction="sum") / (
836
+ torch.sum(mask) * u.shape[1]
837
+ )
838
+ return loss, y
839
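+ # Conditional flow matching in brief: a point on the probability path is
+ #   y_t = (1 - (1 - sigma_min) * t) * z + t * x1,   z ~ N(0, I),
+ # and the regression target for the estimator is the constant velocity
+ #   u = x1 - (1 - sigma_min) * z,
+ # i.e. the time derivative of y_t. ConditionalCFM.compute_loss below reuses the same
+ # construction with classifier-free-guidance dropout on the conditioning.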
+
840
+
841
+ class ConditionalDecoder(nn.Module):
842
+ def __init__(
843
+ self,
844
+ in_channels,
845
+ out_channels,
846
+ causal=False,
847
+ channels=(256, 256),
848
+ dropout=0.05,
849
+ attention_head_dim=64,
850
+ n_blocks=1,
851
+ num_mid_blocks=2,
852
+ num_heads=4,
853
+ act_fn="snake",
854
+ gradient_checkpointing=False,
855
+ ):
856
+ """
857
+ This decoder requires an input with the same shape as the target. So, if your text content
858
+ is shorter or longer than the outputs, please resample it before feeding it to the decoder.
859
+ """
860
+ super().__init__()
861
+ channels = tuple(channels)
862
+ self.in_channels = in_channels
863
+ self.out_channels = out_channels
864
+ self.causal = causal
865
+ self.static_chunk_size = 2 * 25 * 2 # 2*input_frame_rate*token_mel_ratio
866
+ self.gradient_checkpointing = gradient_checkpointing
867
+
868
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
869
+ time_embed_dim = channels[0] * 4
870
+ self.time_mlp = TimestepEmbedding(
871
+ in_channels=in_channels,
872
+ time_embed_dim=time_embed_dim,
873
+ act_fn="silu",
874
+ )
875
+ self.down_blocks = nn.ModuleList([])
876
+ self.mid_blocks = nn.ModuleList([])
877
+ self.up_blocks = nn.ModuleList([])
878
+
879
+ output_channel = in_channels
880
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
881
+ input_channel = output_channel
882
+ output_channel = channels[i]
883
+ is_last = i == len(channels) - 1
884
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
885
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
886
+ transformer_blocks = nn.ModuleList(
887
+ [
888
+ BasicTransformerBlock(
889
+ dim=output_channel,
890
+ num_attention_heads=num_heads,
891
+ attention_head_dim=attention_head_dim,
892
+ dropout=dropout,
893
+ activation_fn=act_fn,
894
+ )
895
+ for _ in range(n_blocks)
896
+ ]
897
+ )
898
+ downsample = (
899
+ Downsample1D(output_channel) if not is_last else
900
+ CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
901
+ )
902
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
903
+
904
+ for _ in range(num_mid_blocks):
905
+ input_channel = channels[-1]
906
+ out_channels = channels[-1]
907
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
908
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
909
+
910
+ transformer_blocks = nn.ModuleList(
911
+ [
912
+ BasicTransformerBlock(
913
+ dim=output_channel,
914
+ num_attention_heads=num_heads,
915
+ attention_head_dim=attention_head_dim,
916
+ dropout=dropout,
917
+ activation_fn=act_fn,
918
+ )
919
+ for _ in range(n_blocks)
920
+ ]
921
+ )
922
+
923
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
924
+
925
+ channels = channels[::-1] + (channels[0],)
926
+ for i in range(len(channels) - 1):
927
+ input_channel = channels[i] * 2
928
+ output_channel = channels[i + 1]
929
+ is_last = i == len(channels) - 2
930
+ resnet = CausalResnetBlock1D(
931
+ dim=input_channel,
932
+ dim_out=output_channel,
933
+ time_emb_dim=time_embed_dim,
934
+ ) if self.causal else ResnetBlock1D(
935
+ dim=input_channel,
936
+ dim_out=output_channel,
937
+ time_emb_dim=time_embed_dim,
938
+ )
939
+ transformer_blocks = nn.ModuleList(
940
+ [
941
+ BasicTransformerBlock(
942
+ dim=output_channel,
943
+ num_attention_heads=num_heads,
944
+ attention_head_dim=attention_head_dim,
945
+ dropout=dropout,
946
+ activation_fn=act_fn,
947
+ )
948
+ for _ in range(n_blocks)
949
+ ]
950
+ )
951
+ upsample = (
952
+ Upsample1D(output_channel, use_conv_transpose=True)
953
+ if not is_last
954
+ else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
955
+ )
956
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
957
+ self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
958
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
959
+ self.initialize_weights()
960
+
961
+ def initialize_weights(self):
962
+ for m in self.modules():
963
+ if isinstance(m, nn.Conv1d):
964
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
965
+ if m.bias is not None:
966
+ nn.init.constant_(m.bias, 0)
967
+ elif isinstance(m, nn.GroupNorm):
968
+ nn.init.constant_(m.weight, 1)
969
+ nn.init.constant_(m.bias, 0)
970
+ elif isinstance(m, nn.Linear):
971
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
972
+ if m.bias is not None:
973
+ nn.init.constant_(m.bias, 0)
974
+
975
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
976
+ """Forward pass of the UNet1DConditional model.
977
+
978
+ Args:
979
+ x (torch.Tensor): shape (batch_size, in_channels, time)
980
+ mask (_type_): shape (batch_size, 1, time)
981
+ t (_type_): shape (batch_size)
982
+ spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
983
+ cond (_type_, optional): placeholder for future use. Defaults to None.
984
+
985
+ Raises:
986
+ ValueError: _description_
987
+ ValueError: _description_
988
+
989
+ Returns:
990
+ _type_: _description_
991
+ """
992
+ t = self.time_embeddings(t)
993
+ t = t.to(x.dtype)
994
+ t = self.time_mlp(t)
995
+ x = pack([x, mu], "b * t")[0]
996
+ mask = mask.to(x.dtype)
997
+ if spks is not None:
998
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
999
+ x = pack([x, spks], "b * t")[0]
1000
+ if cond is not None:
1001
+ x = pack([x, cond], "b * t")[0]
1002
+
1003
+ hiddens = []
1004
+ masks = [mask]
1005
+ for resnet, transformer_blocks, downsample in self.down_blocks:
1006
+ mask_down = masks[-1]
1007
+ x = resnet(x, mask_down, t)
1008
+ x = rearrange(x, "b c t -> b t c").contiguous()
1009
+ # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
1010
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
1011
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
1012
+ for transformer_block in transformer_blocks:
1013
+ if self.gradient_checkpointing and self.training:
1014
+ def create_custom_forward(module):
1015
+ def custom_forward(*inputs):
1016
+ return module(*inputs)
1017
+ return custom_forward
1018
+ x = torch.utils.checkpoint.checkpoint(
1019
+ create_custom_forward(transformer_block),
1020
+ x,
1021
+ attn_mask,
1022
+ t,
1023
+ )
1024
+ else:
1025
+ x = transformer_block(
1026
+ hidden_states=x,
1027
+ attention_mask=attn_mask,
1028
+ timestep=t,
1029
+ )
1030
+ x = rearrange(x, "b t c -> b c t").contiguous()
1031
+ hiddens.append(x) # Save hidden states for skip connections
1032
+ x = downsample(x * mask_down)
1033
+ masks.append(mask_down[:, :, ::2])
1034
+ masks = masks[:-1]
1035
+ mask_mid = masks[-1]
1036
+
1037
+ for resnet, transformer_blocks in self.mid_blocks:
1038
+ x = resnet(x, mask_mid, t)
1039
+ x = rearrange(x, "b c t -> b t c").contiguous()
1040
+ # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
1041
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
1042
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
1043
+ for transformer_block in transformer_blocks:
1044
+ if self.gradient_checkpointing and self.training:
1045
+ def create_custom_forward(module):
1046
+ def custom_forward(*inputs):
1047
+ return module(*inputs)
1048
+ return custom_forward
1049
+ x = torch.utils.checkpoint.checkpoint(
1050
+ create_custom_forward(transformer_block),
1051
+ x,
1052
+ attn_mask,
1053
+ t,
1054
+ )
1055
+ else:
1056
+ x = transformer_block(
1057
+ hidden_states=x,
1058
+ attention_mask=attn_mask,
1059
+ timestep=t,
1060
+ )
1061
+ x = rearrange(x, "b t c -> b c t").contiguous()
1062
+
1063
+ for resnet, transformer_blocks, upsample in self.up_blocks:
1064
+ mask_up = masks.pop()
1065
+ skip = hiddens.pop()
1066
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
1067
+ x = resnet(x, mask_up, t)
1068
+ x = rearrange(x, "b c t -> b t c").contiguous()
1069
+ # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
1070
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
1071
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
1072
+ for transformer_block in transformer_blocks:
1073
+ if self.gradient_checkpointing and self.training:
1074
+ def create_custom_forward(module):
1075
+ def custom_forward(*inputs):
1076
+ return module(*inputs)
1077
+ return custom_forward
1078
+ x = torch.utils.checkpoint.checkpoint(
1079
+ create_custom_forward(transformer_block),
1080
+ x,
1081
+ attn_mask,
1082
+ t,
1083
+ )
1084
+ else:
1085
+ x = transformer_block(
1086
+ hidden_states=x,
1087
+ attention_mask=attn_mask,
1088
+ timestep=t,
1089
+ )
1090
+ x = rearrange(x, "b t c -> b c t").contiguous()
1091
+ x = upsample(x * mask_up)
1092
+ x = self.final_block(x, mask_up)
1093
+ output = self.final_proj(x * mask_up)
1094
+ return output * mask
1095
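+ # ConditionalDecoder is a 1D U-Net over mel frames: resnet + transformer blocks on
+ # the way down (with the mask downsampled alongside the features), a stack of mid
+ # blocks at the lowest resolution, and an up path that concatenates the saved skip
+ # connections before each upsample; the chunked attention masks prevent attending
+ # past the end of the current chunk.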
+
1096
+
1097
+ class ConditionalCFM(BASECFM):
1098
+ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64):
1099
+ super().__init__(
1100
+ n_feats=in_channels,
1101
+ cfm_params=cfm_params,
1102
+ n_spks=n_spks,
1103
+ spk_emb_dim=spk_emb_dim,
1104
+ )
1105
+ self.t_scheduler = cfm_params.t_scheduler
1106
+ self.training_cfg_rate = cfm_params.training_cfg_rate
1107
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
1108
+
1109
+ @torch.inference_mode()
1110
+ def forward(self, estimator, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
1111
+ """Forward diffusion
1112
+
1113
+ Args:
1114
+ mu (torch.Tensor): output of encoder
1115
+ shape: (batch_size, n_feats, mel_timesteps)
1116
+ mask (torch.Tensor): output_mask
1117
+ shape: (batch_size, 1, mel_timesteps)
1118
+ n_timesteps (int): number of diffusion steps
1119
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
1120
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1121
+ shape: (batch_size, spk_emb_dim)
1122
+ cond: Not used but kept for future purposes
1123
+
1124
+ Returns:
1125
+ sample: generated mel-spectrogram
1126
+ shape: (batch_size, n_feats, mel_timesteps)
1127
+ """
1128
+ z = torch.randn_like(mu) * temperature
1129
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
1130
+ if self.t_scheduler == 'cosine':
1131
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
1132
+ return self.solve_euler(estimator, z, t_span=t_span.to(mu.dtype), mu=mu, mask=mask, spks=spks, cond=cond)
1133
+
1134
+ def solve_euler(self, estimator, x, t_span, mu, mask, spks, cond):
1135
+ """
1136
+ Fixed euler solver for ODEs.
1137
+ Args:
1138
+ x (torch.Tensor): random noise
1139
+ t_span (torch.Tensor): n_timesteps interpolated
1140
+ shape: (n_timesteps + 1,)
1141
+ mu (torch.Tensor): output of encoder
1142
+ shape: (batch_size, n_feats, mel_timesteps)
1143
+ mask (torch.Tensor): output_mask
1144
+ shape: (batch_size, 1, mel_timesteps)
1145
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
1146
+ shape: (batch_size, spk_emb_dim)
1147
+ cond: Not used but kept for future purposes
1148
+ """
1149
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
1150
+
1151
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
1152
+ # Or in future might add like a return_all_steps flag
1153
+ sol = []
1154
+
1155
+ for step in range(1, len(t_span)):
1156
+ dphi_dt = estimator(x, mask, mu, t, spks, cond)
1157
+ # Classifier-Free Guidance inference introduced in VoiceBox
1158
+ if self.inference_cfg_rate > 0:
1159
+ cfg_dphi_dt = estimator(
1160
+ x, mask,
1161
+ torch.zeros_like(mu), t,
1162
+ torch.zeros_like(spks) if spks is not None else None,
1163
+ cond=cond
1164
+ )
1165
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
1166
+ self.inference_cfg_rate * cfg_dphi_dt)
1167
+ x = x + dt * dphi_dt
1168
+ t = t + dt
1169
+ sol.append(x)
1170
+ if step < len(t_span) - 1:
1171
+ dt = t_span[step + 1] - t
1172
+
1173
+ return sol[-1]
1174
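+ # Classifier-free guidance sketch: when inference_cfg_rate > 0, each Euler step also
+ # runs the estimator with the conditioning (mu, spks) zeroed out and extrapolates
+ #   v = (1 + w) * v_cond - w * v_uncond,   w = inference_cfg_rate,
+ # before taking the step x <- x + dt * v.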
+
1175
+ def compute_loss(self, estimator, x1, mask, mu, spks=None, cond=None):
1176
+ """Computes diffusion loss
1177
+
1178
+ Args:
1179
+ x1 (torch.Tensor): Target
1180
+ shape: (batch_size, n_feats, mel_timesteps)
1181
+ mask (torch.Tensor): target mask
1182
+ shape: (batch_size, 1, mel_timesteps)
1183
+ mu (torch.Tensor): output of encoder
1184
+ shape: (batch_size, n_feats, mel_timesteps)
1185
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
1186
+ shape: (batch_size, spk_emb_dim)
1187
+
1188
+ Returns:
1189
+ loss: conditional flow matching loss
1190
+ y: conditional flow
1191
+ shape: (batch_size, n_feats, mel_timesteps)
1192
+ """
1193
+ org_dtype = x1.dtype
1194
+
1195
+ b, _, t = mu.shape
1196
+ # random timestep
1197
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
1198
+ if self.t_scheduler == 'cosine':
1199
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
1200
+ # sample noise p(x_0)
1201
+ z = torch.randn_like(x1)
1202
+
1203
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
1204
+ u = x1 - (1 - self.sigma_min) * z
1205
+
1206
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
1207
+ if self.training_cfg_rate > 0:
1208
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
1209
+ mu = mu * cfg_mask.view(-1, 1, 1)
1210
+ if spks is not None:
1211
+ spks = spks * cfg_mask.view(-1, 1)
1212
+ if cond is not None:
1213
+ cond = cond * cfg_mask.view(-1, 1, 1)
1214
+
1215
+ pred = estimator(y, mask, mu, t.squeeze(), spks, cond)
1216
+ pred = pred.float()
1217
+ u = u.float()
1218
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
1219
+ loss = loss.to(org_dtype)
1220
+ return loss, y
1221
+
1222
+
1223
+ class SinusoidalPosEmb(torch.nn.Module):
1224
+ def __init__(self, dim):
1225
+ super().__init__()
1226
+ self.dim = dim
1227
+ assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"
1228
+
1229
+ def forward(self, x, scale=1000):
1230
+ if x.ndim < 1:
1231
+ x = x.unsqueeze(0)
1232
+ device = x.device
1233
+ half_dim = self.dim // 2
1234
+ emb = math.log(10000) / (half_dim - 1)
1235
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
1236
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
1237
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
1238
+ return emb
1239
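+ # SinusoidalPosEmb maps a scalar timestep to a dim-sized vector of sin/cos features
+ # at geometrically spaced frequencies (the standard Transformer encoding, with the
+ # input scaled by 1000). Illustrative: SinusoidalPosEmb(256)(torch.tensor([0.5]))
+ # returns a (1, 256) embedding.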
+
1240
+
1241
+ class Downsample1D(nn.Module):
1242
+ def __init__(self, dim):
1243
+ super().__init__()
1244
+ self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)
1245
+
1246
+ def forward(self, x):
1247
+ return self.conv(x)
1248
+
1249
+
1250
+ class TimestepEmbedding(nn.Module):
1251
+ def __init__(
1252
+ self,
1253
+ in_channels: int,
1254
+ time_embed_dim: int,
1255
+ act_fn: str = "silu",
1256
+ out_dim: int = None,
1257
+ post_act_fn: Optional[str] = None,
1258
+ cond_proj_dim=None,
1259
+ ):
1260
+ super().__init__()
1261
+
1262
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim)
1263
+
1264
+ if cond_proj_dim is not None:
1265
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
1266
+ else:
1267
+ self.cond_proj = None
1268
+
1269
+ self.act = get_activation(act_fn)
1270
+
1271
+ if out_dim is not None:
1272
+ time_embed_dim_out = out_dim
1273
+ else:
1274
+ time_embed_dim_out = time_embed_dim
1275
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
1276
+
1277
+ if post_act_fn is None:
1278
+ self.post_act = None
1279
+ else:
1280
+ self.post_act = get_activation(post_act_fn)
1281
+
1282
+ def forward(self, sample, condition=None):
1283
+ if condition is not None:
1284
+ sample = sample + self.cond_proj(condition)
1285
+ sample = self.linear_1(sample)
1286
+
1287
+ if self.act is not None:
1288
+ sample = self.act(sample)
1289
+
1290
+ sample = self.linear_2(sample)
1291
+
1292
+ if self.post_act is not None:
1293
+ sample = self.post_act(sample)
1294
+ return sample
1295
+
1296
+
1297
+ class Upsample1D(nn.Module):
1298
+ """A 1D upsampling layer with an optional convolution.
1299
+
1300
+ Parameters:
1301
+ channels (`int`):
1302
+ number of channels in the inputs and outputs.
1303
+ use_conv (`bool`, default `False`):
1304
+ option to use a convolution.
1305
+ use_conv_transpose (`bool`, default `False`):
1306
+ option to use a convolution transpose.
1307
+ out_channels (`int`, optional):
1308
+ number of output channels. Defaults to `channels`.
1309
+ """
1310
+
1311
+ def __init__(
1312
+ self,
1313
+ channels,
1314
+ use_conv=False,
1315
+ use_conv_transpose=True,
1316
+ out_channels=None,
1317
+ name="conv",
1318
+ ):
1319
+ super().__init__()
1320
+ self.channels = channels
1321
+ self.out_channels = out_channels or channels
1322
+ self.use_conv = use_conv
1323
+ self.use_conv_transpose = use_conv_transpose
1324
+ self.name = name
1325
+
1326
+ self.conv = None
1327
+ if use_conv_transpose:
1328
+ self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
1329
+ elif use_conv:
1330
+ self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
1331
+
1332
+ def forward(self, inputs):
1333
+ assert inputs.shape[1] == self.channels
1334
+ if self.use_conv_transpose:
1335
+ return self.conv(inputs)
1336
+
1337
+ outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")
1338
+
1339
+ if self.use_conv:
1340
+ outputs = self.conv(outputs)
1341
+
1342
+ return outputs
1343
+
1344
+
1345
+ class RMSNorm(nn.Module):
1346
+ def __init__(self, hidden_size, eps=1e-6):
1347
+ """
1348
+ RMSNorm is equivalent to T5LayerNorm
1349
+ """
1350
+ super().__init__()
1351
+ self.weight = nn.Parameter(torch.ones(hidden_size))
1352
+ self.variance_epsilon = eps
1353
+
1354
+ def forward(self, hidden_states):
1355
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
1356
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
1357
+
1358
+ # convert into half-precision if necessary
1359
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
1360
+ hidden_states = hidden_states.to(self.weight.dtype)
1361
+
1362
+ return self.weight * hidden_states
1363
+
1364
+
1365
+ class OmniWhisperAttention(nn.Module):
1366
+ def __init__(self, embed_dim, num_heads, causal=False):
1367
+ super().__init__()
1368
+ self.embed_dim = embed_dim
1369
+ self.num_heads = num_heads
1370
+ self.head_dim = embed_dim // num_heads
1371
+
1372
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
1373
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
1374
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
1375
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
1376
+
1377
+ self.causal = causal
1378
+
1379
+ def forward(self, hidden_states: torch.Tensor, seq_len: torch.Tensor):
1380
+ bsz, _ = hidden_states.size()
1381
+
1382
+ query_states = self.q_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
1383
+ key_states = self.k_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
1384
+ value_states = self.v_proj(hidden_states).view(bsz, self.num_heads, self.head_dim)
1385
+
1386
+ cu_len = F.pad(torch.cumsum(seq_len, dim=0), (1, 0), "constant", 0).to(torch.int32)
1387
+ max_seqlen = torch.max(seq_len).to(torch.int32).detach()
1388
+ attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_len, cu_len, max_seqlen, max_seqlen, causal=self.causal) # (bsz * qlen, nheads, headdim)
1389
+ attn_output = attn_output.reshape(bsz, self.embed_dim)
1390
+ attn_output = self.out_proj(attn_output)
1391
+ return attn_output
1392
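+ # OmniWhisperAttention works on packed sequences: hidden_states is the concatenation
+ # of all valid frames, shape (sum(seq_len), embed_dim), and cu_len holds the
+ # cumulative sequence boundaries that flash_attn_varlen_func expects, so padding is
+ # never attended to.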
+
1393
+
1394
+ class OmniWhisperTransformerLayer(nn.Module):
1395
+ def __init__(
1396
+ self,
1397
+ act,
1398
+ d_model,
1399
+ encoder_attention_heads,
1400
+ encoder_ffn_dim,
1401
+ causal,
1402
+ ln_type="LayerNorm",
1403
+ ):
1404
+ super().__init__()
1405
+ self.embed_dim = d_model
1406
+ self.self_attn = OmniWhisperAttention(
1407
+ self.embed_dim, encoder_attention_heads, causal
1408
+ )
1409
+
1410
+ if ln_type == "LayerNorm":
1411
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
1412
+ elif ln_type == "RMSNorm":
1413
+ self.self_attn_layer_norm = RMSNorm(self.embed_dim)
1414
+ else:
1415
+ raise ValueError(f"Unknown ln_type: {ln_type}")
1416
+
1417
+ self.activation_fn = act
1418
+ self.fc1 = nn.Linear(self.embed_dim, encoder_ffn_dim)
1419
+ self.fc2 = nn.Linear(encoder_ffn_dim, self.embed_dim)
1420
+
1421
+ if ln_type == "LayerNorm":
1422
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
1423
+ elif ln_type == "RMSNorm":
1424
+ self.final_layer_norm = RMSNorm(self.embed_dim)
1425
+ else:
1426
+ raise ValueError(f"Unknown ln_type: {ln_type}")
1427
+
1428
+ def forward(
1429
+ self, hidden_states: torch.Tensor, seq_len: torch.Tensor
1430
+ ) -> torch.Tensor:
1431
+ residual = hidden_states
1432
+ hidden_states = self.self_attn_layer_norm(hidden_states)
1433
+ hidden_states = self.self_attn(hidden_states, seq_len)
1434
+ hidden_states = residual + hidden_states
1435
+ residual = hidden_states
1436
+ hidden_states = self.final_layer_norm(hidden_states)
1437
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
1438
+ hidden_states = self.fc2(hidden_states)
1439
+ hidden_states = residual + hidden_states
1440
+
1441
+ if (
1442
+ hidden_states.dtype == torch.float16
1443
+ or hidden_states.dtype == torch.bfloat16
1444
+ ) and (torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()):
1445
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1446
+ hidden_states = torch.clamp(
1447
+ hidden_states, min=-clamp_value, max=clamp_value
1448
+ )
1449
+ return hidden_states
1450
+
1451
+
1452
+
1453
+ class LongcatNextAudioEncoder(nn.Module):
1454
+ def __init__(self, config):
1455
+ super().__init__()
1456
+ self.config = config
1457
+ self.max_source_positions = (config.max_audio_seconds * config.sampling_rate // config.hop_length) // config.stride_size
1458
+ self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
1459
+
1460
+ self.conv1 = nn.Conv1d(config.num_mel_bins, config.d_model, kernel_size=config.kernel_size, padding=1)
1461
+ self.conv2 = nn.Conv1d(config.d_model, config.d_model, kernel_size=config.kernel_size,
1462
+ stride=config.stride_size, padding=1)
1463
+ self.register_buffer("positional_embedding", sinusoids(self.max_source_positions, config.d_model)) # 1500 * d
1464
+
1465
+ self.layers = nn.ModuleList([OmniWhisperTransformerLayer(
1466
+ ACT2FN[config.activation_function],
1467
+ config.d_model,
1468
+ config.encoder_attention_heads,
1469
+ config.encoder_ffn_dim,
1470
+ False) for _ in range(config.encoder_layers)])
1471
+ self.layer_norm = nn.LayerNorm(config.d_model)
1472
+
1473
+ def forward(
1474
+ self,
1475
+ input_features,
1476
+ output_length,
1477
+ ):
1478
+ input_features = input_features.to(self.conv1.weight.dtype)
1479
+ inputs_embeds = nn.functional.gelu(self.conv1(input_features)) # (bs, channels, frames)
1480
+ inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) # (bs, channels, frames // 2)
1481
+ inputs_embeds = inputs_embeds.permute(0, 2, 1) # (bs, frames, channels)
1482
+ bsz, tgt_len, _ = inputs_embeds.size()
1483
+ if tgt_len < self.positional_embedding.shape[0]:
1484
+ current_positional_embedding = self.positional_embedding[:tgt_len]
1485
+ else:
1486
+ current_positional_embedding = self.positional_embedding
1487
+ hidden_states = (inputs_embeds.to(torch.float32) + current_positional_embedding).to(inputs_embeds.dtype)
1488
+
1489
+ # packing hidden states
1490
+ attention_mask, unpacking_index = get_sequence_mask(hidden_states, output_length)
1491
+ hidden_states = torch.masked_select(hidden_states, attention_mask).view(torch.sum(output_length),
1492
+ self.config.d_model)
1493
+
1494
+ for idx, encoder_layer in enumerate(self.layers):
1495
+ hidden_states = encoder_layer(hidden_states, output_length)
1496
+ hidden_states = self.layer_norm(hidden_states)
1497
+ # unpacking
1498
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(bsz, tgt_len, self.config.d_model)
1499
+ hidden_states = torch.where(attention_mask, hidden_states, 0)
1500
+ return hidden_states
1501
+
1502
+
1503
+ class CasualConvTranspose1d(nn.Module):
1504
+ def __init__(self, in_channels, out_channels, kernel_size, stride):
1505
+ super().__init__()
1506
+ self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
1507
+ self.norm = nn.GroupNorm(1, out_channels)
1508
+ self.in_channels = in_channels
1509
+ self.out_channels = out_channels
1510
+
1511
+ def forward(self, hidden_states, input_length, output_dim=None):
1512
+ kernel_size = self.conv.kernel_size[0]
1513
+ stride = self.conv.stride[0]
1514
+ bsz = input_length.shape[0]
1515
+
1516
+ if output_dim is None:
1517
+ output_dim = hidden_states.dim()
1518
+ if hidden_states.dim() <= 2: # unpack sequence to 3d
1519
+ sequence_mask, unpacking_index = get_sequence_mask(hidden_states, input_length)
1520
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(bsz, torch.max(input_length),
1521
+ self.in_channels)
1522
+ hidden_states = torch.where(sequence_mask, hidden_states, 0) # 3d (bsz, max_input_len, d)
1523
+
1524
+ hidden_states = hidden_states.transpose(2, 1) # (N, L, C) -> (N, C, L)
1525
+ hidden_states = self.conv(hidden_states)
1526
+ hidden_states = self.norm(hidden_states)
1527
+ hidden_states = hidden_states.transpose(2, 1) # (N, C, L) -> (N, L, C)
1528
+
1529
+ casual_padding_right = max(0, kernel_size - stride)
1530
+ hidden_states = hidden_states[:, :hidden_states.shape[1] - casual_padding_right,
1531
+ :]
1532
+ output_length = (input_length - 1) * stride + kernel_size - casual_padding_right
1533
+ sequence_mask, _ = get_sequence_mask(hidden_states, output_length)
1534
+ if output_dim <= 2:
1535
+ hidden_states = torch.masked_select(hidden_states, sequence_mask).view(-1, self.out_channels)
1536
+ else:
1537
+ hidden_states = torch.where(sequence_mask, hidden_states, 0)
1538
+ hidden_states = hidden_states[:, :torch.max(output_length), :]
1539
+ return hidden_states, output_length
1540
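+ # The transposed convolution above upsamples by `stride` and trims
+ # max(0, kernel_size - stride) frames on the right to stay causal; the returned
+ # lengths follow output_length = (input_length - 1) * stride + kernel_size - trim.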
+
1541
+
1542
+ class MelSpecRefineNet(nn.Module):
1543
+ """
1544
+ # post net, coarse to refined mel-spectrogram frames
1545
+ # ref1: Autoregressive Speech Synthesis without Vector Quantization
1546
+ # ref2: CosyVoice length_regulator.py
1547
+ # ref3: Neural Speech Synthesis with Transformer Network https://github.com/soobinseo/Transformer-TTS/blob/master/network.py
1548
+ """
1549
+
1550
+ def __init__(self, encoder_config, vocoder_config):
1551
+ super().__init__()
1552
+ self.encoder_config = encoder_config
1553
+ self.vocoder_config = vocoder_config
1554
+
1555
+ layers = nn.ModuleList([])
1556
+ in_channels = self.vocoder_config.num_mel_bins
1557
+ for i, out_channels in enumerate(self.vocoder_config.channels[:-1]):
1558
+ module = nn.Conv1d(in_channels, out_channels, 5, 1, 2) # cosyvoice kernel=3, stride=1, pad=1
1559
+ in_channels = out_channels
1560
+ norm = nn.GroupNorm(1, out_channels)
1561
+ act = nn.Mish()
1562
+ layers.extend([module, norm, act])
1563
+ layers.append(nn.Conv1d(in_channels, self.vocoder_config.num_mel_bins, 1, 1)) # projector
1564
+ self.layers = nn.Sequential(*layers)
1565
+
1566
+ def compute_output_length(self, input_length):
1567
+ output_length = input_length.to(
1568
+ torch.float32) * self.encoder_config.hop_length / self.encoder_config.sampling_rate
1569
+ output_length = output_length * self.vocoder_config.sampling_rate / self.vocoder_config.hop_length
1570
+ return output_length.to(torch.int64)
1571
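+ # compute_output_length converts encoder frame counts to vocoder mel-frame counts by
+ # going through seconds: frames * hop_length / sampling_rate at the encoder rate,
+ # then back to frames at the vocoder's sampling rate and hop length.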
+
1572
+ def forward(self, coarse_mel, input_length, output_length=None):
1573
+ bsz, _, d = coarse_mel.shape
1574
+ assert (d == self.vocoder_config.num_mel_bins)
1575
+ if output_length is None or not self.training:
1576
+ output_length = self.compute_output_length(input_length)
1577
+ coarse_mel, default_dtype = coarse_mel[:, :torch.max(input_length), :], coarse_mel.dtype
1578
+ coarse_mel = F.interpolate(coarse_mel.to(torch.float32).transpose(1, 2).contiguous(), size=output_length.max(),
1579
+ mode='nearest').to(default_dtype)
1580
+ refined_mel = self.layers(coarse_mel).transpose(1, 2).contiguous() # (bs, t, d)
1581
+ coarse_mel = coarse_mel.transpose(1, 2) # (bs, max(output_length), d)
1582
+ refined_mel += coarse_mel # residual connection
1583
+ sequence_mask, _ = get_sequence_mask(refined_mel, output_length)
1584
+ coarse_mel = torch.where(sequence_mask, coarse_mel, 0)
1585
+ refined_mel = torch.where(sequence_mask, refined_mel, 0)
1586
+ return refined_mel, coarse_mel, output_length
1587
+
1588
+
1589
+ @dataclass
1590
+ class OmniAudioDecoderOutput(ModelOutput):
1591
+ refined_mel: Optional[torch.FloatTensor] = None
1592
+ coarse_mel: Optional[torch.FloatTensor] = None
1593
+ mel_length: Optional[torch.Tensor] = None
1594
+ hidden_states_before_dconv2: Optional[torch.FloatTensor] = None
1595
+ output_length_before_dconv2: Optional[torch.Tensor] = None
1596
+
1597
+
1598
+ class LongcatNextAudioDecoder(nn.Module):
1599
+ def __init__(self, config):
1600
+ super().__init__()
1601
+ self.config = config
1602
+ self.vocoder_config = config.vocoder_config
1603
+ self.max_source_positions = self.config.max_audio_seconds * self.config.sampling_rate // self.config.hop_length
1604
+
1605
+ self.dconv1 = CasualConvTranspose1d(
1606
+ self.config.d_model,
1607
+ self.config.d_model,
1608
+ self.config.decoder_kernel_size,
1609
+ self.config.avg_pooler,
1610
+ )
1611
+ self.register_buffer("positional_embedding", sinusoids(self.max_source_positions, self.config.d_model))
1612
+ # causal transformer layers
1613
+ self.layers = nn.ModuleList(
1614
+ [OmniWhisperTransformerLayer(
1615
+ ACT2FN[self.config.activation_function],
1616
+ self.config.d_model,
1617
+ self.config.decoder_attention_heads,
1618
+ self.config.decoder_ffn_dim,
1619
+ True # causal
1620
+ ) for _ in range(self.config.decoder_layers)
1621
+ ])
1622
+ self.layer_norm = nn.LayerNorm(self.config.d_model)
1623
+ self.dconv2 = CasualConvTranspose1d(
1624
+ self.config.d_model,
1625
+ self.vocoder_config.num_mel_bins,
1626
+ self.config.decoder_kernel_size,
1627
+ self.config.decoder_stride_size
1628
+ )
1629
+ self.post_net = MelSpecRefineNet(self.config, self.vocoder_config)
1630
+ self.gradient_checkpointing = False
1631
+
1632
+ def forward(self,
1633
+ audio_embed,
1634
+ input_length,
1635
+ mel_labels=None,
1636
+ mel_labels_length=None,
1637
+ ):
1638
+ assert (audio_embed.shape[-1] == self.config.d_model)
1639
+ audio_embed = audio_embed.to(self.layer_norm.weight) # device and type
1640
+ audio_embed, output_length = self.dconv1(audio_embed, input_length, output_dim=3) # (b, l*2, d_model)
1641
+ _, tgt_len, _ = audio_embed.size()
1642
+ if tgt_len < self.positional_embedding.shape[0]:
1643
+ current_positional_embedding = self.positional_embedding[:tgt_len]
1644
+ else:
1645
+ current_positional_embedding = self.positional_embedding
1646
+ hidden_states = (audio_embed.to(torch.float32) + current_positional_embedding).to(audio_embed.dtype)
1647
+
1648
+ # packing hidden states
1649
+ attention_mask, _ = get_sequence_mask(hidden_states, output_length)
1650
+ hidden_states = torch.masked_select(hidden_states, attention_mask).view(torch.sum(output_length), self.config.d_model)
1651
+
1652
+ for idx, encoder_layer in enumerate(self.layers):
1653
+ hidden_states = encoder_layer(hidden_states, output_length)
1654
+
1655
+ hidden_states = self.layer_norm(hidden_states)
1656
+ hidden_states_before_dconv2 = hidden_states
1657
+ output_length_before_dconv2 = output_length
1658
+
1659
+ coarse_mel, output_length = self.dconv2(hidden_states, output_length, output_dim=3)
1660
+ refined_mel, coarse_mel, mel_labels_length = self.post_net(coarse_mel, output_length, mel_labels_length)
1661
+
1662
+ return OmniAudioDecoderOutput(
1663
+ refined_mel=refined_mel,
1664
+ coarse_mel=coarse_mel,
1665
+ mel_length=mel_labels_length,
1666
+ hidden_states_before_dconv2=hidden_states_before_dconv2,
1667
+ output_length_before_dconv2=output_length_before_dconv2,
1668
+ )
1669
+
1670
+
1671
+ class LongcatNextAudioVQBridger(nn.Module):
1672
+ def __init__(self, config):
1673
+ super().__init__()
1674
+ self.config = config
1675
+ self.gradient_checkpointing = False
1676
+ self.intermediate_dim = self.config.d_model * self.config.avg_pooler
1677
+ self.gate_proj = nn.Conv1d(self.config.d_model, self.intermediate_dim, self.config.avg_pooler, self.config.avg_pooler, bias=False)
1678
+ self.up_proj = nn.Conv1d(self.config.d_model, self.intermediate_dim, self.config.avg_pooler, self.config.avg_pooler, bias=False)
1679
+
1680
+ self.down_proj = nn.Linear(self.intermediate_dim, self.intermediate_dim, bias=False)
1681
+ self.act_fn = ACT2FN['silu']
1682
+ self.layer_norm = nn.LayerNorm(self.intermediate_dim)
1683
+ self.proj_decoder = nn.Linear(self.intermediate_dim, self.config.d_model)
1684
+
1685
+ self.vq_list = nn.ModuleList([])
1686
+ for idx, codebook_size in enumerate(self.config.vq_config.codebook_sizes):
1687
+ vq_config = copy.deepcopy(self.config.vq_config)
1688
+ vq_config.dim = self.intermediate_dim
1689
+ vq_config.codebook_size = codebook_size
1690
+ self.vq_list.append(VectorQuantize(vq_config))
1691
+
1692
+ def rvq_op(self, inputs, output_length):
1693
+ def rvq_layer_op(vq_layer, residual_encoding, output_length):
1694
+ q_v_i, code_ids_i = vq_layer(residual_encoding, output_length)
1695
+ residual_encoding = residual_encoding.float() - q_v_i.float()
1696
+ residual_encoding = residual_encoding.to(inputs.dtype)
1697
+ return residual_encoding, code_ids_i
1698
+
1699
+ cmt_loss, residual_encoding = 0, inputs
1700
+ code_ids_list = []
1701
+ for i, vq_layer in enumerate(self.vq_list):
1702
+ residual_encoding, code_ids_i = rvq_layer_op(vq_layer, residual_encoding, output_length)
1703
+ code_ids_list.append(code_ids_i)
1704
+ return torch.stack(code_ids_list, -1)
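+ # Residual VQ: each codebook quantizes whatever the previous stage left over
+ # (the residual is updated after every layer), so the stacked result holds one
+ # code index per codebook for every frame, shape (..., len(self.vq_list)).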
1705
+
1706
+ def forward(self, x, output_length):
1707
+ batch_size, _, _ = x.shape
1708
+ output_length = output_length.to(x.device)
1709
+
1710
+ if x.shape[1] % self.config.avg_pooler != 0:
1711
+ x = F.pad(x, (0, 0, 0, self.config.avg_pooler - x.shape[1] % self.config.avg_pooler), "constant", 0)
1712
+ xt = x.permute(0, 2, 1)
1713
+ g = self.gate_proj(xt).permute(0, 2, 1) # (bs, sl//pooler_size+1, d*2)
1714
+ u = self.up_proj(xt).permute(0, 2, 1)
1715
+ x = x.reshape(batch_size, -1, self.intermediate_dim) # (bs, sl//pooler_size+1, d*2)
1716
+
1717
+ c = self.down_proj(self.act_fn(g) * u)
1718
+ res = self.layer_norm(c + x)
1719
+ valid_mask, _ = get_sequence_mask(res, output_length)
1720
+ code_ids = self.rvq_op(res, output_length)
1721
+ code_ids = torch.masked_select(code_ids, valid_mask).reshape(-1, len(self.vq_list)) # (sum(valid_sequence_length), vq_num)
1722
+ return code_ids
1723
+
1724
+ @torch.no_grad()
1725
+ def decode(self, code_ids):
1726
+ vq_num = code_ids.shape[-1]
1727
+ res = sum(self.vq_list[i].get_output_from_indices(code_ids[:, i]).float() for i in range(vq_num-1,-1,-1)).to(self.proj_decoder.weight)
1728
+ decoder_emb = self.proj_decoder(res.to(self.proj_decoder.weight))
1729
+ return decoder_emb
1730
+
1731
+ @torch.no_grad()
1732
+ def recover(self, code_ids):
1733
+ vq_num = code_ids.shape[-1]
1734
+ res = sum(self.vq_list[i].get_output_from_indices(code_ids[:, i]).float() for i in range(vq_num-1,-1,-1)).to(self.proj_decoder.weight)
1735
+ return res
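+ # `decode` projects the summed codebook embeddings back to d_model through
+ # proj_decoder, whereas `recover` returns the raw summed RVQ reconstruction
+ # without that final projection.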
1736
+
1737
+
1738
+ class FlowmatchingPrenet(nn.Module):
1739
+ def __init__(
1740
+ self,
1741
+ input_feat_dim,
1742
+ out_feat_dim,
1743
+ d_model,
1744
+ attention_heads,
1745
+ ffn_dim,
1746
+ nlayers,
1747
+ activation_function,
1748
+ max_source_positions,
1749
+ target_mel_length_scale_ratio,
1750
+ ):
1751
+ super().__init__()
1752
+
1753
+ self.d_model = d_model
1754
+ self.target_mel_length_scale_ratio = target_mel_length_scale_ratio
1755
+ self.gradient_checkpointing = False
1756
+
1757
+ self.register_buffer(
1758
+ "positional_embedding", sinusoids(max_source_positions, d_model)
1759
+ )
1760
+
1761
+ self.in_mlp = nn.Sequential(
1762
+ nn.Linear(input_feat_dim, d_model * 4),
1763
+ nn.SiLU(),
1764
+ nn.Linear(d_model * 4, d_model),
1765
+ )
1766
+
1767
+ self.transformer_layers = nn.ModuleList(
1768
+ [
1769
+ OmniWhisperTransformerLayer(
1770
+ act=ACT2FN[activation_function],
1771
+ d_model=d_model,
1772
+ encoder_attention_heads=attention_heads,
1773
+ encoder_ffn_dim=ffn_dim,
1774
+ causal=True, # causal
1775
+ ln_type="RMSNorm",
1776
+ )
1777
+ for _ in range(nlayers)
1778
+ ]
1779
+ )
1780
+
1781
+ self.final_norm = RMSNorm(self.d_model)
1782
+ self.out_proj = nn.Linear(d_model, out_feat_dim, bias=False)
1783
+
1784
+ def compute_output_length(self, input_length):
1785
+ output_length = input_length.float() * self.target_mel_length_scale_ratio
1786
+ return output_length.to(torch.int64)
1787
+
1788
+ def forward(self, input_feat, input_length, output_length=None):
1789
+ """
1790
+ Args:
1791
+ input_feat: [B, T, input_feat_dim]
1792
+ input_length: [B]
1793
+ output_length: [B]
1794
+
1795
+ """
1796
+ if output_length is None or not self.training:
1797
+ output_length = self.compute_output_length(input_length)
1798
+
1799
+ input_feat = input_feat[:, : input_length.max(), :] # [B, T, D]
1800
+ orig_dtype = input_feat.dtype
1801
+
1802
+ input_feat = F.interpolate(
1803
+ input=input_feat.to(torch.float32).transpose(1, 2).contiguous(),
1804
+ size=output_length.max(),
1805
+ mode="nearest",
1806
+ ).to(orig_dtype)
1807
+ input_feat = input_feat.transpose(1, 2).contiguous() # [B, T, D]
1808
+ hidden_states = self.in_mlp(input_feat)
1809
+
1810
+ # packing hidden states
1811
+ bsz, tgt_len, d_model = hidden_states.shape
1812
+ attention_mask, unpacking_index = get_sequence_mask(
1813
+ hidden_states, output_length
1814
+ )
1815
+ hidden_states = torch.masked_select(hidden_states, attention_mask).view(
1816
+ torch.sum(output_length), self.d_model
1817
+ )
1818
+
1819
+ for idx, encoder_layer in enumerate(self.transformer_layers):
1820
+ hidden_states = encoder_layer(hidden_states, output_length)
1821
+
1822
+ # unpacking
1823
+ hidden_states = torch.index_select(hidden_states, 0, unpacking_index).view(
1824
+ bsz, tgt_len, d_model
1825
+ )
1826
+ hidden_states = torch.where(attention_mask, hidden_states, 0)
1827
+
1828
+ hidden_states = self.final_norm(hidden_states)
1829
+ output = self.out_proj(hidden_states)
1830
+ return output, output_length
1831
+
1832
+
1833
+ @dataclass
1834
+ class OmniAudioFlowMatchingDecoderOutput(ModelOutput):
1835
+ flow_matching_mel: Optional[torch.FloatTensor] = None
1836
+ flow_matching_mel_lengths: Optional[torch.FloatTensor] = None
1837
+
1838
+
1839
+ class LongcatNextAudioFlowMatchingDecoder(nn.Module):
1840
+ def __init__(self, config):
1841
+ super().__init__()
1842
+ self.config = config.flow_matching_config
1843
+ self.in_channels = self.config.in_channels
1844
+ self.spk_emb_dim = self.config.spk_emb_dim
1845
+ self.diffusion_steps = self.config.diffusion_steps
1846
+ self.cal_mel_mae = self.config.cal_mel_mae
1847
+ self.forward_step = -1
1848
+
1849
+ self.prenet = FlowmatchingPrenet(
1850
+ input_feat_dim=self.config.prenet_in_dim,
1851
+ out_feat_dim=self.config.prenet_out_dim,
1852
+ d_model=self.config.prenet_d_model,
1853
+ attention_heads=self.config.prenet_attention_heads,
1854
+ ffn_dim=self.config.prenet_ffn_dim,
1855
+ nlayers=self.config.prenet_nlayers,
1856
+ activation_function=self.config.prenet_activation_function,
1857
+ max_source_positions=self.config.prenet_max_source_positions,
1858
+ target_mel_length_scale_ratio=self.config.prenet_target_mel_length_scale_ratio,
1859
+ )
1860
+
1861
+ self.conditional_decoder = ConditionalDecoder(
1862
+ in_channels=self.in_channels * 2 + self.spk_emb_dim,
1863
+ out_channels=self.in_channels,
1864
+ causal=True,
1865
+ channels=self.config.channels,
1866
+ dropout=self.config.dropout,
1867
+ attention_head_dim=self.config.attention_head_dim,
1868
+ n_blocks=self.config.n_blocks,
1869
+ num_mid_blocks=self.config.num_mid_blocks,
1870
+ num_heads=self.config.num_heads,
1871
+ act_fn=self.config.act_fn,
1872
+ )
1873
+
1874
+ self.cfm = ConditionalCFM(
1875
+ in_channels=self.in_channels,
1876
+ cfm_params=self.config.cfm_params,
1877
+ n_spks=0,
1878
+ spk_emb_dim=self.spk_emb_dim,
1879
+ )
1880
+
1881
+
1882
+ def unpack_hidden_states(self, hidden_states, output_length):
1883
+ unpacked = unpack_hidden_states(hidden_states, output_length)
1884
+ return unpacked, output_length
1885
+
1886
+ def forward(
1887
+ self, refined_mel, input_length, mel_labels=None, mel_labels_length=None
1888
+ ):
1889
+ """
1890
+ :param refined_mel: [bs, max_input_len, mel_bin]
1891
+ :param input_length: [batch_size]
1892
+ :param mel_labels: optional target mel spectrogram
+ :param mel_labels_length: [batch_size]
1893
+ :return: OmniAudioFlowMatchingDecoderOutput with the flow-matched mel and its lengths
1894
+ """
1895
+ self.forward_step += 1
1896
+
1897
+ orig_dtype = refined_mel.dtype
1898
+ prenet_mae_metric = torch.tensor(0.0).to(refined_mel.device)
1899
+ prenet_regression_loss = torch.tensor(0.0).to(refined_mel.device)
1900
+
1901
+ if self.prenet is not None:
1902
+ refined_mel = refined_mel[:, : torch.max(input_length), :]
1903
+ if mel_labels_length is None:
1904
+ mel_labels_length = self.prenet.compute_output_length(input_length)
1905
+ refined_mel, input_length = self.prenet(
1906
+ refined_mel, input_length, mel_labels_length
1907
+ )
1908
+
1909
+ float_dtype = refined_mel.dtype
1910
+ refined_mel = refined_mel.float()
1911
+ input_length = input_length.long()
1912
+
1913
+ refined_mel = refined_mel[:, : torch.max(input_length), :]
1914
+ sequence_mask, unpacking_index = get_sequence_mask(refined_mel, input_length)
1915
+ refined_mel = refined_mel.transpose(1, 2) # (bs, mel_bin, max_input_len)
1916
+ sequence_mask = sequence_mask.transpose(2, 1) # (bs, 1, sl)
1917
+
1918
+ fm_mel = self.cfm.forward(
1919
+ estimator=self.conditional_decoder,
1920
+ mu=refined_mel.to(float_dtype),
1921
+ mask=sequence_mask.float(),
1922
+ n_timesteps=self.diffusion_steps,
1923
+ )
1924
+ return OmniAudioFlowMatchingDecoderOutput(
1925
+ flow_matching_mel=fm_mel.transpose(1, 2),
1926
+ flow_matching_mel_lengths=mel_labels_length,
1927
+ )
1928
+
1929
+
1930
+ @torch.no_grad()
1931
+ def decode_wave_vocoder2(response, vocoder, audio_tokenizer):
1932
+ response_len = (response[:,:,0] == audio_tokenizer.config.audio_config.vq_config.codebook_sizes[0]).long().argmax(dim=1)
1933
+ valid_response_list = [response[i, :response_len[i], :] for i in range(response.shape[0]) if int(response_len[i])>0]
1934
+
1935
+ if len(valid_response_list)==0:
1936
+ return []
1937
+ flatten_response = torch.cat(valid_response_list, dim=0) if len(valid_response_list)>1 else valid_response_list[0]
1938
+ valid_response_len = response_len[response_len>0]
1939
+ ret = audio_tokenizer.decode(flatten_response.view(-1,response.shape[-1]),
1940
+ bridge_length=valid_response_len)
1941
+ batch_size = response.shape[0]
1942
+ valid_start = 0
1943
+ r = []
1944
+ for i in range(batch_size):
1945
+ if response_len[i]==0:
1946
+ r.append(None)
1947
+ continue
1948
+ if isinstance(ret, torch.Tensor):
1949
+ r.append(ret[valid_start:valid_start+1])
1950
+ valid_start+=1
1951
+ continue
1952
+ decode_wave = vocoder.decode(ret.flow_matching_mel[valid_start][:ret.flow_matching_mel_lengths[valid_start], :].transpose(0, 1).to(torch.float32).unsqueeze(0))
1953
+ r.append(decode_wave.cpu())
1954
+ valid_start+=1
1955
+ return r
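+ # The first codebook's EOS id (codebook_sizes[0]) marks where each response ends:
+ # argmax over the boolean match finds the first EOS per sample, samples with no
+ # valid frames get None, and the remaining chunks are decoded in one packed call
+ # to audio_tokenizer.decode before the vocoder is run per sample.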
1956
+
1957
+
1958
+ @torch.no_grad()
1959
+ def decode_save_concat2(response_list, vocoder, model, path, sampling_rate=16000, wave_concat_overlap=800):
1960
+ wave_list = []
1961
+ for response in response_list:
1962
+ wave_list.extend([wave_i for wave_i in decode_wave_vocoder2(response, vocoder, model) if wave_i is not None])
1963
+ new_wave_list = [wave_list[0]]
1964
+ for w in wave_list[1:]:
1965
+ if new_wave_list[-1].shape[1] > wave_concat_overlap and w.shape[1] > wave_concat_overlap:
1966
+ new_wave_list.append((new_wave_list[-1][:, -wave_concat_overlap:] * torch.linspace(1.0, 0.0, wave_concat_overlap, device=new_wave_list[-1].device)[None, :]
1967
+ + w[:, :wave_concat_overlap] * torch.linspace(0.0, 1.0, wave_concat_overlap, device=new_wave_list[-1].device)[None, :]))
1968
+ new_wave_list.append(w)
1969
+ full_wave = torch.cat(new_wave_list, dim=1) if len(new_wave_list) > 1 else new_wave_list[0]
1970
+ torchaudio.save(path, full_wave, sampling_rate)
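+ # Adjacent waveform chunks are joined by inserting a linear crossfade segment:
+ # the last wave_concat_overlap samples of the previous chunk fade out while the
+ # first wave_concat_overlap samples of the next chunk fade in, then all pieces
+ # are concatenated and written with torchaudio.save.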
1971
+
1972
+
1973
+ class LongcatNextAudioTokenizer(nn.Module):
1974
+
1975
+ def __init__(self, config):
1976
+ super().__init__()
1977
+ self.config = config
1978
+ self.audio_model = LongcatNextAudioEncoder(config.audio_config)
1979
+ self.audio_bridge_model = LongcatNextAudioVQBridger(config.audio_config)
1980
+ self.audio_decoder = LongcatNextAudioDecoder(config.audio_config)
1981
+ self.audio_flow_matching_decoder = LongcatNextAudioFlowMatchingDecoder(config.audio_config)
1982
+ self.cosy24kvocoder = None
1983
+
1984
+ @torch.no_grad()
1985
+ def encode(self, x, encoder_length: Optional[torch.Tensor] = None, bridge_length: Optional[torch.Tensor] = None):
1986
+ audio_emb = self.audio_model(x, encoder_length)
1987
+ audio_tokens = self.audio_bridge_model(audio_emb, bridge_length)
1988
+ return audio_tokens
1989
+
1990
+ @torch.no_grad()
1991
+ def decode(self, audio_ids, bridge_length: Optional[torch.Tensor] = None):
1992
+ audio_emb = self.audio_bridge_model.decode(audio_ids)
1993
+ audio_dec = self.audio_decoder(
1994
+ audio_emb.to(next(self.audio_decoder.parameters())), bridge_length
1995
+ )
1996
+ if self.config.audio_config.flow_matching_config.use_hidden_states_before_dconv2:
1997
+ hidden_states, hidden_states_length = (
1998
+ self.audio_flow_matching_decoder.unpack_hidden_states(
1999
+ audio_dec.hidden_states_before_dconv2,
2000
+ audio_dec.output_length_before_dconv2,
2001
+ )
2002
+ )
2003
+ audio_flow_matching_decoder_ret = self.audio_flow_matching_decoder(
2004
+ hidden_states, hidden_states_length
2005
+ )
2006
+ else:
2007
+ audio_flow_matching_decoder_ret = self.audio_flow_matching_decoder(
2008
+ audio_dec.refined_mel, audio_dec.mel_length
2009
+ )
2010
+ return audio_flow_matching_decoder_ret
2011
+
2012
+ @torch.no_grad()
2013
+ def lazy_decode_and_save(self, audio_ids, sampling_rate, wave_concat_overlap, save_path):
2014
+ if self.cosy24kvocoder is None:
2015
+ print("lazy load cosy24kvocoder ...")
2016
+ device = next(self.parameters()).device
2017
+ self.cosy24kvocoder = Cosy24kVocoder.from_pretrained(self.config.audio_config.cosy24kvocoder_config.weight_path).to(device)
2018
+
2019
+ if audio_ids[-1, 0] != self.config.audio_config.vq_config.codebook_sizes[0]: # exceed max_new_tokens
2020
+ audio_ids = F.pad(audio_ids, (0, 0, 0, 1), value=self.config.audio_config.vq_config.codebook_sizes[0])
2021
+
2022
+ audio_end_pos = [-1] + (audio_ids[:, 0] == self.config.audio_config.vq_config.codebook_sizes[0]).nonzero().view(-1).tolist()
2023
+
2024
+ audio_ids_chunk = []
2025
+ for i in range(len(audio_end_pos) - 1):
2026
+ start = audio_end_pos[i] + 1
2027
+ end = audio_end_pos[i+1] + 1
2028
+ audio_ids_chunk.append(audio_ids[start:end].unsqueeze(0))
2029
+
2030
+ audio_ids = audio_ids_chunk
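+ # The generated id stream is cut into per-utterance chunks at every EOS code
+ # (codebook_sizes[0]); if generation stopped at max_new_tokens without emitting
+ # an EOS, one is appended first so the final chunk is still terminated.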
2031
+
2032
+ decode_save_concat2(
2033
+ response_list=audio_ids,
2034
+ vocoder=self.cosy24kvocoder,
2035
+ model=self,
2036
+ path=save_path,
2037
+ sampling_rate=sampling_rate,
2038
+ wave_concat_overlap=wave_concat_overlap,
2039
+ )
modular_longcat_next_visual.py ADDED
@@ -0,0 +1,1077 @@
1
+ from typing import Iterable, Optional, Tuple
2
+
3
+ import numpy as np
4
+ from safetensors.torch import load_file
5
+ import torch
6
+ import torch.utils.checkpoint
7
+ from torch import nn
8
+ from torch.amp import autocast
9
+ from torch.nn import functional as F
10
+
11
+ from einops import rearrange
12
+ from flash_attn import flash_attn_varlen_func
13
+
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_outputs import BaseModelOutput
16
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
17
+ Qwen2RMSNorm,
18
+ Qwen2_5_VisionTransformerPretrainedModel,
19
+ )
20
+ from transformers.utils import logging
21
+
22
+ from .image_refiner import (
23
+ ImageRefinerContainer,
24
+ RefinerImageProcessor,
25
+ RefinerPipeline,
26
+ de_transform,
27
+ tensor2pil,
28
+ )
29
+ from .refiner_modules import FlowMatchEulerDiscreteScheduler
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ def uniform_init(*shape):
35
+ t = torch.zeros(shape)
36
+ nn.init.kaiming_uniform_(t)
37
+ return t
38
+
39
+ class VQEmbedding(nn.Module):
40
+ """VQ embedding module with ema update."""
41
+
42
+ def __init__(self, n_embed, embed_dim, ema=True, decay=0.99, restart_unused_codes=True, eps=1e-5, init_std=0.02):
43
+ super().__init__()
44
+
45
+ self.ema = ema
46
+ self.decay = decay
47
+ self.eps = eps
48
+ self.restart_unused_codes = restart_unused_codes
49
+ self.n_embed = n_embed
50
+ self.init_std = init_std
51
+
52
+ assert self.ema
53
+ embed = uniform_init(n_embed + 1, embed_dim).to(torch.float32)
54
+ self.embed = nn.Parameter(embed)
55
+ self.embed_ema = nn.Parameter(embed[:-1, :].clone())
56
+ self.cluster_size_ema = nn.Parameter(torch.ones(n_embed))
57
+ del embed
58
+ _ = [p.requires_grad_(False) for p in self.parameters()]
59
+
60
+ @torch.no_grad()
61
+ def compute_distances(self, inputs):
62
+ codebook_t = self.embed[:-1, :].t()
63
+
64
+ (embed_dim, _) = codebook_t.shape
65
+ inputs_shape = inputs.shape
66
+ assert inputs_shape[-1] == embed_dim
67
+
68
+ inputs_flat = inputs.reshape(-1, embed_dim)
69
+
70
+ inputs_norm_sq = inputs_flat.pow(2.).sum(dim=1, keepdim=True)
71
+ codebook_t_norm_sq = codebook_t.pow(2.).sum(dim=0, keepdim=True)
72
+ distances = torch.addmm(
73
+ inputs_norm_sq + codebook_t_norm_sq,
74
+ inputs_flat,
75
+ codebook_t,
76
+ alpha=-2.0,
77
+ )
78
+ distances = distances.reshape(*inputs_shape[:-1], -1) # [B, h, w, n_embed or n_embed+1]
79
+ return distances
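+ # Squared Euclidean distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b,
+ # computed against every codebook entry in a single torch.addmm (alpha=-2.0).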
80
+
81
+ @torch.no_grad()
82
+ def find_nearest_embedding(self, inputs):
83
+ distances = self.compute_distances(inputs) # [B, h, w, n_embed or n_embed+1]
84
+ embed_idxs = distances.argmin(dim=-1) # use padding index or not
85
+
86
+ return embed_idxs
87
+
88
+ @autocast('cuda', enabled=True, dtype=torch.float32)
89
+ @torch.no_grad()
90
+ def forward(self, inputs):
91
+ if inputs.dtype != torch.float32:
92
+ inputs = inputs.to(torch.float32)
93
+ embed_idxs = self.find_nearest_embedding(inputs)
94
+ embeds = self.embed[embed_idxs]
95
+ return embeds, embed_idxs
96
+
97
+
98
+ class RQBottleneck(nn.Module):
99
+ """
100
+ Quantization bottleneck via Residual Quantization.
101
+
102
+ Arguments:
103
+ latent_shape (Tuple[int, int, int]): the shape of latents, denoted (H, W, D)
104
+ code_shape (Tuple[int, int, int]): the shape of codes, denoted (h, w, d)
105
+ n_embed (int, List, or Tuple): the number of embeddings (i.e., the size of codebook)
106
+ If isinstance(n_embed, int), the sizes of all codebooks are same.
107
+ shared_codebook (bool): If True, codebooks are shared in all locations. If False,
108
+ uses separate codebooks along the ``depth'' dimension. (default: False)
109
+ restart_unused_codes (bool): If True, it randomly assigns a feature vector in the current batch
110
+ as the new embedding of unused codes in training. (default: True)
111
+ """
112
+
113
+ def __init__(self,
114
+ latent_shape,
115
+ code_shape,
116
+ n_embed,
117
+ decay=0.99,
118
+ shared_codebook=False,
119
+ restart_unused_codes=True,
120
+ commitment_loss='cumsum'
121
+ ):
122
+ super().__init__()
123
+
124
+ if not len(code_shape) == len(latent_shape) == 3:
125
+ raise ValueError("incompatible code shape or latent shape")
126
+ if any([y % x != 0 for x, y in zip(code_shape[:2], latent_shape[:2])]):
127
+ raise ValueError("incompatible code shape or latent shape")
128
+
129
+ # residual quantization does not divide feature dims for quantization.
130
+ embed_dim = np.prod(latent_shape[:2]) // np.prod(code_shape[:2]) * latent_shape[2]
131
+
132
+ self.latent_shape = torch.Size(latent_shape)
133
+ self.code_shape = torch.Size(code_shape)
134
+ self.shape_divisor = torch.Size([latent_shape[i] // code_shape[i] for i in range(len(latent_shape))])
135
+
136
+ self.shared_codebook = shared_codebook
137
+ if self.shared_codebook:
138
+ if isinstance(n_embed, Iterable) or isinstance(decay, Iterable):
139
+ raise ValueError("Shared codebooks are incompatible \
140
+ with list types of momentums or sizes: Change it into int")
141
+
142
+ self.restart_unused_codes = restart_unused_codes
143
+ self.n_embed = n_embed if isinstance(n_embed, Iterable) else [n_embed for _ in range(self.code_shape[-1])]
144
+ self.decay = decay if isinstance(decay, Iterable) else [decay for _ in range(self.code_shape[-1])]
145
+ assert len(self.n_embed) == self.code_shape[-1]
146
+ assert len(self.decay) == self.code_shape[-1]
147
+
148
+ if self.shared_codebook:
149
+ codebook0 = VQEmbedding(self.n_embed[0],
150
+ embed_dim,
151
+ decay=self.decay[0],
152
+ restart_unused_codes=restart_unused_codes,
153
+ ).to(torch.float32)
154
+ self.codebooks = nn.ModuleList([codebook0 for _ in range(self.code_shape[-1])])
155
+ else:
156
+ codebooks = [VQEmbedding(self.n_embed[idx],
157
+ embed_dim,
158
+ decay=self.decay[idx],
159
+ restart_unused_codes=restart_unused_codes,
160
+ ).to(torch.float32) for idx in range(self.code_shape[-1])]
161
+ self.codebooks = nn.ModuleList(codebooks)
162
+
163
+ self.commitment_loss = commitment_loss
164
+
165
+ def to_code_shape(self, x):
166
+ (B, H, W, D) = x.shape
167
+ (rH, rW, _) = self.shape_divisor
168
+
169
+ x = x.reshape(B, H//rH, rH, W//rW, rW, D)
170
+ x = x.permute(0, 1, 3, 2, 4, 5)
171
+ x = x.reshape(B, H//rH, W//rW, -1)
172
+
173
+ return x
174
+
175
+ def to_latent_shape(self, x):
176
+ (B, h, w, _) = x.shape
177
+ (_, _, D) = self.latent_shape
178
+ (rH, rW, _) = self.shape_divisor
179
+
180
+ x = x.reshape(B, h, w, rH, rW, D)
181
+ x = x.permute(0, 1, 3, 2, 4, 5)
182
+ x = x.reshape(B, h*rH, w*rW, D)
183
+
184
+ return x
185
+
186
+ def quantize(self, x):
187
+ r"""
188
+ Return list of quantized features and the selected codewords by the residual quantization.
189
+ The code is selected by the residuals between x and quantized features by the previous codebooks.
190
+
191
+ Arguments:
192
+ x (Tensor): bottleneck feature maps to quantize.
193
+
194
+ Returns:
195
+ quant_list (list): list of sequentially aggregated and quantized feature maps by codebooks.
196
+ codes (LongTensor): codewords index, corresponding to quants.
197
+
198
+ Shape:
199
+ - x: (B, h, w, embed_dim)
200
+ - quant_list[i]: (B, h, w, embed_dim)
201
+ - codes: (B, h, w, d)
202
+ """
203
+ B, h, w, embed_dim = x.shape
204
+ ori_dtype = x.dtype
205
+ x = x.to(torch.float32)
206
+ self.codebooks = self.codebooks.to(torch.float32)
207
+
208
+ residual_feature = x.detach().clone()
209
+
210
+ quant_list = []
211
+ code_list = []
212
+ aggregated_quants = torch.zeros_like(x)
213
+ for i in range(self.code_shape[-1]):
214
+ quant, code = self.codebooks[i](residual_feature)
215
+ residual_feature.sub_(quant)
216
+ aggregated_quants.add_(quant)
217
+ quant_list.append(aggregated_quants.clone().to(dtype=ori_dtype))
218
+ code_list.append(code.unsqueeze(-1))
219
+
220
+ codes = torch.cat(code_list, dim=-1)
221
+ return quant_list, codes
222
+
223
+ def forward(self, x):
224
+ x_reshaped = self.to_code_shape(x)
225
+ # force execution in float32 precision
226
+ quant_list, codes = self.quantize(x_reshaped)
227
+ # quant_list, codes = self.quantize(x_reshaped)
228
+
229
+ commitment_loss = self.compute_commitment_loss(x_reshaped, quant_list)
230
+ quants_trunc = self.to_latent_shape(quant_list[-1])
231
+ quants_trunc = x + (quants_trunc - x).detach()
232
+
233
+ '''
234
+ if self.shared_codebook:
235
+ cur_len = codes.view(-1).shape[0]
236
+ self.codebook_used[:-cur_len] = self.codebook_used[cur_len:].clone()
237
+ self.codebook_used[-cur_len:] = codes.view(-1)
238
+ codebook_usage = len(torch.unique(self.codebook_used)) / self.n_embed[0]
239
+ else:
240
+ # info|code: torch.Size([10, 16, 16, 4])
241
+ codebook_usage = 0
242
+ for idx in range(self.code_shape[-1]):
243
+ cur_len = codes[..., idx].view(-1).shape[0]
244
+ self.codebook_used[idx, :-cur_len] = self.codebook_used[idx, cur_len:].clone()
245
+ self.codebook_used[idx, -cur_len:] = codes[..., idx].view(-1)
246
+ codebook_usage += len(torch.unique(self.codebook_used[idx]))
247
+ codebook_usage /= (self.n_embed[0] * self.code_shape[-1])
248
+ '''
249
+ codebook_usage = 0
250
+ # (vq_loss, commit_loss, entropy_loss, codebook_usage) # keep the return format aligned
251
+ codebook_loss = [0, commitment_loss, 0, codebook_usage]
252
+
253
+ return quants_trunc, codebook_loss, codes
254
+
255
+ def compute_commitment_loss(self, x, quant_list):
256
+ r"""
257
+ Compute the commitment loss for the residual quantization.
258
+ The loss is iteratively computed by aggregating quantized features.
259
+ """
260
+ loss_list = []
261
+
262
+ for idx, quant in enumerate(quant_list):
263
+ partial_loss = (x-quant.detach()).pow(2.0).mean()
264
+ loss_list.append(partial_loss)
265
+
266
+ commitment_loss = torch.mean(torch.stack(loss_list))
267
+ return commitment_loss
268
+
269
+
270
+
271
+ class Qwen2_5_VisionRotaryEmbedding_Modified(nn.Module):
272
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
273
+ super().__init__()
274
+ self.inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
275
+ # self.register_buffer("inv_freq", inv_freq, persistent=False)
276
+
277
+ def forward(self, seqlen: int, device: torch.device) -> torch.Tensor:
278
+ self.inv_freq = self.inv_freq.to(device)
279
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
280
+ freqs = torch.outer(seq, self.inv_freq)
281
+ return freqs
282
+
283
+ class VisualEncoder(Qwen2_5_VisionTransformerPretrainedModel):
284
+
285
+ def __init__(self, config):
286
+ config._attn_implementation = 'flash_attention_2'
287
+ super().__init__(config)
288
+ self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding_Modified(config.hidden_size // config.num_heads // 2)
289
+ self.gradient_checkpointing = False
290
+ self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint
291
+ self.merge_size = config.merge_size if hasattr(config, 'merge_size') else 2
292
+ del self.merger # register visual.merger in visual_bridge_model
293
+
294
+ def get_dtype(self) -> torch.dtype:
295
+ return self.blocks[0].mlp.down_proj.weight.dtype
296
+
297
+ def get_device(self) -> torch.device:
298
+ return self.blocks[0].mlp.down_proj.weight.device
299
+
300
+ def rot_pos_emb(self, grid_thw):
301
+ pos_ids = []
302
+ for t, h, w in grid_thw:
303
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
304
+ hpos_ids = hpos_ids.reshape(
305
+ h // self.spatial_merge_size,
306
+ self.spatial_merge_size,
307
+ w // self.spatial_merge_size,
308
+ self.spatial_merge_size,
309
+ )
310
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
311
+ hpos_ids = hpos_ids.flatten()
312
+
313
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
314
+ wpos_ids = wpos_ids.reshape(
315
+ h // self.spatial_merge_size,
316
+ self.spatial_merge_size,
317
+ w // self.spatial_merge_size,
318
+ self.spatial_merge_size,
319
+ )
320
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
321
+ wpos_ids = wpos_ids.flatten()
322
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
323
+ pos_ids = torch.cat(pos_ids, dim=0)
324
+ max_grid_size = grid_thw[:, 1:].max()
325
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size, device=grid_thw.device)
326
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
327
+ return rotary_pos_emb
328
+
329
+ def forward(
330
+ self,
331
+ pixel_values: torch.Tensor,
332
+ grid_thw: torch.Tensor,
333
+ require_window_index: bool = False,
334
+ ):
335
+ '''
336
+ pixel_values.shape=[NumOfPatches, 1176]
337
+ grid_thw.shape=[NumOfSamples, 3], each row is [grid_t, grid_h, grid_w]
338
+ '''
339
+ hidden_states = pixel_values.to(torch.bfloat16)
340
+ grid_thw = grid_thw.to(pixel_values.device)
341
+
342
+ hidden_states = self.patch_embed(hidden_states)
343
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
344
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
345
+ cu_window_seqlens = torch.tensor(
346
+ cu_window_seqlens,
347
+ device=hidden_states.device,
348
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
349
+ )
350
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
351
+
352
+ seq_len, _ = hidden_states.size()
353
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
354
+ hidden_states = hidden_states[window_index, :, :]
355
+ hidden_states = hidden_states.reshape(seq_len, -1)
356
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
357
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
358
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
359
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
360
+ position_embeddings = (emb.cos(), emb.sin())
361
+
362
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
363
+ dim=0,
364
+ # Select dtype based on the following factors:
365
+ # - FA2 requires that cu_seqlens_q must have dtype int32
366
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
367
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
368
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
369
+ )
370
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
371
+
372
+ for layer_num, blk in enumerate(self.blocks):
373
+ if layer_num in self.fullatt_block_indexes:
374
+ cu_seqlens_now = cu_seqlens
375
+ else:
376
+ cu_seqlens_now = cu_window_seqlens
377
+ if self.gradient_checkpointing and self.training:
378
+ hidden_states = self._gradient_checkpointing_func(blk.__call__, hidden_states, cu_seqlens_now, None, position_embeddings)
379
+ else:
380
+ hidden_states = blk(
381
+ hidden_states,
382
+ cu_seqlens=cu_seqlens_now,
383
+ position_embeddings=position_embeddings,
384
+ )
385
+
386
+ if require_window_index:
387
+ return hidden_states, window_index
388
+ return hidden_states
389
+
390
+
391
+ class OmniVisualBridge(nn.Module):
392
+ def __init__(self, config):
393
+ super().__init__()
394
+ self.config = config
395
+ self.merge_size = self.config.merge_size if hasattr(self.config, 'merge_size') else 2
396
+ self.hidden_size = self.config.hidden_size * (self.merge_size**2)
397
+ self.window_index = self.config.window_size
398
+ self.ln_q = Qwen2RMSNorm(self.config.hidden_size, eps=1e-6)
399
+ self.mlp = nn.Sequential(
400
+ nn.Linear(self.hidden_size, self.hidden_size),
401
+ nn.GELU(),
402
+ nn.Linear(self.hidden_size, self.config.out_hidden_size),
403
+ )
404
+
405
+ def forward(self, x: torch.Tensor, window_index) -> torch.Tensor:
406
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
407
+ reverse_indices = torch.argsort(window_index)
408
+ x = x[reverse_indices, :]
409
+
410
+ return x
411
+
412
+
413
+ class VisualQuantizer(nn.Module):
414
+ def __init__(self, quantizer_config):
415
+ super().__init__()
416
+
417
+ self.config = quantizer_config
418
+ self.depth = self.config.depth
419
+ self.decay = self.config.decay
420
+ self.codebook_size = self.config.codebook_size
421
+ self.codebook_dim = self.config.codebook_dim
422
+ self.shared_codebook = self.config.shared_codebook
423
+ self.restart_unused_codes = self.config.restart_unused_codes
424
+ self.in_channels = self.config.in_channels
425
+
426
+ self.vq_loss_ratio = self.config.vq_loss_ratio
427
+ self.entropy_loss_ratio = self.config.entropy_loss_ratio
428
+ self.commit_loss_ratio = self.config.commit_loss_ratio
429
+
430
+ code_h_w = int(448 / 14)
431
+ latent_shape = [code_h_w, code_h_w, self.codebook_dim]
432
+ code_shape = [code_h_w, code_h_w, self.depth]
433
+
434
+ self.quantize = RQBottleneck(
435
+ latent_shape=latent_shape,
436
+ code_shape=code_shape,
437
+ n_embed=self.codebook_size,
438
+ decay=self.decay,
439
+ shared_codebook=self.shared_codebook,
440
+ restart_unused_codes=self.restart_unused_codes,
441
+ )
442
+
443
+ if self.config.quant_conv:
444
+ self.quant_conv = nn.Sequential(
445
+ nn.LayerNorm(self.in_channels),
446
+ nn.Linear(self.in_channels, self.in_channels),
447
+ nn.GELU(),
448
+ nn.Linear(self.in_channels, self.codebook_dim)
449
+ )
450
+ else:
451
+ self.quant_conv = None
452
+
453
+ def encode(self, x):
454
+ L, D = x.shape
455
+ to_qnt_feat = x.clone()
456
+ to_qnt_feat = to_qnt_feat.unsqueeze(0) # [L, D] -> [1, L, D]
457
+ N = 1
458
+
459
+ if self.quant_conv is not None:
460
+ to_qnt_feat = self.quant_conv(to_qnt_feat)
461
+
462
+ # quantizer needs nchw format. N,L,d -> N,1,L,d -> N,d,1,L
463
+ to_qnt_feat = to_qnt_feat.reshape(N, 1, L, self.codebook_dim).permute(0,3,1,2)
464
+ if self.config.quantizer_type == "rq":
465
+ to_qnt_feat = to_qnt_feat.permute(0, 2, 3, 1).contiguous() # N,d,1,L -> N,1,L,d
466
+ quant, emb_loss, info = self.quantize(to_qnt_feat)
467
+ info = info.reshape(-1, info.shape[-1]) # n,h,w,lv -> n*h*w,lv
468
+ info = [None, None, info]
469
+ quant = quant.permute(0, 3, 1, 2).contiguous() # N,1,L,d -> N,d,1,L
470
+ else:
471
+ quant, emb_loss, info = self.quantize(to_qnt_feat)
472
+ return quant, emb_loss, info, x.detach()
473
+
474
+ def forward(self, x):
475
+ quant, (vq_loss, commit_loss, entropy_loss, codebook_usage), (perplexity, min_encodings, min_encoding_indices), align_feature = \
476
+ self.encode(x)
477
+ return min_encoding_indices
478
+
479
+
480
+ class MLP(nn.Module):
481
+ def __init__(
482
+ self,
483
+ hidden_size: int,
484
+ intermediate_size: int,
485
+ hidden_act: str,
486
+ ):
487
+ super().__init__()
488
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
489
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
490
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
491
+ self.act_fn = ACT2FN[hidden_act]
492
+
493
+ def forward(self, x):
494
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
495
+
496
+ class DecoderLayer(nn.Module):
497
+ def __init__(self, config):
498
+ super().__init__()
499
+ self.hidden_size = config.hidden_size
500
+ self.mlp = MLP(
501
+ hidden_size=self.hidden_size,
502
+ intermediate_size=config.visual_embedding_layer_intermediate_size,
503
+ hidden_act=config.visual_embedding_layer_hidden_act,
504
+ )
505
+ self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
506
+
507
+ def forward(
508
+ self,
509
+ hidden_states: torch.Tensor,
510
+ ):
511
+ residual = hidden_states
512
+ hidden_states = self.pre_layernorm(hidden_states)
513
+ hidden_states = self.mlp(hidden_states)
514
+ hidden_states = residual + hidden_states
515
+
516
+ return hidden_states
517
+
518
+
519
+ class VisualEmbeddingBridge(nn.Module):
520
+ def __init__(self, config):
521
+ super().__init__()
522
+ self.pre_buffer = DecoderLayer(config)
523
+
524
+ def forward(self, embedding):
525
+ return self.pre_buffer(embedding)
526
+
527
+
528
+ class VisualVQBridge(nn.Module):
529
+ def __init__(self, visual_config):
530
+ super().__init__()
531
+ self.bridge = OmniVisualBridge(visual_config)
532
+ self.quantizer = VisualQuantizer(visual_config.vq_config)
533
+
534
+ def forward(
535
+ self,
536
+ visual_embed: torch.Tensor,
537
+ window_index: torch.Tensor,
538
+ ):
539
+ visual_embed = self.bridge(visual_embed, window_index)
540
+ indices = self.quantizer(visual_embed)
541
+ return indices
542
+
543
+
544
+ class LongcatNextVisualTokenizer(nn.Module):
545
+
546
+ def __init__(self, config):
547
+ super().__init__()
548
+ self.config = config
549
+ self.visual_model = VisualEncoder(config.visual_config)
550
+ self.visual_bridge_model = VisualVQBridge(config.visual_config)
551
+ self.visual_embedding_layer = VisualEmbeddingBridge(config)
552
+ self.image_decoder = None
553
+ self._refiner_pipeline = None
554
+
555
+ @torch.no_grad()
556
+ def encode(self, pixel_values: torch.Tensor, visual_grid_thw: torch.Tensor):
557
+ visual_embed, window_index = self.visual_model(pixel_values, grid_thw=visual_grid_thw, require_window_index=True)
558
+ indices = self.visual_bridge_model(visual_embed, window_index)
559
+ return indices
560
+
561
+ @torch.no_grad()
562
+ def lazy_decode_and_save(self, visual_ids, tokens_h, tokens_w, save_path):
563
+ device = next(self.parameters()).device
564
+ if self.image_decoder is None:
565
+ print("lazy load image_decoder / image_refiner / _refiner_pipeline ...")
566
+ vdc = self.config.visual_config.visual_decoder_config
567
+ self.image_decoder = VisionTransformerDecoder.from_pretrained(
568
+ vdc.image_decoder_config,
569
+ vdc.weight_path,
570
+ ).to(device=device, dtype=torch.bfloat16)
571
+ image_refiner = ImageRefinerContainer.from_pretrained(vdc, vdc.weight_path).to(device=device, dtype=torch.bfloat16)
572
+
573
+ sc = vdc.scheduler_config
574
+ scheduler = FlowMatchEulerDiscreteScheduler(
575
+ num_train_timesteps=sc.num_train_timesteps,
576
+ dynamic_time_shift=sc.dynamic_time_shift)
577
+ self._refiner_pipeline = RefinerPipeline(
578
+ vae=image_refiner.vae,
579
+ transformer=image_refiner.base_transformer,
580
+ scheduler=scheduler,
581
+ cond_proj=image_refiner.cond_proj,
582
+ )
583
+ self._refiner_pipeline.set_progress_bar_config(disable=False)
584
+
585
+ data = torch.as_tensor(visual_ids, dtype=torch.long)
586
+ if data.ndim == 1:
587
+ data = data.view(-1, len(self.config.visual_config.vq_config.codebook_sizes))
588
+ if data.ndim == 2:
589
+ data = data.unsqueeze(0)
590
+ batch_size = data.shape[0]
591
+
592
+ quant_features = None
593
+ for idx in range(len(self.config.visual_config.vq_config.codebook_sizes)):
594
+ embed = self.visual_bridge_model.quantizer.quantize.codebooks[idx].embed
595
+ feat = embed[data[..., idx].to(embed.device)]
596
+ quant_features = feat if quant_features is None else quant_features + feat
597
+ quant_features = quant_features.to(device)
598
+
599
+ # tokens_h/tokens_w are the merged grid; expand to the full (unmerged) grid
600
+ s = self.image_decoder.spatial_merge_size
601
+ grid_thw_list = [(1, tokens_h * s, tokens_w * s)]
602
+ grid_thw_batch = list(grid_thw_list) * batch_size
603
+
604
+ image_mean = [0.48145466, 0.4578275, 0.40821073]
605
+ image_std = [0.26862954, 0.26130258, 0.27577711]
606
+
607
+ emb_2d = quant_features.reshape(-1, quant_features.shape[-1]).contiguous()
608
+ device_type = "cuda" if str(device).startswith("cuda") else str(device)
609
+ with torch.amp.autocast(device_type=device_type, enabled=True, dtype=torch.float32):
610
+ decoder_out = self.image_decoder(emb_2d, grid_thw_batch, return_pixel_features=False)
611
+
612
+ decoded_tensors = decoder_out.get("images") or []
613
+ decoded_images = [tensor2pil(t, image_mean, image_std) for t in decoded_tensors]
614
+ decoded_path = save_path.replace(".png", "_decoded.png")
615
+ # decoded_images[0].save(decoded_path)
616
+
617
+
618
+ ref_input = []
619
+ for t in decoded_tensors:
620
+ img_01 = de_transform(t, mean=image_mean, std=image_std, rescale_factor=1 / 255)
621
+ img_norm = RefinerImageProcessor.normalize(img_01)
622
+ ref_input.append(img_norm.squeeze(0).to(device))
623
+
624
+ generators = [torch.Generator(device=device).manual_seed(42 + b) for b in range(batch_size)]
625
+ out = self._refiner_pipeline(
626
+ encoder_hidden_states=quant_features,
627
+ grid_thw_list=grid_thw_list,
628
+ image=ref_input,
629
+ generator=generators[0] if batch_size == 1 else generators,
630
+ output_type="pil",
631
+ return_dict=True,
632
+ )
633
+ refined_images = out.images
634
+ refined_path = save_path.replace(".png", "_refined.png")
635
+ refined_images[0].save(refined_path)
636
+
637
+ return [refined_path]
638
+
639
+
640
+ # ---------------------------------------------------------------------------
641
+ # Vision Transformer Decoder
642
+ # ---------------------------------------------------------------------------
643
+
644
+ def _rotate_half(x):
645
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
646
+ x1, x2 = x.unbind(dim=-1)
647
+ x = torch.stack((-x2, x1), dim=-1)
648
+ return rearrange(x, "... d r -> ... (d r)")
649
+
650
+
651
+ class VisionRoPE2D(nn.Module):
652
+ """2D Rotary Position Embedding for Q/K in vision decoder attention."""
653
+
654
+ def __init__(self, theta: float = 10000.0):
655
+ super().__init__()
656
+ self.theta = theta
657
+
658
+ def _rope_half(self, x_half, pos_1d, theta):
659
+ BH, T, d_half = x_half.shape
660
+ idx = torch.arange(0, d_half, 2, device=x_half.device, dtype=torch.float32)
661
+ inv_freq = (1.0 / (theta ** (idx / d_half))).to(x_half.dtype)
662
+ angles = pos_1d.to(x_half.dtype)[:, None] * inv_freq[None, :]
663
+ cos = torch.repeat_interleave(torch.cos(angles), 2, dim=-1).unsqueeze(0)
664
+ sin = torch.repeat_interleave(torch.sin(angles), 2, dim=-1).unsqueeze(0)
665
+ return x_half * cos + _rotate_half(x_half) * sin
666
+
667
+ def forward(self, x, positions_2d):
668
+ d_half = x.shape[-1] // 2
669
+ x_y = self._rope_half(x[:, :, :d_half], positions_2d[:, 0], self.theta)
670
+ x_x = self._rope_half(x[:, :, d_half:], positions_2d[:, 1], self.theta)
671
+ return torch.cat([x_y, x_x], dim=-1)
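+ # The head dimension is split in half: the first half is rotated by the row (y)
+ # position and the second half by the column (x) position, giving a 2D rotary
+ # embedding over the patch grid.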
672
+
673
+
674
+ class VisionAttention(nn.Module):
675
+ """Multi-headed attention with 2D RoPE + FlashAttention varlen."""
676
+
677
+ def __init__(self, config, rope=None, rope_shift=0):
678
+ super().__init__()
679
+ self.config = config
680
+ self.embed_dim = config.hidden_size
681
+ self.num_heads = config.num_attention_heads
682
+ self.head_dim = self.embed_dim // self.num_heads
683
+ if self.head_dim * self.num_heads != self.embed_dim:
684
+ raise ValueError(
685
+ f"embed_dim must be divisible by num_heads (got embed_dim={self.embed_dim}, num_heads={self.num_heads})"
686
+ )
687
+ self.scale = self.head_dim ** -0.5
688
+ self.dropout = config.attention_dropout
689
+ self.subln = config.subln
690
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=getattr(config, "k_bias", True))
691
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=getattr(config, "v_bias", True))
692
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=getattr(config, "q_bias", True))
693
+ self.inner_attn_ln = Qwen2RMSNorm(self.embed_dim, eps=config.layer_norm_eps) if config.subln else nn.Identity()
694
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
695
+ self.rope = rope
696
+ self.rope_shift = int(rope_shift)
697
+
698
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
699
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
700
+
701
+ def _maybe_flash_attention(self, query_states, key_states, value_states, seq_lens, training):
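+ # Best-effort FlashAttention path: Q/K/V are repacked into the varlen layout
+ # expected by flash_attn_varlen_func (cumulative sequence lengths, fp16/bf16),
+ # and any mismatch or exception returns None so the caller falls back to the
+ # dense bmm + softmax attention below.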
702
+ if not (query_states.is_cuda and (query_states.dtype in (torch.float16, torch.bfloat16, torch.float32))):
703
+ return None
704
+ if seq_lens is None:
705
+ return None
706
+ try:
707
+ BxH, T, hd = query_states.shape
708
+ H = self.num_heads
709
+ assert BxH % H == 0
710
+ B = BxH // H
711
+ if int(seq_lens.sum().item()) != T:
712
+ return None
713
+ q = query_states.view(B, H, T, hd).transpose(1, 2).reshape(-1, H, hd).contiguous()
714
+ k = key_states.view(B, H, T, hd).transpose(1, 2).reshape(-1, H, hd).contiguous()
715
+ v = value_states.view(B, H, T, hd).transpose(1, 2).reshape(-1, H, hd).contiguous()
716
+ cu_q = torch.zeros(seq_lens.numel() + 1, dtype=torch.int32, device=seq_lens.device)
717
+ cu_q[1:] = torch.cumsum(seq_lens.to(torch.int32), dim=0)
718
+ cu_k = cu_q
719
+ max_seqlen = int(seq_lens.max().item())
720
+ orig_dtype = q.dtype
721
+ use_dtype = q.dtype if q.dtype in (torch.float16, torch.bfloat16) else torch.float16
722
+ if q.dtype != use_dtype:
723
+ q = q.to(use_dtype)
724
+ k = k.to(use_dtype)
725
+ v = v.to(use_dtype)
726
+ out = flash_attn_varlen_func(
727
+ q, k, v, cu_q, cu_k, max_seqlen, max_seqlen,
728
+ dropout_p=self.dropout if training else 0.0,
729
+ softmax_scale=None, causal=False, return_attn_probs=False
730
+ )
731
+ if out.dtype != orig_dtype:
732
+ out = out.to(orig_dtype)
733
+ return out.view(B, -1, H, hd).transpose(1, 2).contiguous().view(B * H, T, hd)
734
+ except Exception:
735
+ return None
736
+
737
+ def forward(
738
+ self,
739
+ hidden_states: torch.Tensor,
740
+ attention_mask: Optional[torch.Tensor] = None,
741
+ causal_attention_mask: Optional[torch.Tensor] = None,
742
+ output_attentions: Optional[bool] = False,
743
+ positions_2d: Optional[torch.Tensor] = None,
744
+ seq_lens: Optional[torch.Tensor] = None,
745
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
746
+ bsz, tgt_len, embed_dim = hidden_states.size()
747
+ query_states = self.q_proj(hidden_states) * self.scale
748
+ key_states = self.k_proj(hidden_states)
749
+ value_states = self.v_proj(hidden_states)
750
+ query_states = self._shape(query_states, tgt_len, bsz).view(bsz * self.num_heads, tgt_len, self.head_dim)
751
+ key_states = self._shape(key_states, tgt_len, bsz).view(bsz * self.num_heads, tgt_len, self.head_dim)
752
+ value_states = self._shape(value_states, tgt_len, bsz).view(bsz * self.num_heads, tgt_len, self.head_dim)
753
+ if self.rope is not None and positions_2d is not None:
754
+ if self.rope_shift > 0:
755
+ q_pref = query_states[:, :self.rope_shift, :]
756
+ k_pref = key_states[:, :self.rope_shift, :]
757
+ q_rot = self.rope(query_states[:, self.rope_shift:, :], positions_2d[self.rope_shift:])
758
+ k_rot = self.rope(key_states[:, self.rope_shift:, :], positions_2d[self.rope_shift:])
759
+ query_states = torch.cat([q_pref, q_rot], dim=1).type_as(value_states)
760
+ key_states = torch.cat([k_pref, k_rot], dim=1).type_as(value_states)
761
+ else:
762
+ query_states = self.rope(query_states, positions_2d).type_as(value_states)
763
+ key_states = self.rope(key_states, positions_2d).type_as(value_states)
764
+ attn_output = self._maybe_flash_attention(
765
+ query_states, key_states, value_states, seq_lens=seq_lens, training=self.training
766
+ )
767
+ if attn_output is not None:
768
+ attn_weights_reshaped = None
769
+ else:
770
+ src_len = key_states.size(1)
771
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
772
+ if causal_attention_mask is not None:
773
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
774
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
775
+ if attention_mask is not None:
776
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
777
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
778
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
779
+ if output_attentions:
780
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
781
+ else:
782
+ attn_weights_reshaped = None
783
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
784
+ attn_output = torch.bmm(attn_probs, value_states)
785
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
786
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, tgt_len, embed_dim)
787
+ attn_output = self.inner_attn_ln(attn_output)
788
+ attn_output = self.out_proj(attn_output)
789
+ return attn_output, attn_weights_reshaped
790
+
791
+
792
+ class VisionSwiGLU(nn.Module):
793
+ def __init__(self, config):
794
+ super().__init__()
795
+ self.config = config
796
+ self.hidden_size = config.hidden_size
797
+ self.intermediate_size = config.intermediate_size
798
+ self.w1 = nn.Linear(self.hidden_size, self.intermediate_size)
799
+ self.w2 = nn.Linear(self.hidden_size, self.intermediate_size)
800
+ self.w3 = nn.Linear(self.intermediate_size, self.hidden_size)
801
+ self.act_fn = nn.SiLU()
802
+ self.ffn_ln = Qwen2RMSNorm(self.intermediate_size, eps=config.layer_norm_eps) if config.subln else nn.Identity()
803
+
804
+ def forward(self, x):
805
+ x1 = self.w1(x)
806
+ x2 = self.w2(x)
807
+ hidden = self.act_fn(x1) * x2
808
+ x = self.ffn_ln(hidden)
809
+ x = self.w3(x)
810
+ return x
811
+
812
+
813
+ class VisionMLP(nn.Module):
814
+ def __init__(self, config):
815
+ super().__init__()
816
+ self.config = config
817
+ self.activation_fn = ACT2FN[config.hidden_act]
818
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
819
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
820
+ self.ffn_ln = Qwen2RMSNorm(config.intermediate_size, eps=config.layer_norm_eps) if config.subln else nn.Identity()
821
+
822
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
823
+ hidden_states = self.fc1(hidden_states)
824
+ hidden_states = self.activation_fn(hidden_states)
825
+ hidden_states = self.ffn_ln(hidden_states)
826
+ hidden_states = self.fc2(hidden_states)
827
+ return hidden_states
828
+
829
+
830
+ class VisionEncoderLayer(nn.Module):
831
+ def __init__(self, config, rope=None, rope_shift=0):
832
+ super().__init__()
833
+ self.embed_dim = config.hidden_size
834
+ self.self_attn = VisionAttention(config, rope=rope, rope_shift=rope_shift)
835
+ self.layer_norm1 = Qwen2RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
836
+ self.mlp = VisionSwiGLU(config) if config.swiglu else VisionMLP(config)
837
+ self.layer_norm2 = Qwen2RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
838
+
839
+ def forward(
840
+ self,
841
+ hidden_states: torch.Tensor,
842
+ attention_mask: Optional[torch.Tensor],
843
+ causal_attention_mask: Optional[torch.Tensor],
844
+ output_attentions: Optional[bool] = False,
845
+ positions_2d: Optional[torch.Tensor] = None,
846
+ seq_lens: Optional[torch.Tensor] = None,
847
+ ) -> Tuple[torch.FloatTensor, Optional[torch.Tensor]]:
848
+ residual = hidden_states
849
+ hidden_states = self.layer_norm1(hidden_states)
850
+ hidden_states, attn_weights = self.self_attn(
851
+ hidden_states=hidden_states,
852
+ attention_mask=attention_mask,
853
+ causal_attention_mask=causal_attention_mask,
854
+ output_attentions=output_attentions,
855
+ positions_2d=positions_2d,
856
+ seq_lens=seq_lens,
857
+ )
858
+ hidden_states = residual + hidden_states
859
+ residual = hidden_states
860
+ hidden_states = self.layer_norm2(hidden_states)
861
+ hidden_states = self.mlp(hidden_states)
862
+ hidden_states = residual + hidden_states
863
+ outputs = (hidden_states,)
864
+ if output_attentions:
865
+ outputs += (attn_weights,)
866
+ return outputs
867
+
868
+
869
+ class VisionEncoder(nn.Module):
870
+ def __init__(self, config, rope=None, rope_shift=0):
871
+ super().__init__()
872
+ self.config = config
873
+ self.layers = nn.ModuleList(
874
+ [VisionEncoderLayer(config, rope=rope, rope_shift=rope_shift) for _ in range(config.num_hidden_layers)]
875
+ )
876
+ self.gradient_checkpointing = False
877
+ self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint
878
+
879
+ def forward(
880
+ self,
881
+ inputs_embeds: torch.Tensor,
882
+ attention_mask: Optional[torch.Tensor] = None,
883
+ causal_attention_mask: Optional[torch.Tensor] = None,
884
+ output_attentions: Optional[bool] = None,
885
+ output_hidden_states: Optional[bool] = None,
886
+ return_dict: Optional[bool] = None,
887
+ positions_2d: Optional[torch.Tensor] = None,
888
+ seq_lens: Optional[torch.Tensor] = None,
889
+ ):
890
+ output_attentions = output_attentions if output_attentions is not None else False
891
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
892
+ return_dict = True if return_dict is None else return_dict
893
+
894
+ encoder_states = () if output_hidden_states else None
895
+ all_attentions = () if output_attentions else None
896
+ hidden_states = inputs_embeds
897
+
898
+ for layer in self.layers:
899
+ if output_hidden_states:
900
+ encoder_states = encoder_states + (hidden_states,)
901
+ if self.gradient_checkpointing and self.training:
902
+ def custom_forward(hs, attn, causal, pos2d, seqlens):
903
+ return layer(
904
+ hs,
905
+ attention_mask=attn,
906
+ causal_attention_mask=causal,
907
+ output_attentions=False,
908
+ positions_2d=pos2d,
909
+ seq_lens=seqlens,
910
+ )[0]
911
+ hidden_states = self._gradient_checkpointing_func(
912
+ custom_forward,
913
+ hidden_states,
914
+ attention_mask if attention_mask is not None else torch.tensor(0., device=hidden_states.device),
915
+ causal_attention_mask if causal_attention_mask is not None else torch.tensor(0., device=hidden_states.device),
916
+ positions_2d,
917
+ seq_lens if seq_lens is not None else torch.tensor([], device=hidden_states.device),
918
+ use_reentrant=False,
919
+ )
920
+ else:
921
+ layer_outputs = layer(
922
+ hidden_states,
923
+ attention_mask,
924
+ causal_attention_mask,
925
+ output_attentions=output_attentions,
926
+ positions_2d=positions_2d,
927
+ seq_lens=seq_lens,
928
+ )
929
+ hidden_states = layer_outputs[0]
930
+ if output_attentions:
931
+ all_attentions = all_attentions + (layer_outputs[1],)
932
+
933
+ if output_hidden_states:
934
+ encoder_states = encoder_states + (hidden_states,)
935
+
936
+ if not return_dict:
937
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
938
+
939
+ return BaseModelOutput(
940
+ last_hidden_state=hidden_states,
941
+ hidden_states=encoder_states,
942
+ attentions=all_attentions,
943
+ )
944
+
945
+
946
+ class PatchUnMerger(nn.Module):
947
+ """Learnable inverse of Qwen2_5_VLPatchMerger."""
948
+ def __init__(self, dim, context_dim, spatial_merge_size=2):
949
+ super().__init__()
950
+ self.spatial_merge_size = spatial_merge_size
951
+ self.context_dim = context_dim
952
+ hidden = context_dim * (spatial_merge_size ** 2)
953
+ self.ln_q = Qwen2RMSNorm(dim, eps=1e-6)
954
+ self.mlp = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, hidden))
955
+
956
+ def forward(self, x):
957
+ x = self.mlp(self.ln_q(x))
958
+ return x.view(x.shape[0] * (self.spatial_merge_size ** 2), self.context_dim)
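+ # Each merged token is expanded by the MLP into spatial_merge_size**2 patch-level
+ # vectors of width context_dim, inverting the patch-merging step of the encoder.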
959
+
960
+
961
+ def restore_spatial_structure_and_convert_to_images(patches, grid_thw_list, patch_size,
962
+ channel_dim=3, temporal_patch_size=2, merge_size=2):
963
+ """Convert decoder pixel features back to image tensors [3, H, W]."""
964
+ if isinstance(patches, tuple):
965
+ patches = patches[0]
966
+ image_tensors = []
967
+ ptr = 0
968
+ for grid in grid_thw_list:
969
+ gt, gh, gw = (int(x) for x in (grid if not isinstance(grid, torch.Tensor) else grid.tolist()))
970
+ n = gt * gh * gw
971
+ chunk = patches[ptr:ptr + n]
972
+ ptr += n
973
+ r = chunk.reshape(gt, gh // merge_size, gw // merge_size, merge_size, merge_size,
974
+ channel_dim, temporal_patch_size, patch_size, patch_size)
975
+ r = r.permute(0, 6, 5, 1, 3, 7, 2, 4, 8)
976
+ image_tensors.append(r.reshape(gt * temporal_patch_size, channel_dim, gh * patch_size, gw * patch_size)[0])
977
+ return image_tensors
978
+
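+ # Shape sketch (illustrative): for a grid (t, gh, gw), `patches` supplies t * gh * gw rows of
+ # 3 * temporal_patch_size * patch_size**2 values, and the returned image tensor is
+ # [3, gh * patch_size, gw * patch_size] (only the first temporal frame is kept).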
979
+
980
+ class VisionTransformerDecoder(nn.Module):
981
+ def __init__(self, config):
982
+ super().__init__()
983
+ self.config = config
984
+ self.embed_dim = config.hidden_size
985
+ self.patch_size = config.patch_size
986
+ self.spatial_merge_size = config.spatial_merge_size
987
+ self.codebook_dim = config.codebook_dim
988
+ self.temporal_patch_size = config.temporal_patch_size
989
+
990
+ self.rope2d = VisionRoPE2D(theta=10000.0)
991
+ self.post_quant_conv = nn.Linear(self.codebook_dim, self.embed_dim)
992
+ self.post_quant_norm = Qwen2RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
993
+ self.patch_unmerger = PatchUnMerger(self.embed_dim, self.embed_dim, self.spatial_merge_size)
994
+ self.norm_in = Qwen2RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
995
+ self.encoder = VisionEncoder(config, rope=self.rope2d, rope_shift=0)
996
+ self.norm_out = Qwen2RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
997
+ self.decoder_head = nn.Sequential(
998
+ nn.Linear(self.embed_dim, config.intermediate_size), nn.GELU(),
999
+ nn.Linear(config.intermediate_size, 3 * self.patch_size * self.patch_size * self.temporal_patch_size),
1000
+ )
1001
+
1002
+ @classmethod
1003
+ def from_pretrained(cls, config, model_path: str):
1004
+ """Load a pretrained model from a checkpoint."""
1005
+ model = cls(config)
1006
+ weight_dict = load_file(model_path, device="cpu")
1007
+ model.load_state_dict({k.removeprefix("image_decoder."): v for k, v in weight_dict.items() if k.startswith("image_decoder.")}, strict=True)
1008
+ model.eval()
1009
+ return model
1010
+
1011
+ def _build_2d_positions(self, grid_thw_list):
1012
+ pos_list = []
1013
+ for (t, gh, gw) in grid_thw_list:
1014
+ for _ in range(int(t)):
1015
+ for y in range(int(gh)):
1016
+ for x in range(int(gw)):
1017
+ pos_list.append([y, x])
1018
+ return torch.tensor(pos_list, dtype=torch.long)
1019
+
1020
+ def _build_attention_mask(self, grid_thw_list, device, dtype, B, num_heads):
1021
+ counts = [int(t) * int(h) * int(w) for (t, h, w) in grid_thw_list]
1022
+ L = sum(counts)
1023
+ mask = torch.zeros((B, num_heads, L, L), device=device, dtype=dtype)
1024
+ s = 0
1025
+ for c in counts:
1026
+ e = s + c
1027
+ if s > 0:
1028
+ mask[:, :, s:e, :s] = float("-inf")
1029
+ if e < L:
1030
+ mask[:, :, s:e, e:] = float("-inf")
1031
+ s = e
1032
+ return mask
1033
+
1034
+ def forward(self, embeddings, grid_thw, return_pixel_features=False, return_last_latent=False):
1035
+ device = embeddings.device
1036
+ grid_thw_list = ([(int(t), int(h), int(w)) for t, h, w in grid_thw.detach().cpu().numpy()]
1037
+ if isinstance(grid_thw, torch.Tensor) else list(grid_thw))
1038
+
1039
+ if embeddings.shape[-1] == self.codebook_dim:
1040
+ embeddings = self.post_quant_conv(embeddings)
1041
+ embeddings = self.post_quant_norm(embeddings)
1042
+
1043
+ unmerged = self.patch_unmerger(embeddings)
1044
+ if unmerged.dim() == 2:
1045
+ unmerged = unmerged.unsqueeze(0)
1046
+ B, L, D = unmerged.shape
1047
+ hidden_states = self.norm_in(unmerged)
1048
+
1049
+ positions_2d = self._build_2d_positions(grid_thw_list).to(device)
1050
+ seq_lens = torch.tensor([int(t) * int(h) * int(w) for (t, h, w) in grid_thw_list],
1051
+ device=device, dtype=torch.int32)
1052
+ assert positions_2d.shape[0] == L, f"positions_2d {positions_2d.shape[0]} != L {L}"
1053
+
1054
+ last_latent = hidden_states.detach().squeeze(0) if return_last_latent else None
1055
+ enc_out = self.encoder(
1056
+ inputs_embeds=hidden_states,
1057
+ attention_mask=None,
1058
+ causal_attention_mask=None,
1059
+ output_attentions=False,
1060
+ output_hidden_states=False,
1061
+ return_dict=True,
1062
+ positions_2d=positions_2d,
1063
+ seq_lens=seq_lens,
1064
+ )
1065
+ hidden_states = enc_out.last_hidden_state
1066
+
1067
+ hidden_states = self.norm_out(hidden_states)
1068
+ pixel_features = self.decoder_head(hidden_states).squeeze(0)
1069
+
1070
+ out_imgs = (None if return_pixel_features else
1071
+ restore_spatial_structure_and_convert_to_images(
1072
+ pixel_features, grid_thw_list, self.patch_size,
1073
+ temporal_patch_size=self.temporal_patch_size, merge_size=self.spatial_merge_size))
1074
+ ret = {"images": out_imgs, "pixel_features": pixel_features}
1075
+ if last_latent is not None:
1076
+ ret["last_latent"] = last_latent
1077
+ return ret
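+
+ # Hedged usage sketch (config object, checkpoint path, and grid values are illustrative only):
+ # decoder = VisionTransformerDecoder.from_pretrained(vision_config, "image_decoder.safetensors")
+ # tokens = torch.randn(256, vision_config.codebook_dim) # 256 merged tokens for a 32x32 patch grid
+ # out = decoder(tokens, grid_thw=torch.tensor([[1, 32, 32]]))
+ # out["images"][0].shape # -> [3, 32 * patch_size, 32 * patch_size]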
parse_model_response.py ADDED
@@ -0,0 +1,158 @@
1
+ import re
2
+ import json
3
+ import uuid
4
+
5
+ def parse_arguments(json_value):
6
+ """
7
+ Attempt to parse a string as JSON
8
+
9
+ Args:
10
+ json_value: String to parse
11
+
12
+ Returns:
13
+ tuple: (parsed_value, is_valid_json)
14
+ """
15
+ try:
16
+ parsed_value = json.loads(json_value)
17
+ return parsed_value, True
18
+ except (json.JSONDecodeError, TypeError):
19
+ return json_value, False
20
+
21
+ def get_argument_type(func_name: str, arg_key: str, defined_tools: list):
22
+ """
23
+ Get the type definition of a tool parameter
24
+
25
+ Args:
26
+ func_name: Name of the function/tool
27
+ arg_key: Parameter key name
28
+ defined_tools: List of tool definitions
29
+
30
+ Returns:
31
+ str or None: Type of the parameter ('string', 'object', 'array', 'integer', 'number', 'boolean')
32
+ """
33
+ name2tool = {tool["name"]: tool for tool in defined_tools}
34
+ if func_name not in name2tool:
35
+ return None
36
+ tool = name2tool[func_name]
37
+ if "parameters" not in tool or "properties" not in tool["parameters"]:
38
+ return None
39
+ if arg_key not in tool["parameters"]["properties"]:
40
+ return None
41
+ return tool["parameters"]["properties"][arg_key].get("type")
42
+
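+ # Hedged example of the tool schema this helper expects (names and fields are illustrative):
+ # tools = [{"name": "get_weather",
+ # "parameters": {"properties": {"city": {"type": "string"},
+ # "days": {"type": "integer"}}}}]
+ # get_argument_type("get_weather", "days", tools) # -> "integer"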
43
+ def parse_model_response(response: str, defined_tools: list=[]):
44
+ """
45
+ Parse model response to extract reasoning_content, content, and tool_calls
46
+
47
+ Args:
48
+ response: Raw response text from the model
49
+ defined_tools: List of tool definitions
50
+
51
+ Returns:
52
+ dict: Message containing role, reasoning_content (optional), content (optional),
53
+ and tool_calls (optional)
54
+ """
55
+ text = response
56
+ reasoning_content = None
57
+ content = None
58
+ tool_calls = []
59
+
60
+ formatted_tools = []
61
+ for tool in defined_tools:
62
+ if "function" in tool:
63
+ formatted_tools.append(tool['function'])
64
+ else:
65
+ formatted_tools.append(tool)
66
+
67
+ if '</longcat_think>' in text:
68
+ text = text.replace('<longcat_think>', '')
69
+ thinking_end = text.find('</longcat_think>')
70
+ reasoning_content = text[: thinking_end].strip()
71
+ text = text[thinking_end + len('</longcat_think>'):].lstrip()
72
+
73
+ assert '<longcat_think>' not in text, "Unclosed <longcat_think> tag found in remaining text"
74
+ assert '</longcat_think>' not in text, "Unexpected </longcat_think> tag found without opening tag"
75
+
76
+ if '<longcat_tool_call>' in text:
77
+ index = text.find('<longcat_tool_call>')
78
+ content = text[:index]
79
+ text = text[index:].strip()
80
+ else:
81
+ content = text
82
+ text = ""
83
+
84
+ open_tags = text.count('<longcat_tool_call>')
85
+ close_tags = text.count('</longcat_tool_call>')
86
+ assert open_tags == close_tags, \
87
+ f"Mismatched tool_call tags: {open_tags} opening tags, {close_tags} closing tags"
88
+
89
+ tool_call_strs = re.findall(
90
+ r'<longcat_tool_call>(.*?)</longcat_tool_call>',
91
+ text,
92
+ re.DOTALL
93
+ )
94
+
95
+ for call in tool_call_strs:
96
+ func_name_match = re.match(r'([^\n<]+)', call.strip())
97
+ assert func_name_match, f"Missing function name in tool call: {call[:100]}"
98
+
99
+ func_name = func_name_match.group(1).strip()
100
+ assert func_name, "Empty function name in tool call"
101
+
102
+ # Verify argument tags are properly paired
103
+ arg_key_count = call.count('<longcat_arg_key>')
104
+ arg_key_close_count = call.count('</longcat_arg_key>')
105
+ arg_value_count = call.count('<longcat_arg_value>')
106
+ arg_value_close_count = call.count('</longcat_arg_value>')
107
+
108
+ assert arg_key_count == arg_key_close_count, \
109
+ f"Mismatched arg_key tags in function {func_name}: {arg_key_count} opening, {arg_key_close_count} closing"
110
+ assert arg_value_count == arg_value_close_count, \
111
+ f"Mismatched arg_value tags in function {func_name}: {arg_value_count} opening, {arg_value_close_count} closing"
112
+ assert arg_key_count == arg_value_count, \
113
+ f"Mismatched arg_key and arg_value count in function {func_name}: {arg_key_count} keys, {arg_value_count} values"
114
+
115
+ pairs = re.findall(
116
+ r'<longcat_arg_key>(.*?)</longcat_arg_key>\s*<longcat_arg_value>(.*?)</longcat_arg_value>',
117
+ call,
118
+ re.DOTALL
119
+ )
120
+
121
+ assert len(pairs) == arg_key_count, \
122
+ f"Failed to parse all arguments in function {func_name}: expected {arg_key_count}, got {len(pairs)}"
123
+
124
+ arguments = {}
125
+ for arg_key, arg_value in pairs:
126
+ arg_key = arg_key.strip()
127
+ arg_value = arg_value.strip()
128
+
129
+ assert arg_key, f"Empty argument key in function {func_name}"
130
+ assert arg_key not in arguments, \
131
+ f"Duplicate argument key '{arg_key}' in function {func_name}"
132
+
133
+ arg_type = get_argument_type(func_name, arg_key, formatted_tools)
134
+
135
+ if arg_type and arg_type != 'string':
136
+ parsed_value, is_good_json = parse_arguments(arg_value)
137
+ arg_value = parsed_value
138
+
139
+ arguments[arg_key] = arg_value
140
+
141
+ tool_calls.append({
142
+ 'id': "tool-call-" + str(uuid.uuid4()),
143
+ 'type': "function",
144
+ 'function': {
145
+ 'name': func_name,
146
+ 'arguments': arguments
147
+ }
148
+ })
149
+
150
+ message = {'role': 'assistant'}
151
+
152
+ if reasoning_content:
153
+ message['reasoning_content'] = reasoning_content
154
+ message['content'] = content
155
+ if tool_calls:
156
+ message['tool_calls'] = tool_calls
157
+
158
+ return message
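+
+ # Hedged usage sketch (the response string and tool definition are illustrative, not real model output):
+ # msg = parse_model_response(
+ # "<longcat_think>look up the weather</longcat_think>"
+ # "<longcat_tool_call>get_weather\n"
+ # "<longcat_arg_key>city</longcat_arg_key><longcat_arg_value>Paris</longcat_arg_value>"
+ # "</longcat_tool_call>",
+ # defined_tools=[{"name": "get_weather",
+ # "parameters": {"properties": {"city": {"type": "string"}}}}],
+ # )
+ # msg["reasoning_content"] # -> "look up the weather"
+ # msg["tool_calls"][0]["function"] # -> {"name": "get_weather", "arguments": {"city": "Paris"}}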
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "processor_class": "LongcatNextProcessor",
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_longcat_next.LongcatNextProcessor"
5
+ },
6
+ "spatial_merge_size": 2,
7
+ "max_pixels": 3211264,
8
+ "min_pixels": 50176,
9
+
10
+ "n_fft": 400,
11
+ "num_mel_bins": 128,
12
+ "sampling_rate": 16000,
13
+ "max_audio_seconds": 30,
14
+ "hop_length": 160,
15
+ "kernel_size": 3,
16
+ "stride_size": 2,
17
+ "split_overlap": 0.0,
18
+ "avg_pooler": 4
19
+ }
processing_longcat_next.py ADDED
@@ -0,0 +1,279 @@
1
+ import re
2
+ from typing import Union, List
3
+ from types import SimpleNamespace
4
+
5
+ import torch
6
+ import librosa
7
+ import soundfile as sf
8
+ import numpy as np
9
+ from transformers import AutoFeatureExtractor
10
+ from transformers.audio_utils import mel_filter_bank
11
+ from transformers.configuration_utils import PretrainedConfig
12
+ from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin
13
+ from transformers.processing_utils import (
14
+ AudioKwargs,
15
+ ImagesKwargs,
16
+ ProcessingKwargs,
17
+ ProcessorMixin,
18
+ VideosKwargs,
19
+ )
20
+ from transformers.utils import logging
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class LongcatNextProcessorKwargs(ProcessingKwargs, total=False):
26
+ images_kwargs: ImagesKwargs
27
+ videos_kwargs: VideosKwargs
28
+ audio_kwargs: AudioKwargs
29
+ _defaults = {
30
+ "text_kwargs": {
31
+ "padding": False,
32
+ "padding_side": "left",
33
+ "return_attention_mask": False,
34
+ }
35
+ }
36
+
37
+
38
+ class LongcatNextAudioProcessor(FeatureExtractionMixin):
39
+
40
+ def __init__(self, **kwargs):
41
+ super().__init__(**kwargs)
42
+ self.mel_filters = mel_filter_bank(
43
+ num_frequency_bins=1 + self.n_fft // 2,
44
+ num_mel_filters=self.num_mel_bins,
45
+ min_frequency=0.0,
46
+ max_frequency=self.sampling_rate / 2.0,
47
+ sampling_rate=self.sampling_rate,
48
+ norm="slaney",
49
+ mel_scale="slaney",
50
+ )
51
+ self.window = torch.hann_window(self.n_fft)
52
+
53
+ @staticmethod
54
+ def zero_mean_unit_var_norm(x):
55
+ return (x - x.mean()) / torch.sqrt(x.var() + 1e-8)
56
+
57
+ def load_audio_waveform(self, uri, metadata=None, waveform_tensor=None, return_tensors=True, do_normalize=False):
58
+ if metadata is None or waveform_tensor is None:
59
+ # Use librosa to handle all audio formats uniformly (including mp3, wav, flac, etc.)
60
+ # librosa.load already returns normalized float32 data
61
+ waveform_np, sample_rate = librosa.load(uri, sr=None, mono=False)
62
+
63
+ # Convert to a tensor and ensure the shape is (channels, samples)
64
+ if waveform_np.ndim == 1:
65
+ waveform_tensor = torch.from_numpy(waveform_np).unsqueeze(0)
66
+ else:
67
+ waveform_tensor = torch.from_numpy(waveform_np)
68
+
69
+ # Get the audio metadata
70
+ try:
71
+ sf_info = sf.info(uri)
72
+ metadata = SimpleNamespace(
73
+ sample_rate=sample_rate,
74
+ num_frames=waveform_tensor.shape[1],
75
+ num_channels=waveform_tensor.shape[0],
76
+ bits_per_sample=getattr(sf_info, 'bits_per_sample', 16),
77
+ encoding=getattr(sf_info, 'subtype', 'PCM_F')
78
+ )
79
+ except Exception:
80
+ # If soundfile.info fails, fall back to the information provided by librosa
81
+ metadata = SimpleNamespace(
82
+ sample_rate=sample_rate,
83
+ num_frames=waveform_tensor.shape[1],
84
+ num_channels=waveform_tensor.shape[0],
85
+ bits_per_sample=16,
86
+ encoding='PCM_F'
87
+ )
88
+
89
+ assert(metadata.num_channels <= 2), "acoustic file with {} channels.".format(metadata.num_channels) # Whisper only accepts mono audio; stereo is downmixed to mono below
90
+
91
+ if self.sampling_rate != metadata.sample_rate:
92
+ # Resample using torch.nn.functional.interpolate
93
+ waveform_tensor = torch.nn.functional.interpolate(
94
+ waveform_tensor.unsqueeze(0),
95
+ size=int(waveform_tensor.shape[1] * self.sampling_rate / metadata.sample_rate),
96
+ mode='linear',
97
+ align_corners=False
98
+ ).squeeze(0)
99
+
100
+ # downmix to mono channel https://trac.ffmpeg.org/wiki/AudioChannelManipulation
101
+ if metadata.num_channels > 1:
102
+ waveform_tensor = torch.mean(waveform_tensor, dim=0, keepdim=True)
103
+
104
+ # Normalize to zero mean (skipped in Qwen Audio, but done in the official Whisper implementation)
105
+ if do_normalize:
106
+ waveform_tensor = self.zero_mean_unit_var_norm(waveform_tensor)
107
+
108
+ if return_tensors: # (channels, samples)
109
+ return waveform_tensor
110
+ else:
111
+ return waveform_tensor.numpy()
112
+
113
+ def split_with_overlap(self, waveform): # Split into multiple overlapping segments when the waveform exceeds the maximum length
114
+ channels, wave_samples = waveform.shape
115
+ max_audio_samples = self.max_audio_seconds * self.sampling_rate
116
+ if wave_samples <= max_audio_samples or self.split_overlap < 0:
117
+ return [waveform] # Within the maximum length (or no truncation needed); always return a list
118
+
119
+ split_waveform, start = [], 0
120
+ while start < wave_samples: # Align the overlap consistently in whole seconds
121
+ if start > int(self.sampling_rate * self.split_overlap):
122
+ start -= int(self.sampling_rate * self.split_overlap) # 0 means no overlap; > 0 is the overlap in seconds
123
+ end = min(start + max_audio_samples, wave_samples)
124
+ if end - start >= self.n_fft: # Ensure at least one frame of data
125
+ split_waveform.append(waveform[:, start:end]) # Note: this may produce very short segments, which must be detected and dropped during preprocessing
126
+ start = end
127
+ return split_waveform
128
+
129
+ @staticmethod
130
+ def inference_output_length(input_length, kernel_size, stride_size, avg_pooler):
131
+ # for whisper + bridge
132
+ encoder_length = (input_length + 2 * (kernel_size // 2) - kernel_size) // 1 + 1 # conv layer1 with pad=1
133
+ encoder_length = (encoder_length + 2 * (kernel_size // 2) - kernel_size) // stride_size + 1 # conv layer2 with pad=1
134
+ if avg_pooler > 1:
135
+ bridge_length = encoder_length // avg_pooler
+ else:
+ bridge_length = encoder_length
136
+ return encoder_length, bridge_length
137
+
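+ # Worked example with the shipped preprocessor defaults (kernel_size=3, stride_size=2, avg_pooler=4,
+ # hop_length=160, sampling_rate=16000): a full 30 s clip yields 3000 mel frames, so
+ # encoder_length = (3000 + 2 - 3) // 1 + 1 = 3000 -> (3000 + 2 - 3) // 2 + 1 = 1500
+ # bridge_length = 1500 // 4 = 375 audio tokens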
138
+ def extract_fbank_features(self, waveform):
139
+ # ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py
140
+ channels, wave_samples = waveform.shape
141
+ assert(wave_samples >= self.n_fft)
142
+ valid_frame_nums = min(self.max_audio_seconds * self.sampling_rate // self.hop_length, wave_samples // self.hop_length + 1)
143
+ if wave_samples < self.max_audio_seconds * self.sampling_rate:
144
+ waveform = torch.nn.functional.pad(waveform, (0, self.max_audio_seconds * self.sampling_rate - wave_samples), "constant", 0)
145
+ else:
146
+ waveform = waveform[:, :self.max_audio_seconds * self.sampling_rate]
147
+
148
+ # window = torch.hann_window(self.n_fft)
149
+ stft = torch.stft(waveform, self.n_fft, self.hop_length, window=self.window, return_complex=True) # fft, len(wave) // n_fft // 2 + 1
150
+ magnitudes = stft[..., :-1].abs() ** 2
151
+
152
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
153
+ mel_spec = mel_filters.T @ magnitudes
154
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
155
+ if waveform.dim() == 2:
156
+ max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
157
+ log_spec = torch.maximum(log_spec, max_val - 8.0)
158
+ else:
159
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
160
+ log_spec = (log_spec + 4.0) / 4.0
161
+
162
+ log_spec = log_spec[0].numpy() # (channel, filters, samples) -> (filters, samples)
163
+ log_spec[:, valid_frame_nums:] = 0.0 # pad0
164
+
165
+ return log_spec, valid_frame_nums
166
+
167
+ def process(self, audio_path, **kwargs):
168
+ metadata, waveform_tensors = None, None
169
+ waveforms = self.load_audio_waveform(audio_path, metadata, waveform_tensors, True)
170
+ waveforms = self.split_with_overlap(waveforms)
171
+
172
+ ret_audio, ret_encoder_length, ret_bridge_length = [], [], []
173
+ for i, waveform in enumerate(waveforms):
174
+ audio, input_length = self.extract_fbank_features(waveform)
175
+ encoder_length, bridge_length = self.inference_output_length(input_length, self.kernel_size, self.stride_size, self.avg_pooler)
176
+ if bridge_length <= 0:
177
+ continue
178
+
179
+ ret_audio.append(audio)
180
+ ret_encoder_length.append(encoder_length)
181
+ ret_bridge_length.append(bridge_length)
182
+ return ret_audio, ret_encoder_length, ret_bridge_length
183
+
184
+ def __call__(self, audio: Union[str, List[str]], **kwargs):
185
+ if isinstance(audio, str):
186
+ audio = [audio]
187
+ results = {
188
+ "audio": [],
189
+ "encoder_length": [],
190
+ "bridge_length": [],
191
+ }
192
+ for audio_path in audio:
193
+ audio, encoder_length, bridge_length = self.process(audio_path, **kwargs)
194
+ results["audio"].append(audio)
195
+ results["encoder_length"].append(encoder_length)
196
+ results["bridge_length"].append(bridge_length)
197
+ return results
198
+
199
+
200
+ class LongcatNextProcessor(ProcessorMixin):
201
+
202
+ attributes = ["image_processor", "video_processor", "audio_processor", "tokenizer"]
203
+
204
+ image_processor_class = "Qwen2VLImageProcessor"
205
+ video_processor_class = "Qwen2VLImageProcessor"
206
+ audio_processor_class = "LongcatNextAudioProcessor"
207
+ tokenizer_class = "AutoTokenizer"
208
+
209
+ def __init__(self, image_processor=None, video_processor=None, audio_processor=None, tokenizer=None, chat_template=None, **kwargs):
210
+ super().__init__(image_processor, video_processor, audio_processor, tokenizer, chat_template=chat_template)
211
+ init_token_list = [
212
+ "image_start_token", "image_end_token", "image_pad_token", "image_newline_token",
213
+ "audio_start_token", "audio_end_token", "audio_pad_token",
214
+ ]
215
+ for attr in init_token_list:
216
+ token_str = self.tokenizer.init_kwargs.get(attr)
217
+ token_ids = self.tokenizer.encode(token_str, add_special_tokens=False)
218
+ assert len(token_ids) == 1, (f"{attr}='{token_str}' encodes to {len(token_ids)} id(s) {token_ids}, expected exactly 1")
219
+ setattr(self, f"{attr}", token_str)
220
+ setattr(self, f"{attr}_id", token_ids[0])
221
+
222
+ def __call__(
223
+ self,
224
+ text: str,
225
+ **kwargs,
226
+ ) -> tuple:
227
+
228
+ if text is None:
229
+ raise ValueError("You need to specify either a `text` input to process.")
230
+
231
+ output_kwargs = self._merge_kwargs(
232
+ LongcatNextProcessorKwargs,
233
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
234
+ **kwargs,
235
+ )
236
+
237
+ assert isinstance(text, str)
238
+
239
+ image_path_list = re.findall(rf"{self.image_start_token}(.*?){self.image_end_token}", text)
240
+ audio_path_list = re.findall(rf"{self.audio_start_token}(.*?){self.audio_end_token}", text)
241
+
242
+ if len(image_path_list) > 0:
243
+ images_inputs = self.image_processor(images=image_path_list, **output_kwargs["images_kwargs"])
244
+ image_grid_thw = images_inputs["image_grid_thw"]
245
+ for i, image_path in enumerate(image_path_list):
246
+ image_token_num = image_grid_thw[i][0] * (image_grid_thw[i][1]//self.image_processor.spatial_merge_size) * (image_grid_thw[i][2]//self.image_processor.spatial_merge_size)
247
+ text = text.replace(f"{self.image_start_token}{image_path}{self.image_end_token}", f"{self.image_start_token}{self.image_pad_token * image_token_num}{self.image_end_token}")
248
+ else:
249
+ images_inputs = {}
250
+
251
+ if len(audio_path_list) > 0:
252
+ audio_inputs = self.audio_processor(audio=audio_path_list, **output_kwargs["audio_kwargs"])
253
+ for i, audio_path in enumerate(audio_path_list):
254
+ audio_token_num = np.sum(audio_inputs["bridge_length"][i])
255
+ text = text.replace(f"{self.audio_start_token}{audio_path}{self.audio_end_token}", f"{self.audio_start_token}{self.audio_pad_token * audio_token_num}{self.audio_end_token}")
256
+ for key in audio_inputs:
257
+ audio_inputs[key] = [val for b_val in audio_inputs[key] for val in b_val]
258
+ else:
259
+ audio_inputs = {}
260
+
261
+ texts_inputs = self.tokenizer([text], **output_kwargs["text_kwargs"])
262
+
263
+ batch_feature_func = lambda x: BatchFeature(
264
+ data={**x},
265
+ tensor_type=kwargs.get("return_tensors"),
266
+ )
267
+ return (
268
+ batch_feature_func(texts_inputs),
269
+ batch_feature_func({k.replace("image", "visual"): v for k, v in images_inputs.items()}) if len(images_inputs) > 0 else None,
270
+ batch_feature_func(audio_inputs) if len(audio_inputs) > 0 else None,
271
+ )
272
+
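+ # Hedged usage sketch (model directory, image path, and prompt layout are illustrative only):
+ # processor = LongcatNextProcessor.from_pretrained(model_dir, trust_remote_code=True)
+ # text_inputs, visual_inputs, audio_inputs = processor(
+ # f"{processor.image_start_token}demo.jpg{processor.image_end_token} Describe the image.",
+ # return_tensors="pt",
+ # )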
273
+
274
+ class LongcatNextAudioProcessorConfig(PretrainedConfig):
275
+ pass
276
+ AutoFeatureExtractor.register(LongcatNextAudioProcessorConfig, LongcatNextAudioProcessor)
277
+
278
+
279
+ __all__ = ["LongcatNextAudioProcessor", "LongcatNextProcessor"]
refiner_modules.py ADDED
@@ -0,0 +1,1330 @@
1
+ # ---------------------------------------------------------------------------
2
+ # Standard / third-party imports shared by all sections
3
+ # ---------------------------------------------------------------------------
4
+
5
+ import itertools
6
+ import math
7
+ from dataclasses import dataclass
8
+ from typing import Any, Dict, List, Optional, Tuple, Union
9
+
10
+ from flash_attn import flash_attn_varlen_func # type: ignore
11
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # type: ignore
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from torch.nn import RMSNorm
17
+
18
+ from einops import rearrange, repeat
19
+
20
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
21
+ from diffusers.loaders import PeftAdapterMixin
22
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
23
+ from diffusers.models.activations import get_activation
24
+ from diffusers.models.attention_processor import Attention
25
+ from diffusers.models.embeddings import Timesteps, get_1d_rotary_pos_embed
26
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
27
+ from diffusers.models.modeling_utils import ModelMixin
28
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
29
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ def swiglu(x, y):
35
+ return F.silu(x.float(), inplace=False).to(x.dtype) * y
36
+
37
+
38
+ class TimestepEmbedding(nn.Module):
39
+ def __init__(
40
+ self,
41
+ in_channels: int,
42
+ time_embed_dim: int,
43
+ act_fn: str = "silu",
44
+ out_dim: int = None,
45
+ post_act_fn: Optional[str] = None,
46
+ cond_proj_dim=None,
47
+ sample_proj_bias=True,
48
+ ):
49
+ super().__init__()
50
+
51
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
52
+
53
+ if cond_proj_dim is not None:
54
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
55
+ else:
56
+ self.cond_proj = None
57
+
58
+ self.act = get_activation(act_fn)
59
+
60
+ if out_dim is not None:
61
+ time_embed_dim_out = out_dim
62
+ else:
63
+ time_embed_dim_out = time_embed_dim
64
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
65
+
66
+ if post_act_fn is None:
67
+ self.post_act = None
68
+ else:
69
+ self.post_act = get_activation(post_act_fn)
70
+
71
+ self.initialize_weights()
72
+
73
+ def initialize_weights(self):
74
+ nn.init.normal_(self.linear_1.weight, std=0.02)
75
+ nn.init.zeros_(self.linear_1.bias)
76
+ nn.init.normal_(self.linear_2.weight, std=0.02)
77
+ nn.init.zeros_(self.linear_2.bias)
78
+
79
+ def forward(self, sample, condition=None):
80
+ if condition is not None:
81
+ sample = sample + self.cond_proj(condition)
82
+ sample = self.linear_1(sample)
83
+ if self.act is not None:
84
+ sample = self.act(sample)
85
+ sample = self.linear_2(sample)
86
+ if self.post_act is not None:
87
+ sample = self.post_act(sample)
88
+ return sample
89
+
90
+
91
+ def apply_rotary_emb(
92
+ x: torch.Tensor,
93
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
94
+ use_real: bool = True,
95
+ use_real_unbind_dim: int = -1,
96
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
97
+ """
98
+ Apply rotary embeddings to input tensors using the given frequency tensor.
99
+ """
100
+ if use_real:
101
+ cos, sin = freqs_cis # [S, D]
102
+ cos = cos[None, None]
103
+ sin = sin[None, None]
104
+ cos, sin = cos.to(x.device), sin.to(x.device)
105
+
106
+ if use_real_unbind_dim == -1:
107
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
108
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
109
+ elif use_real_unbind_dim == -2:
110
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)
111
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
112
+ else:
113
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
114
+
115
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
116
+ return out
117
+ else:
118
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], x.shape[-1] // 2, 2))
119
+ freqs_cis = freqs_cis.unsqueeze(2)
120
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
121
+ return x_out.type_as(x)
122
+
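+ # As used by the attention processors below (use_real=False): x is [batch, seq, heads, head_dim]
+ # and freqs_cis is a complex tensor of shape [batch, seq, head_dim // 2], broadcast over the head axis.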
123
+
124
+ @dataclass
125
+ class TeaCacheParams:
126
+ """
127
+ TeaCache parameters for Transformer2DModel.
128
+ See https://github.com/ali-vilab/TeaCache/ for a more comprehensive understanding.
129
+ """
130
+ previous_residual: Optional[torch.Tensor] = None
131
+ previous_modulated_inp: Optional[torch.Tensor] = None
132
+ accumulated_rel_l1_distance: float = 0
133
+ is_first_or_last_step: bool = False
134
+
135
+
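+ # The cache helpers below are no-op stubs; the TaylorSeer branch in TransformerBlock.forward
+ # assumes real implementations are supplied before `enable_taylorseer` is turned on.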
136
+ def derivative_approximation(*args, **kwargs):
137
+ pass
138
+
139
+
140
+ def taylor_formula(*args, **kwargs):
141
+ pass
142
+
143
+
144
+ def taylor_cache_init(*args, **kwargs):
145
+ pass
146
+
147
+
148
+ def cache_init(*args, **kwargs):
149
+ pass
150
+
151
+
152
+ def cal_type(*args, **kwargs):
153
+ pass
154
+
155
+
156
+ class LuminaRMSNormZero(nn.Module):
157
+ """
158
+ Norm layer adaptive RMS normalization zero.
159
+ """
160
+
161
+ def __init__(
162
+ self,
163
+ embedding_dim: int,
164
+ norm_eps: float,
165
+ norm_elementwise_affine: bool,
166
+ ):
167
+ super().__init__()
168
+ self.silu = nn.SiLU()
169
+ self.linear = nn.Linear(
170
+ min(embedding_dim, 1024),
171
+ 4 * embedding_dim,
172
+ bias=True,
173
+ )
174
+ self.norm = RMSNorm(embedding_dim, eps=norm_eps)
175
+
176
+ def forward(
177
+ self,
178
+ x: torch.Tensor,
179
+ emb: Optional[torch.Tensor] = None,
180
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
181
+ emb = self.linear(self.silu(emb))
182
+ scale_msa, gate_msa, scale_mlp, gate_mlp = emb.chunk(4, dim=1)
183
+ x = self.norm(x) * (1 + scale_msa[:, None])
184
+ return x, gate_msa, scale_mlp, gate_mlp
185
+
186
+
187
+ class LuminaLayerNormContinuous(nn.Module):
188
+ def __init__(
189
+ self,
190
+ embedding_dim: int,
191
+ conditioning_embedding_dim: int,
192
+ elementwise_affine=True,
193
+ eps=1e-5,
194
+ bias=True,
195
+ norm_type="layer_norm",
196
+ out_dim: Optional[int] = None,
197
+ ):
198
+ super().__init__()
199
+
200
+ self.silu = nn.SiLU()
201
+ self.linear_1 = nn.Linear(conditioning_embedding_dim, embedding_dim, bias=bias)
202
+
203
+ if norm_type == "layer_norm":
204
+ self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias)
205
+ elif norm_type == "rms_norm":
206
+ self.norm = RMSNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine)
207
+ else:
208
+ raise ValueError(f"unknown norm_type {norm_type}")
209
+
210
+ self.linear_2 = None
211
+ if out_dim is not None:
212
+ self.linear_2 = nn.Linear(embedding_dim, out_dim, bias=bias)
213
+
214
+ def forward(
215
+ self,
216
+ x: torch.Tensor,
217
+ conditioning_embedding: torch.Tensor,
218
+ ) -> torch.Tensor:
219
+ emb = self.linear_1(self.silu(conditioning_embedding).to(x.dtype))
220
+ scale = emb
221
+ x = self.norm(x) * (1 + scale)[:, None, :]
222
+ if self.linear_2 is not None:
223
+ x = self.linear_2(x)
224
+ return x
225
+
226
+
227
+ class LuminaFeedForward(nn.Module):
228
+ def __init__(
229
+ self,
230
+ dim: int,
231
+ inner_dim: int,
232
+ multiple_of: Optional[int] = 256,
233
+ ffn_dim_multiplier: Optional[float] = None,
234
+ ):
235
+ super().__init__()
236
+
237
+ if ffn_dim_multiplier is not None:
238
+ inner_dim = int(ffn_dim_multiplier * inner_dim)
239
+ inner_dim = multiple_of * ((inner_dim + multiple_of - 1) // multiple_of)
240
+
241
+ self.linear_1 = nn.Linear(dim, inner_dim, bias=False)
242
+ self.linear_2 = nn.Linear(inner_dim, dim, bias=False)
243
+ self.linear_3 = nn.Linear(dim, inner_dim, bias=False)
244
+
245
+ def forward(self, x):
246
+ h1, h2 = self.linear_1(x), self.linear_3(x)
247
+ return self.linear_2(swiglu(h1, h2))
248
+
249
+
250
+ class Lumina2CombinedTimestepCaptionEmbedding(nn.Module):
251
+ def __init__(
252
+ self,
253
+ hidden_size: int = 4096,
254
+ text_feat_dim: int = 2048,
255
+ frequency_embedding_size: int = 256,
256
+ norm_eps: float = 1e-5,
257
+ timestep_scale: float = 1.0,
258
+ ) -> None:
259
+ super().__init__()
260
+
261
+ self.time_proj = Timesteps(
262
+ num_channels=frequency_embedding_size,
263
+ flip_sin_to_cos=True,
264
+ downscale_freq_shift=0.0,
265
+ scale=timestep_scale,
266
+ )
267
+ self.timestep_embedder = TimestepEmbedding(
268
+ in_channels=frequency_embedding_size,
269
+ time_embed_dim=min(hidden_size, 1024),
270
+ )
271
+ self.caption_embedder = nn.Sequential(
272
+ RMSNorm(text_feat_dim, eps=norm_eps),
273
+ nn.Linear(text_feat_dim, hidden_size, bias=True),
274
+ )
275
+ self._initialize_weights()
276
+
277
+ def _initialize_weights(self):
278
+ nn.init.trunc_normal_(self.caption_embedder[1].weight, std=0.02)
279
+ nn.init.zeros_(self.caption_embedder[1].bias)
280
+
281
+ def forward(
282
+ self, timestep: torch.Tensor, text_hidden_states: torch.Tensor, dtype: torch.dtype
283
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
284
+ timestep_proj = self.time_proj(timestep).to(dtype=dtype)
285
+ time_embed = self.timestep_embedder(timestep_proj)
286
+ caption_embed = self.caption_embedder(text_hidden_states)
287
+ return time_embed, caption_embed
288
+
289
+
290
+ class AttnProcessorFlash2Varlen:
291
+ """
292
+ Processor for implementing scaled dot-product attention with flash attention
293
+ and variable length sequences.
294
+ """
295
+
296
+ def __init__(self) -> None:
297
+ pass
298
+ # if not is_flash_attn_available():
299
+ # raise ImportError(
300
+ # "AttnProcessorFlash2Varlen requires flash_attn. "
301
+ # "Please install flash_attn."
302
+ # )
303
+
304
+ def _upad_input(
305
+ self,
306
+ query_layer: torch.Tensor,
307
+ key_layer: torch.Tensor,
308
+ value_layer: torch.Tensor,
309
+ attention_mask: torch.Tensor,
310
+ query_length: int,
311
+ num_heads: int,
312
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
313
+ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
314
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
315
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
316
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
317
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
318
+ return indices, cu_seqlens, max_seqlen_in_batch
319
+
320
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
321
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
322
+
323
+ key_layer = index_first_axis(
324
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k,
325
+ )
326
+ value_layer = index_first_axis(
327
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k,
328
+ )
329
+
330
+ if query_length == kv_seq_len:
331
+ query_layer = index_first_axis(
332
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k,
333
+ )
334
+ cu_seqlens_q = cu_seqlens_k
335
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
336
+ indices_q = indices_k
337
+ elif query_length == 1:
338
+ max_seqlen_in_batch_q = 1
339
+ cu_seqlens_q = torch.arange(
340
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
341
+ )
342
+ indices_q = cu_seqlens_q[:-1]
343
+ query_layer = query_layer.squeeze(1)
344
+ else:
345
+ attention_mask = attention_mask[:, -query_length:]
346
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
347
+ query_layer, attention_mask
348
+ )
349
+
350
+ return (
351
+ query_layer, key_layer, value_layer, indices_q,
352
+ (cu_seqlens_q, cu_seqlens_k),
353
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
354
+ )
355
+
356
+ def __call__(
357
+ self,
358
+ attn: Attention,
359
+ hidden_states: torch.Tensor,
360
+ encoder_hidden_states: torch.Tensor,
361
+ attention_mask: Optional[torch.Tensor] = None,
362
+ image_rotary_emb: Optional[torch.Tensor] = None,
363
+ base_sequence_length: Optional[int] = None,
364
+ ) -> torch.Tensor:
365
+ batch_size, sequence_length, _ = hidden_states.shape
366
+
367
+ query = attn.to_q(hidden_states)
368
+ key = attn.to_k(encoder_hidden_states)
369
+ value = attn.to_v(encoder_hidden_states)
370
+
371
+ query_dim = query.shape[-1]
372
+ inner_dim = key.shape[-1]
373
+ head_dim = query_dim // attn.heads
374
+ dtype = query.dtype
375
+ kv_heads = inner_dim // head_dim
376
+
377
+ query = query.view(batch_size, -1, attn.heads, head_dim)
378
+ key = key.view(batch_size, -1, kv_heads, head_dim)
379
+ value = value.view(batch_size, -1, kv_heads, head_dim)
380
+
381
+ if attn.norm_q is not None:
382
+ query = attn.norm_q(query)
383
+ if attn.norm_k is not None:
384
+ key = attn.norm_k(key)
385
+
386
+ if image_rotary_emb is not None:
387
+ query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
388
+ key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
389
+
390
+ query, key = query.to(dtype), key.to(dtype)
391
+
392
+ if base_sequence_length is not None:
393
+ softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
394
+ else:
395
+ softmax_scale = attn.scale
396
+
397
+ (
398
+ query_states, key_states, value_states, indices_q,
399
+ cu_seq_lens, max_seq_lens,
400
+ ) = self._upad_input(query, key, value, attention_mask, sequence_length, attn.heads)
401
+
402
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
403
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
404
+
405
+ if kv_heads < attn.heads:
406
+ key_states = repeat(key_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
407
+ value_states = repeat(value_states, "l h c -> l (h k) c", k=attn.heads // kv_heads)
408
+
409
+ attn_output_unpad = flash_attn_varlen_func(
410
+ query_states, key_states, value_states,
411
+ cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k,
412
+ max_seqlen_q=max_seqlen_in_batch_q, max_seqlen_k=max_seqlen_in_batch_k,
413
+ dropout_p=0.0, causal=False, softmax_scale=softmax_scale,
414
+ )
415
+
416
+ hidden_states = pad_input(attn_output_unpad, indices_q, batch_size, sequence_length)
417
+ hidden_states = hidden_states.flatten(-2)
418
+ hidden_states = hidden_states.type_as(query)
419
+
420
+ hidden_states = attn.to_out[0](hidden_states)
421
+ hidden_states = attn.to_out[1](hidden_states)
422
+ return hidden_states
423
+
424
+
425
+ class AttnProcessor:
426
+ """
427
+ Processor for implementing scaled dot-product attention (PyTorch 2.0+).
428
+ """
429
+
430
+ def __init__(self) -> None:
431
+ if not hasattr(F, "scaled_dot_product_attention"):
432
+ raise ImportError(
433
+ "AttnProcessor requires PyTorch 2.0. "
434
+ "Please upgrade PyTorch to version 2.0 or later."
435
+ )
436
+
437
+ def __call__(
438
+ self,
439
+ attn: Attention,
440
+ hidden_states: torch.Tensor,
441
+ encoder_hidden_states: torch.Tensor,
442
+ attention_mask: Optional[torch.Tensor] = None,
443
+ image_rotary_emb: Optional[torch.Tensor] = None,
444
+ base_sequence_length: Optional[int] = None,
445
+ ) -> torch.Tensor:
446
+ batch_size, sequence_length, _ = hidden_states.shape
447
+
448
+ query = attn.to_q(hidden_states)
449
+ key = attn.to_k(encoder_hidden_states)
450
+ value = attn.to_v(encoder_hidden_states)
451
+
452
+ query_dim = query.shape[-1]
453
+ inner_dim = key.shape[-1]
454
+ head_dim = query_dim // attn.heads
455
+ dtype = query.dtype
456
+ kv_heads = inner_dim // head_dim
457
+
458
+ query = query.view(batch_size, -1, attn.heads, head_dim)
459
+ key = key.view(batch_size, -1, kv_heads, head_dim)
460
+ value = value.view(batch_size, -1, kv_heads, head_dim)
461
+
462
+ if attn.norm_q is not None:
463
+ query = attn.norm_q(query)
464
+ if attn.norm_k is not None:
465
+ key = attn.norm_k(key)
466
+
467
+ if image_rotary_emb is not None:
468
+ query = apply_rotary_emb(query, image_rotary_emb, use_real=False)
469
+ key = apply_rotary_emb(key, image_rotary_emb, use_real=False)
470
+
471
+ query, key = query.to(dtype), key.to(dtype)
472
+
473
+ if base_sequence_length is not None:
474
+ softmax_scale = math.sqrt(math.log(sequence_length, base_sequence_length)) * attn.scale
475
+ else:
476
+ softmax_scale = attn.scale
477
+
478
+ if attention_mask is not None:
479
+ attention_mask = attention_mask.bool().view(batch_size, 1, 1, -1)
480
+
481
+ query = query.transpose(1, 2)
482
+ key = key.transpose(1, 2)
483
+ value = value.transpose(1, 2)
484
+
485
+ key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
486
+ value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
487
+
488
+ hidden_states = F.scaled_dot_product_attention(
489
+ query, key, value, attn_mask=attention_mask, scale=softmax_scale
490
+ )
491
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
492
+ hidden_states = hidden_states.type_as(query)
493
+
494
+ hidden_states = attn.to_out[0](hidden_states)
495
+ hidden_states = attn.to_out[1](hidden_states)
496
+ return hidden_states
497
+
498
+
499
+
500
+ class RotaryPosEmbed(nn.Module):
501
+ def __init__(
502
+ self,
503
+ theta: int,
504
+ axes_dim: Tuple[int, int, int],
505
+ axes_lens: Tuple[int, int, int] = (300, 512, 512),
506
+ patch_size: int = 2,
507
+ ):
508
+ super().__init__()
509
+ self.theta = theta
510
+ self.axes_dim = axes_dim
511
+ self.axes_lens = axes_lens
512
+ self.patch_size = patch_size
513
+
514
+ @staticmethod
515
+ def get_freqs_cis(
516
+ axes_dim: Tuple[int, int, int],
517
+ axes_lens: Tuple[int, int, int],
518
+ theta: int,
519
+ ) -> List[torch.Tensor]:
520
+ freqs_cis = []
521
+ freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
522
+ for i, (d, e) in enumerate(zip(axes_dim, axes_lens)):
523
+ emb = get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype)
524
+ freqs_cis.append(emb)
525
+ return freqs_cis
526
+
527
+ def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
528
+ device = ids.device
529
+ if ids.device.type == "mps":
530
+ ids = ids.to("cpu")
531
+
532
+ result = []
533
+ for i in range(len(self.axes_dim)):
534
+ freqs = freqs_cis[i].to(ids.device)
535
+ index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64)
536
+ result.append(
537
+ torch.gather(freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index)
538
+ )
539
+ return torch.cat(result, dim=-1).to(device)
540
+
541
+ def forward(
542
+ self,
543
+ freqs_cis,
544
+ attention_mask,
545
+ l_effective_ref_img_len,
546
+ l_effective_img_len,
547
+ ref_img_sizes,
548
+ img_sizes,
549
+ device,
550
+ ):
551
+ batch_size = len(attention_mask)
552
+ p = self.patch_size
553
+
554
+ encoder_seq_len = attention_mask.shape[1]
555
+ l_effective_cap_len = attention_mask.sum(dim=1).tolist()
556
+
557
+ seq_lengths = [
558
+ cap_len + sum(ref_img_len) + img_len
559
+ for cap_len, ref_img_len, img_len in zip(
560
+ l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len
561
+ )
562
+ ]
563
+
564
+ max_seq_len = max(seq_lengths)
565
+ max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
566
+ max_img_len = max(l_effective_img_len)
567
+
568
+ position_ids = torch.zeros(batch_size, max_seq_len, 3, dtype=torch.int32, device=device)
569
+
570
+ for i, (cap_seq_len, seq_len) in enumerate(zip(l_effective_cap_len, seq_lengths)):
571
+ position_ids[i, :cap_seq_len] = repeat(
572
+ torch.arange(cap_seq_len, dtype=torch.int32, device=device), "l -> l 3"
573
+ )
574
+
575
+ pe_shift = cap_seq_len
576
+ pe_shift_len = cap_seq_len
577
+
578
+ if ref_img_sizes[i] is not None:
579
+ for ref_img_size, ref_img_len in zip(ref_img_sizes[i], l_effective_ref_img_len[i]):
580
+ H, W = ref_img_size
581
+ ref_H_tokens, ref_W_tokens = H // p, W // p
582
+ assert ref_H_tokens * ref_W_tokens == ref_img_len
583
+
584
+ row_ids = repeat(
585
+ torch.arange(ref_H_tokens, dtype=torch.int32, device=device),
586
+ "h -> h w", w=ref_W_tokens,
587
+ ).flatten()
588
+ col_ids = repeat(
589
+ torch.arange(ref_W_tokens, dtype=torch.int32, device=device),
590
+ "w -> h w", h=ref_H_tokens,
591
+ ).flatten()
592
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 0] = pe_shift
593
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 1] = row_ids
594
+ position_ids[i, pe_shift_len:pe_shift_len + ref_img_len, 2] = col_ids
595
+
596
+ pe_shift += max(ref_H_tokens, ref_W_tokens)
597
+ pe_shift_len += ref_img_len
598
+
599
+ H, W = img_sizes[i]
600
+ H_tokens, W_tokens = H // p, W // p
601
+ assert H_tokens * W_tokens == l_effective_img_len[i]
602
+
603
+ row_ids = repeat(
604
+ torch.arange(H_tokens, dtype=torch.int32, device=device), "h -> h w", w=W_tokens
605
+ ).flatten()
606
+ col_ids = repeat(
607
+ torch.arange(W_tokens, dtype=torch.int32, device=device), "w -> h w", h=H_tokens
608
+ ).flatten()
609
+
610
+ assert pe_shift_len + l_effective_img_len[i] == seq_len
611
+ position_ids[i, pe_shift_len: seq_len, 0] = pe_shift
612
+ position_ids[i, pe_shift_len: seq_len, 1] = row_ids
613
+ position_ids[i, pe_shift_len: seq_len, 2] = col_ids
614
+
615
+ freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)
616
+
617
+ cap_freqs_cis = torch.zeros(
618
+ batch_size, encoder_seq_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
619
+ )
620
+ ref_img_freqs_cis = torch.zeros(
621
+ batch_size, max_ref_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
622
+ )
623
+ img_freqs_cis = torch.zeros(
624
+ batch_size, max_img_len, freqs_cis.shape[-1], device=device, dtype=freqs_cis.dtype
625
+ )
626
+
627
+ for i, (cap_seq_len, ref_img_len, img_len, seq_len) in enumerate(
628
+ zip(l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len, seq_lengths)
629
+ ):
630
+ cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
631
+ ref_img_freqs_cis[i, :sum(ref_img_len)] = freqs_cis[
632
+ i, cap_seq_len:cap_seq_len + sum(ref_img_len)
633
+ ]
634
+ img_freqs_cis[i, :img_len] = freqs_cis[
635
+ i,
636
+ cap_seq_len + sum(ref_img_len):cap_seq_len + sum(ref_img_len) + img_len,
637
+ ]
638
+
639
+ return (
640
+ cap_freqs_cis,
641
+ ref_img_freqs_cis,
642
+ img_freqs_cis,
643
+ freqs_cis,
644
+ l_effective_cap_len,
645
+ seq_lengths,
646
+ )
647
+
648
+
649
+ class TransformerBlock(nn.Module):
650
+ """
651
+ Transformer block for refiner model.
652
+ """
653
+
654
+ def __init__(
655
+ self,
656
+ dim: int,
657
+ num_attention_heads: int,
658
+ num_kv_heads: int,
659
+ multiple_of: int,
660
+ ffn_dim_multiplier: float,
661
+ norm_eps: float,
662
+ modulation: bool = True,
663
+ ) -> None:
664
+ super().__init__()
665
+ self.head_dim = dim // num_attention_heads
666
+ self.modulation = modulation
667
+
668
+ try:
669
+ processor = AttnProcessorFlash2Varlen()
670
+ except ImportError:
671
+ processor = AttnProcessor()
672
+
673
+ self.attn = Attention(
674
+ query_dim=dim,
675
+ cross_attention_dim=None,
676
+ dim_head=dim // num_attention_heads,
677
+ qk_norm="rms_norm",
678
+ heads=num_attention_heads,
679
+ kv_heads=num_kv_heads,
680
+ eps=1e-5,
681
+ bias=False,
682
+ out_bias=False,
683
+ processor=processor,
684
+ )
685
+
686
+ self.feed_forward = LuminaFeedForward(
687
+ dim=dim,
688
+ inner_dim=4 * dim,
689
+ multiple_of=multiple_of,
690
+ ffn_dim_multiplier=ffn_dim_multiplier,
691
+ )
692
+
693
+ if modulation:
694
+ self.norm1 = LuminaRMSNormZero(
695
+ embedding_dim=dim,
696
+ norm_eps=norm_eps,
697
+ norm_elementwise_affine=True,
698
+ )
699
+ else:
700
+ self.norm1 = RMSNorm(dim, eps=norm_eps)
701
+
702
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
703
+ self.norm2 = RMSNorm(dim, eps=norm_eps)
704
+ self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
705
+
706
+ self.initialize_weights()
707
+
708
+ def initialize_weights(self) -> None:
709
+ nn.init.xavier_uniform_(self.attn.to_q.weight)
710
+ nn.init.xavier_uniform_(self.attn.to_k.weight)
711
+ nn.init.xavier_uniform_(self.attn.to_v.weight)
712
+ nn.init.xavier_uniform_(self.attn.to_out[0].weight)
713
+
714
+ nn.init.xavier_uniform_(self.feed_forward.linear_1.weight)
715
+ nn.init.xavier_uniform_(self.feed_forward.linear_2.weight)
716
+ nn.init.xavier_uniform_(self.feed_forward.linear_3.weight)
717
+
718
+ if self.modulation:
719
+ nn.init.zeros_(self.norm1.linear.weight)
720
+ nn.init.zeros_(self.norm1.linear.bias)
721
+
722
+ def forward(
723
+ self,
724
+ hidden_states: torch.Tensor,
725
+ attention_mask: torch.Tensor,
726
+ image_rotary_emb: torch.Tensor,
727
+ temb: Optional[torch.Tensor] = None,
728
+ ) -> torch.Tensor:
729
+ enable_taylorseer = getattr(self, 'enable_taylorseer', False)
730
+ if enable_taylorseer:
731
+ if self.modulation:
732
+ if temb is None:
733
+ raise ValueError("temb must be provided when modulation is enabled")
734
+
735
+ if self.current['type'] == 'full':
736
+ self.current['module'] = 'total'
737
+ taylor_cache_init(cache_dic=self.cache_dic, current=self.current)
738
+
739
+ norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
740
+ attn_output = self.attn(
741
+ hidden_states=norm_hidden_states,
742
+ encoder_hidden_states=norm_hidden_states,
743
+ attention_mask=attention_mask,
744
+ image_rotary_emb=image_rotary_emb,
745
+ )
746
+ hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
747
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
748
+ hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
749
+
750
+ derivative_approximation(cache_dic=self.cache_dic, current=self.current, feature=hidden_states)
751
+
752
+ elif self.current['type'] == 'Taylor':
753
+ self.current['module'] = 'total'
754
+ hidden_states = taylor_formula(cache_dic=self.cache_dic, current=self.current)
755
+ else:
756
+ norm_hidden_states = self.norm1(hidden_states)
757
+ attn_output = self.attn(
758
+ hidden_states=norm_hidden_states,
759
+ encoder_hidden_states=norm_hidden_states,
760
+ attention_mask=attention_mask,
761
+ image_rotary_emb=image_rotary_emb,
762
+ )
763
+ hidden_states = hidden_states + self.norm2(attn_output)
764
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
765
+ hidden_states = hidden_states + self.ffn_norm2(mlp_output)
766
+ else:
767
+ if self.modulation:
768
+ if temb is None:
769
+ raise ValueError("temb must be provided when modulation is enabled")
770
+
771
+ norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
772
+ attn_output = self.attn(
773
+ hidden_states=norm_hidden_states,
774
+ encoder_hidden_states=norm_hidden_states,
775
+ attention_mask=attention_mask,
776
+ image_rotary_emb=image_rotary_emb,
777
+ )
778
+ hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output)
779
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
780
+ hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
781
+ else:
782
+ norm_hidden_states = self.norm1(hidden_states)
783
+ attn_output = self.attn(
784
+ hidden_states=norm_hidden_states,
785
+ encoder_hidden_states=norm_hidden_states,
786
+ attention_mask=attention_mask,
787
+ image_rotary_emb=image_rotary_emb,
788
+ )
789
+ hidden_states = hidden_states + self.norm2(attn_output)
790
+ mlp_output = self.feed_forward(self.ffn_norm1(hidden_states))
791
+ hidden_states = hidden_states + self.ffn_norm2(mlp_output)
792
+
793
+ return hidden_states
794
+
795
+
796
+ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
797
+ """
798
+ Transformer 2D Model.
799
+ """
800
+
801
+ _supports_gradient_checkpointing = True
802
+ _no_split_modules = ["TransformerBlock"]
803
+ _skip_layerwise_casting_patterns = ["x_embedder", "norm"]
804
+
805
+ @register_to_config
806
+ def __init__(
807
+ self,
808
+ patch_size: int = 2,
809
+ in_channels: int = 16,
810
+ out_channels: Optional[int] = None,
811
+ hidden_size: int = 2304,
812
+ num_layers: int = 26,
813
+ num_refiner_layers: int = 2,
814
+ num_attention_heads: int = 24,
815
+ num_kv_heads: int = 8,
816
+ multiple_of: int = 256,
817
+ ffn_dim_multiplier: Optional[float] = None,
818
+ norm_eps: float = 1e-5,
819
+ axes_dim_rope: Tuple[int, int, int] = (32, 32, 32),
820
+ axes_lens: Tuple[int, int, int] = (300, 512, 512),
821
+ text_feat_dim: int = 1024,
822
+ timestep_scale: float = 1.0,
823
+ ) -> None:
824
+ super().__init__()
825
+
826
+ if (hidden_size // num_attention_heads) != sum(axes_dim_rope):
827
+ raise ValueError(
828
+ f"hidden_size // num_attention_heads ({hidden_size // num_attention_heads}) "
829
+ f"must equal sum(axes_dim_rope) ({sum(axes_dim_rope)})"
830
+ )
831
+
832
+ self.out_channels = out_channels or in_channels
833
+
834
+ self.rope_embedder = RotaryPosEmbed(
835
+ theta=10000,
836
+ axes_dim=axes_dim_rope,
837
+ axes_lens=axes_lens,
838
+ patch_size=patch_size,
839
+ )
840
+
841
+ self.x_embedder = nn.Linear(
842
+ in_features=patch_size * patch_size * in_channels,
843
+ out_features=hidden_size,
844
+ )
845
+
846
+ self.ref_image_patch_embedder = nn.Linear(
847
+ in_features=patch_size * patch_size * in_channels,
848
+ out_features=hidden_size,
849
+ )
850
+
851
+ self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
852
+ hidden_size=hidden_size,
853
+ text_feat_dim=text_feat_dim,
854
+ norm_eps=norm_eps,
855
+ timestep_scale=timestep_scale,
856
+ )
857
+
858
+ self.noise_refiner = nn.ModuleList([
859
+ TransformerBlock(
860
+ hidden_size, num_attention_heads, num_kv_heads,
861
+ multiple_of, ffn_dim_multiplier, norm_eps, modulation=True,
862
+ )
863
+ for _ in range(num_refiner_layers)
864
+ ])
865
+
866
+ self.ref_image_refiner = nn.ModuleList([
867
+ TransformerBlock(
868
+ hidden_size, num_attention_heads, num_kv_heads,
869
+ multiple_of, ffn_dim_multiplier, norm_eps, modulation=True,
870
+ )
871
+ for _ in range(num_refiner_layers)
872
+ ])
873
+
874
+ self.context_refiner = nn.ModuleList([
875
+ TransformerBlock(
876
+ hidden_size, num_attention_heads, num_kv_heads,
877
+ multiple_of, ffn_dim_multiplier, norm_eps, modulation=False,
878
+ )
879
+ for _ in range(num_refiner_layers)
880
+ ])
881
+
882
+ self.layers = nn.ModuleList([
883
+ TransformerBlock(
884
+ hidden_size, num_attention_heads, num_kv_heads,
885
+ multiple_of, ffn_dim_multiplier, norm_eps, modulation=True,
886
+ )
887
+ for _ in range(num_layers)
888
+ ])
889
+
890
+ self.norm_out = LuminaLayerNormContinuous(
891
+ embedding_dim=hidden_size,
892
+ conditioning_embedding_dim=min(hidden_size, 1024),
893
+ elementwise_affine=False,
894
+ eps=1e-6,
895
+ bias=True,
896
+ out_dim=patch_size * patch_size * self.out_channels,
897
+ )
898
+
899
+ self.image_index_embedding = nn.Parameter(torch.randn(5, hidden_size))
900
+
901
+ self.gradient_checkpointing = False
902
+
903
+ self.initialize_weights()
904
+
905
+ self.enable_teacache = False
906
+ self.teacache_rel_l1_thresh = 0.05
907
+ self.teacache_params = TeaCacheParams()
908
+
909
+ coefficients = [-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487]
910
+ self.rescale_func = np.poly1d(coefficients)
911
+
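The constructor enforces that the per-head channel count equals the sum of the three RoPE axis dimensions, because the rotary embedding splits each head's channels across three position axes (presumably sequence, height, and width, given `axes_lens=(300, 512, 512)`). With the defaults above this works out to 2304 // 24 = 96 = 32 + 32 + 32. A tiny check using only the default values shown in the signature:

```python
# Sanity check of the default configuration above.
hidden_size = 2304
num_attention_heads = 24
axes_dim_rope = (32, 32, 32)

head_dim = hidden_size // num_attention_heads
assert head_dim == sum(axes_dim_rope), (head_dim, sum(axes_dim_rope))
print(head_dim)  # 96: each head's channels are split 32/32/32 across the three RoPE axes
```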
912
+ def initialize_weights(self) -> None:
913
+ nn.init.xavier_uniform_(self.x_embedder.weight)
914
+ nn.init.constant_(self.x_embedder.bias, 0.0)
915
+
916
+ nn.init.xavier_uniform_(self.ref_image_patch_embedder.weight)
917
+ nn.init.constant_(self.ref_image_patch_embedder.bias, 0.0)
918
+
919
+ nn.init.zeros_(self.norm_out.linear_1.weight)
920
+ nn.init.zeros_(self.norm_out.linear_1.bias)
921
+ nn.init.zeros_(self.norm_out.linear_2.weight)
922
+ nn.init.zeros_(self.norm_out.linear_2.bias)
923
+
924
+ nn.init.normal_(self.image_index_embedding, std=0.02)
925
+
926
+ def img_patch_embed_and_refine(
927
+ self,
928
+ hidden_states,
929
+ ref_image_hidden_states,
930
+ padded_img_mask,
931
+ padded_ref_img_mask,
932
+ noise_rotary_emb,
933
+ ref_img_rotary_emb,
934
+ l_effective_ref_img_len,
935
+ l_effective_img_len,
936
+ temb,
937
+ ):
938
+ batch_size = len(hidden_states)
939
+ max_combined_img_len = max([
940
+ img_len + sum(ref_img_len)
941
+ for img_len, ref_img_len in zip(l_effective_img_len, l_effective_ref_img_len)
942
+ ])
943
+
944
+ hidden_states = self.x_embedder(hidden_states)
945
+ ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states)
946
+
947
+ for i in range(batch_size):
948
+ shift = 0
949
+ for j, ref_img_len in enumerate(l_effective_ref_img_len[i]):
950
+ ref_image_hidden_states[i, shift:shift + ref_img_len, :] = (
951
+ ref_image_hidden_states[i, shift:shift + ref_img_len, :]
952
+ + self.image_index_embedding[j]
953
+ )
954
+ shift += ref_img_len
955
+
956
+ for layer in self.noise_refiner:
957
+ hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb)
958
+
959
+ flat_l_effective_ref_img_len = list(itertools.chain(*l_effective_ref_img_len))
960
+ num_ref_images = len(flat_l_effective_ref_img_len)
961
+ max_ref_img_len = max(flat_l_effective_ref_img_len)
962
+
963
+ batch_ref_img_mask = ref_image_hidden_states.new_zeros(num_ref_images, max_ref_img_len, dtype=torch.bool)
964
+ batch_ref_image_hidden_states = ref_image_hidden_states.new_zeros(
965
+ num_ref_images, max_ref_img_len, self.config.hidden_size
966
+ )
967
+ batch_ref_img_rotary_emb = hidden_states.new_zeros(
968
+ num_ref_images, max_ref_img_len, ref_img_rotary_emb.shape[-1], dtype=ref_img_rotary_emb.dtype
969
+ )
970
+ batch_temb = temb.new_zeros(num_ref_images, *temb.shape[1:], dtype=temb.dtype)
971
+
972
+ idx = 0
973
+ for i in range(batch_size):
974
+ shift = 0
975
+ for ref_img_len in l_effective_ref_img_len[i]:
976
+ batch_ref_img_mask[idx, :ref_img_len] = True
977
+ batch_ref_image_hidden_states[idx, :ref_img_len] = ref_image_hidden_states[i, shift:shift + ref_img_len]
978
+ batch_ref_img_rotary_emb[idx, :ref_img_len] = ref_img_rotary_emb[i, shift:shift + ref_img_len]
979
+ batch_temb[idx] = temb[i]
980
+ shift += ref_img_len
981
+ idx += 1
982
+
983
+ for layer in self.ref_image_refiner:
984
+ batch_ref_image_hidden_states = layer(
985
+ batch_ref_image_hidden_states, batch_ref_img_mask, batch_ref_img_rotary_emb, batch_temb
986
+ )
987
+
988
+ idx = 0
989
+ for i in range(batch_size):
990
+ shift = 0
991
+ for ref_img_len in l_effective_ref_img_len[i]:
992
+ ref_image_hidden_states[i, shift:shift + ref_img_len] = batch_ref_image_hidden_states[idx, :ref_img_len]
993
+ shift += ref_img_len
994
+ idx += 1
995
+
996
+ combined_img_hidden_states = hidden_states.new_zeros(
997
+ batch_size, max_combined_img_len, self.config.hidden_size
998
+ )
999
+ for i, (ref_img_len, img_len) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
1000
+ combined_img_hidden_states[i, :sum(ref_img_len)] = ref_image_hidden_states[i, :sum(ref_img_len)]
1001
+ combined_img_hidden_states[i, sum(ref_img_len):sum(ref_img_len) + img_len] = hidden_states[i, :img_len]
1002
+
1003
+ return combined_img_hidden_states
1004
+
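After both refiners have run, `img_patch_embed_and_refine` packs each sample's tokens as [all reference-image tokens, then noise-image tokens], right-padded to the longest combined length in the batch. A toy illustration of that packing with made-up lengths (constant stand-in tensors instead of real refined tokens):

```python
import torch

hidden = 4
l_effective_ref_img_len = [[3, 2], [4]]   # per-sample reference-image token counts (illustrative)
l_effective_img_len = [5, 6]              # per-sample noise-image token counts (illustrative)

max_len = max(img + sum(ref) for img, ref in zip(l_effective_img_len, l_effective_ref_img_len))
combined = torch.zeros(2, max_len, hidden)
for i, (ref, img) in enumerate(zip(l_effective_ref_img_len, l_effective_img_len)):
    ref_tokens = torch.ones(sum(ref), hidden)    # stand-in for refined reference tokens
    img_tokens = torch.full((img, hidden), 2.0)  # stand-in for refined noise tokens
    combined[i, :sum(ref)] = ref_tokens
    combined[i, sum(ref):sum(ref) + img] = img_tokens
print(combined.shape)  # torch.Size([2, 10, 4]); any trailing positions stay zero-padded
```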
1005
+ def flat_and_pad_to_seq(self, hidden_states, ref_image_hidden_states):
1006
+ batch_size = len(hidden_states)
1007
+ p = self.config.patch_size
1008
+ device = hidden_states[0].device
1009
+
1010
+ img_sizes = [(img.size(1), img.size(2)) for img in hidden_states]
1011
+ l_effective_img_len = [(H // p) * (W // p) for (H, W) in img_sizes]
1012
+
1013
+ if ref_image_hidden_states is not None and len(ref_image_hidden_states) > 0:
1014
+ ref_img_sizes = [
1015
+ [(img.size(1), img.size(2)) for img in imgs] if imgs is not None else None
1016
+ for imgs in ref_image_hidden_states
1017
+ ]
1018
+ l_effective_ref_img_len = [
1019
+ [(ref_img_size[0] // p) * (ref_img_size[1] // p) for ref_img_size in _ref_img_sizes]
1020
+ if _ref_img_sizes is not None else [0]
1021
+ for _ref_img_sizes in ref_img_sizes
1022
+ ]
1023
+ else:
1024
+ ref_img_sizes = [None for _ in range(batch_size)]
1025
+ l_effective_ref_img_len = [[0] for _ in range(batch_size)]
1026
+
1027
+ max_ref_img_len = max([sum(ref_img_len) for ref_img_len in l_effective_ref_img_len])
1028
+ max_img_len = max(l_effective_img_len)
1029
+
1030
+ flat_ref_img_hidden_states = []
1031
+ for i in range(batch_size):
1032
+ if ref_img_sizes[i] is not None:
1033
+ imgs = []
1034
+ for ref_img in ref_image_hidden_states[i]:
1035
+ C, H, W = ref_img.size()
1036
+ ref_img = rearrange(ref_img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
1037
+ imgs.append(ref_img)
1038
+ flat_ref_img_hidden_states.append(torch.cat(imgs, dim=0))
1039
+ else:
1040
+ flat_ref_img_hidden_states.append(None)
1041
+
1042
+ flat_hidden_states = []
1043
+ for i in range(batch_size):
1044
+ img = hidden_states[i]
1045
+ C, H, W = img.size()
1046
+ img = rearrange(img, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
1047
+ flat_hidden_states.append(img)
1048
+
1049
+ padded_ref_img_hidden_states = torch.zeros(
1050
+ batch_size, max_ref_img_len, flat_hidden_states[0].shape[-1],
1051
+ device=device, dtype=flat_hidden_states[0].dtype,
1052
+ )
1053
+ padded_ref_img_mask = torch.zeros(batch_size, max_ref_img_len, dtype=torch.bool, device=device)
1054
+ for i in range(batch_size):
1055
+ if ref_img_sizes[i] is not None:
1056
+ padded_ref_img_hidden_states[i, :sum(l_effective_ref_img_len[i])] = flat_ref_img_hidden_states[i]
1057
+ padded_ref_img_mask[i, :sum(l_effective_ref_img_len[i])] = True
1058
+
1059
+ padded_hidden_states = torch.zeros(
1060
+ batch_size, max_img_len, flat_hidden_states[0].shape[-1],
1061
+ device=device, dtype=flat_hidden_states[0].dtype,
1062
+ )
1063
+ padded_img_mask = torch.zeros(batch_size, max_img_len, dtype=torch.bool, device=device)
1064
+ for i in range(batch_size):
1065
+ padded_hidden_states[i, :l_effective_img_len[i]] = flat_hidden_states[i]
1066
+ padded_img_mask[i, :l_effective_img_len[i]] = True
1067
+
1068
+ return (
1069
+ padded_hidden_states,
1070
+ padded_ref_img_hidden_states,
1071
+ padded_img_mask,
1072
+ padded_ref_img_mask,
1073
+ l_effective_ref_img_len,
1074
+ l_effective_img_len,
1075
+ ref_img_sizes,
1076
+ img_sizes,
1077
+ )
1078
+
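`flat_and_pad_to_seq` turns each latent of shape (C, H, W) into a sequence of (H/p)·(W/p) patch tokens with p·p·C features each via the einops pattern used above, then right-pads every sample to the longest sequence in the batch. A minimal, standalone sketch of just the patchify step (shapes are illustrative):

```python
import torch
from einops import rearrange

p = 2                              # patch_size, as in the default config
latent = torch.randn(16, 8, 12)    # one sample's (C, H, W) latent
tokens = rearrange(latent, 'c (h p1) (w p2) -> (h w) (p1 p2 c)', p1=p, p2=p)
print(tokens.shape)                # torch.Size([24, 64]): (8/2)*(12/2) tokens, each 2*2*16 features
```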
1079
+ def forward(
1080
+ self,
1081
+ hidden_states: Union[torch.Tensor, List[torch.Tensor]],
1082
+ timestep: torch.Tensor,
1083
+ text_hidden_states: torch.Tensor,
1084
+ freqs_cis: torch.Tensor,
1085
+ text_attention_mask: torch.Tensor,
1086
+ ref_image_hidden_states: Optional[List[List[torch.Tensor]]] = None,
1087
+ attention_kwargs: Optional[Dict[str, Any]] = None,
1088
+ return_dict: bool = False,
1089
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
1090
+ enable_taylorseer = getattr(self, 'enable_taylorseer', False)
1091
+ if enable_taylorseer:
1092
+ cal_type(self.cache_dic, self.current)
1093
+
1094
+ if attention_kwargs is not None:
1095
+ attention_kwargs = attention_kwargs.copy()
1096
+ lora_scale = attention_kwargs.pop("scale", 1.0)
1097
+ else:
1098
+ lora_scale = 1.0
1099
+
1100
+ if USE_PEFT_BACKEND:
1101
+ scale_lora_layers(self, lora_scale)
1102
+ else:
1103
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
1104
+ logger.warning(
1105
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
1106
+ )
1107
+
1108
+ batch_size = len(hidden_states)
1109
+ is_hidden_states_tensor = isinstance(hidden_states, torch.Tensor)
1110
+
1111
+ if is_hidden_states_tensor:
1112
+ assert hidden_states.ndim == 4
1113
+ hidden_states = [_hidden_states for _hidden_states in hidden_states]
1114
+
1115
+ device = hidden_states[0].device
1116
+
1117
+ assert isinstance(text_hidden_states, torch.Tensor), \
1118
+ f"text_hidden_states must be Tensor, got {type(text_hidden_states)}. " \
1119
+ f"Check if freqs_cis and text_hidden_states are swapped in the caller."
1120
+
1121
+ temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
1122
+
1123
+ (
1124
+ hidden_states,
1125
+ ref_image_hidden_states,
1126
+ img_mask,
1127
+ ref_img_mask,
1128
+ l_effective_ref_img_len,
1129
+ l_effective_img_len,
1130
+ ref_img_sizes,
1131
+ img_sizes,
1132
+ ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
1133
+
1134
+ (
1135
+ context_rotary_emb,
1136
+ ref_img_rotary_emb,
1137
+ noise_rotary_emb,
1138
+ rotary_emb,
1139
+ encoder_seq_lengths,
1140
+ seq_lengths,
1141
+ ) = self.rope_embedder(
1142
+ freqs_cis,
1143
+ text_attention_mask,
1144
+ l_effective_ref_img_len,
1145
+ l_effective_img_len,
1146
+ ref_img_sizes,
1147
+ img_sizes,
1148
+ device,
1149
+ )
1150
+
1151
+ # 2. Context refinement
1152
+ for layer in self.context_refiner:
1153
+ text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb)
1154
+
1155
+ combined_img_hidden_states = self.img_patch_embed_and_refine(
1156
+ hidden_states,
1157
+ ref_image_hidden_states,
1158
+ img_mask,
1159
+ ref_img_mask,
1160
+ noise_rotary_emb,
1161
+ ref_img_rotary_emb,
1162
+ l_effective_ref_img_len,
1163
+ l_effective_img_len,
1164
+ temb,
1165
+ )
1166
+
1167
+ # 3. Joint Transformer blocks
1168
+ max_seq_len = max(seq_lengths)
1169
+
1170
+ attention_mask = hidden_states.new_zeros(batch_size, max_seq_len, dtype=torch.bool)
1171
+ joint_hidden_states = hidden_states.new_zeros(batch_size, max_seq_len, self.config.hidden_size)
1172
+ for i, (encoder_seq_len, seq_len) in enumerate(zip(encoder_seq_lengths, seq_lengths)):
1173
+ attention_mask[i, :seq_len] = True
1174
+ joint_hidden_states[i, :encoder_seq_len] = text_hidden_states[i, :encoder_seq_len]
1175
+ joint_hidden_states[i, encoder_seq_len:seq_len] = combined_img_hidden_states[i, :seq_len - encoder_seq_len]
1176
+
1177
+ hidden_states = joint_hidden_states
1178
+
1179
+ if self.enable_teacache:
1180
+ teacache_hidden_states = hidden_states.clone()
1181
+ teacache_temb = temb.clone()
1182
+ modulated_inp, _, _, _ = self.layers[0].norm1(teacache_hidden_states, teacache_temb)
1183
+ if self.teacache_params.is_first_or_last_step:
1184
+ should_calc = True
1185
+ self.teacache_params.accumulated_rel_l1_distance = 0
1186
+ else:
1187
+ self.teacache_params.accumulated_rel_l1_distance += self.rescale_func(
1188
+ ((modulated_inp - self.teacache_params.previous_modulated_inp).abs().mean()
1189
+ / self.teacache_params.previous_modulated_inp.abs().mean()).cpu().item()
1190
+ )
1191
+ if self.teacache_params.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
1192
+ should_calc = False
1193
+ else:
1194
+ should_calc = True
1195
+ self.teacache_params.accumulated_rel_l1_distance = 0
1196
+ self.teacache_params.previous_modulated_inp = modulated_inp
1197
+
1198
+ if self.enable_teacache:
1199
+ if not should_calc:
1200
+ hidden_states += self.teacache_params.previous_residual
1201
+ else:
1202
+ ori_hidden_states = hidden_states.clone()
1203
+ for layer_idx, layer in enumerate(self.layers):
1204
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
1205
+ hidden_states = self._gradient_checkpointing_func(
1206
+ layer, hidden_states, attention_mask, rotary_emb, temb
1207
+ )
1208
+ else:
1209
+ hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
1210
+ self.teacache_params.previous_residual = hidden_states - ori_hidden_states
1211
+ else:
1212
+ if enable_taylorseer:
1213
+ self.current['stream'] = 'layers_stream'
1214
+
1215
+ for layer_idx, layer in enumerate(self.layers):
1216
+ if enable_taylorseer:
1217
+ layer.current = self.current
1218
+ layer.cache_dic = self.cache_dic
1219
+ layer.enable_taylorseer = True
1220
+ self.current['layer'] = layer_idx
1221
+
1222
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
1223
+ hidden_states = self._gradient_checkpointing_func(
1224
+ layer, hidden_states, attention_mask, rotary_emb, temb
1225
+ )
1226
+ else:
1227
+ hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb)
1228
+
1229
+ hidden_states = self.norm_out(hidden_states, temb)
1230
+
1231
+ p = self.config.patch_size
1232
+ output = []
1233
+ for i, (img_size, img_len, seq_len) in enumerate(zip(img_sizes, l_effective_img_len, seq_lengths)):
1234
+ height, width = img_size
1235
+ output.append(rearrange(
1236
+ hidden_states[i][seq_len - img_len:seq_len],
1237
+ '(h w) (p1 p2 c) -> c (h p1) (w p2)',
1238
+ h=height // p, w=width // p, p1=p, p2=p,
1239
+ ))
1240
+ if is_hidden_states_tensor:
1241
+ output = torch.stack(output, dim=0)
1242
+
1243
+ if USE_PEFT_BACKEND:
1244
+ unscale_lora_layers(self, lora_scale)
1245
+
1246
+ if enable_taylorseer:
1247
+ self.current['step'] += 1
1248
+
1249
+ if not return_dict:
1250
+ return output
1251
+ return Transformer2DModelOutput(sample=output)
1252
+
1253
+
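The `enable_teacache` branch in `forward` implements a TeaCache-style layer skip: at each step it measures how much the first block's modulated input changed relative to the previous step, maps that through the fitted rescale polynomial, and accumulates the result; while the accumulated value stays below `teacache_rel_l1_thresh`, the whole transformer stack is skipped and the previously cached residual is re-added instead. Below is a condensed, self-contained sketch of that decision rule; the polynomial coefficients are copied from above, everything else (names, shapes, the `run_layers` callable) is illustrative.

```python
import numpy as np
import torch

rescale = np.poly1d([-5.48259225, 11.48772289, -4.47407401, 2.47730926, -0.03316487])

class TeaCacheSketch:
    def __init__(self, thresh: float = 0.05):
        self.thresh = thresh
        self.accumulated = 0.0
        self.prev_inp = None        # modulated input from the previous step
        self.prev_residual = None   # hidden-state change produced by the full stack

    def step(self, modulated_inp, hidden_states, run_layers, first_or_last: bool):
        if first_or_last or self.prev_inp is None:
            should_calc = True
            self.accumulated = 0.0
        else:
            rel = ((modulated_inp - self.prev_inp).abs().mean() / self.prev_inp.abs().mean()).item()
            self.accumulated += float(rescale(rel))
            should_calc = self.accumulated >= self.thresh
            if should_calc:
                self.accumulated = 0.0
        self.prev_inp = modulated_inp

        if not should_calc:
            return hidden_states + self.prev_residual   # reuse cached residual, skip the layers
        out = run_layers(hidden_states)                  # run the full transformer stack
        self.prev_residual = out - hidden_states
        return out

# Toy usage: a dummy "stack" that slightly perturbs its input.
cache = TeaCacheSketch(thresh=0.05)
h = torch.randn(1, 8, 16)
for step in range(4):
    h = cache.step(h.clone(), h, lambda x: x * 1.01, first_or_last=(step == 0))
```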
1254
+ # ---------------------------------------------------------------------------
1255
+ # FlowMatch Euler Discrete Scheduler (merged from scheduling_flow_match_euler_discrete.py)
1256
+ # ---------------------------------------------------------------------------
1257
+
1258
+ @dataclass
1259
+ class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
1260
+ prev_sample: torch.FloatTensor
1261
+
1262
+
1263
+ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
1264
+ _compatibles = []
1265
+ order = 1
1266
+
1267
+ @register_to_config
1268
+ def __init__(self, num_train_timesteps: int = 1000, dynamic_time_shift: bool = False):
1269
+ timesteps = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float32)[:-1]
1270
+ self.timesteps = timesteps
1271
+ self._step_index = None
1272
+ self._begin_index = None
1273
+
1274
+ @property
1275
+ def step_index(self):
1276
+ return self._step_index
1277
+
1278
+ @property
1279
+ def begin_index(self):
1280
+ return self._begin_index
1281
+
1282
+ def set_begin_index(self, begin_index: int = 0):
1283
+ self._begin_index = begin_index
1284
+
1285
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
1286
+ if schedule_timesteps is None:
1287
+ schedule_timesteps = self._timesteps
1288
+ indices = (schedule_timesteps == timestep).nonzero()
1289
+ pos = 1 if len(indices) > 1 else 0
1290
+ return indices[pos].item()
1291
+
1292
+ def set_timesteps(self, num_inference_steps=None, device=None, timesteps=None, num_tokens=None):
1293
+ if timesteps is None:
1294
+ self.num_inference_steps = num_inference_steps
1295
+ timesteps = np.linspace(0, 1, num_inference_steps + 1, dtype=np.float32)[:-1]
1296
+ if self.config.dynamic_time_shift and num_tokens is not None:
1297
+ m = np.sqrt(num_tokens) / 40
1298
+ timesteps = timesteps / (m - m * timesteps + timesteps)
1299
+ timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device)
1300
+ _timesteps = torch.cat([timesteps, torch.ones(1, device=timesteps.device)])
1301
+ self.timesteps = timesteps
1302
+ self._timesteps = _timesteps
1303
+ self._step_index = None
1304
+ self._begin_index = None
1305
+
1306
+ def _init_step_index(self, timestep):
1307
+ if self.begin_index is None:
1308
+ if isinstance(timestep, torch.Tensor):
1309
+ timestep = timestep.to(self.timesteps.device)
1310
+ self._step_index = self.index_for_timestep(timestep)
1311
+ else:
1312
+ self._step_index = self._begin_index
1313
+
1314
+ def step(self, model_output, timestep, sample, generator=None, return_dict=True):
1315
+ if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
1316
+ raise ValueError("Pass scheduler.timesteps values, not integer indices.")
1317
+ if self.step_index is None:
1318
+ self._init_step_index(timestep)
1319
+ sample = sample.to(torch.float32)
1320
+ t = self._timesteps[self.step_index]
1321
+ t_next = self._timesteps[self.step_index + 1]
1322
+ prev_sample = sample + (t_next - t) * model_output
1323
+ prev_sample = prev_sample.to(model_output.dtype)
1324
+ self._step_index += 1
1325
+ if not return_dict:
1326
+ return (prev_sample,)
1327
+ return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
1328
+
1329
+ def __len__(self):
1330
+ return self.config.num_train_timesteps
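The scheduler integrates the flow-matching ODE with plain Euler steps: given a velocity prediction v at time t, the update is x(t_next) = x(t) + (t_next - t)·v. When `dynamic_time_shift` is enabled and `num_tokens` is passed to `set_timesteps`, the timestep grid is warped as t ← t / (m - m·t + t) with m = sqrt(num_tokens) / 40. A hedged sketch of how a caller might drive it; the model call is a stand-in, not this repository's pipeline API, and the scheduler class defined above is assumed to be importable in scope.

```python
import torch

# Assumes FlowMatchEulerDiscreteScheduler (defined above) is available in scope.
scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, dynamic_time_shift=True)
scheduler.set_timesteps(num_inference_steps=30, device="cpu", num_tokens=64 * 64)

sample = torch.randn(1, 16, 64, 64)      # latent state (illustrative shape)
for t in scheduler.timesteps:
    velocity = -sample                   # stand-in for transformer(sample, t, ...)
    sample = scheduler.step(velocity, t, sample).prev_sample
```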
requirements-post.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ flash-attn==2.7.4.post1
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch==2.6.0
2
+ torchvision==0.21.0
3
+ torchaudio==2.6.0
4
+ accelerate==1.10.0
5
+ transformers==4.57.6
6
+ librosa==0.11.0
7
+ diffusers==0.34.0
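These pins presumably reflect the versions the merged model file was developed against; `flash-attn` sits in a separate `requirements-post.txt`, most likely because it must be compiled against the already-installed torch build. A small, hedged post-install sanity check one might run, assuming only the packages listed here:

```python
# Print installed versions against the pins above (no assertions, local builds may carry suffixes).
import torch, torchvision, transformers, diffusers, accelerate, librosa

for mod, want in [(torch, "2.6.0"), (torchvision, "0.21.0"), (transformers, "4.57.6"),
                  (diffusers, "0.34.0"), (accelerate, "1.10.0"), (librosa, "0.11.0")]:
    print(mod.__name__, mod.__version__, "(expected", want + ")")

print("CUDA available:", torch.cuda.is_available())
```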
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2294 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": true,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<longcat_unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<longcat_s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</longcat_s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "<longcat_pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "4": {
39
+ "content": "<shift_unk>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "5": {
47
+ "content": "<shift_s>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "6": {
55
+ "content": "</shift_s>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "7": {
63
+ "content": "<shift_pad>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "8": {
71
+ "content": "<mask_0>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "9": {
79
+ "content": "<reponame>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "10": {
87
+ "content": "<filename>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "11": {
95
+ "content": "<gh_stars>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "12": {
103
+ "content": "<issue_start>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "13": {
111
+ "content": "<issue_comment>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "14": {
119
+ "content": "<issue_closed>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "15": {
127
+ "content": "<jupyter_start>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": true
133
+ },
134
+ "16": {
135
+ "content": "<jupyter_text>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": true
141
+ },
142
+ "17": {
143
+ "content": "<jupyter_code>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": true
149
+ },
150
+ "18": {
151
+ "content": "<jupyter_output>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": true
157
+ },
158
+ "19": {
159
+ "content": "<empty_output>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": true
165
+ },
166
+ "20": {
167
+ "content": "<commit_before>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": true
173
+ },
174
+ "21": {
175
+ "content": "<commit_msg>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ },
182
+ "22": {
183
+ "content": "<commit_after>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "23": {
191
+ "content": "<program_lang>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "24": {
199
+ "content": "<|image_placeholder|>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "25": {
207
+ "content": "<|url_placeholder|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "26": {
215
+ "content": "<|hyperlink_placeholder|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "27": {
223
+ "content": "<|table_placeholder|>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "28": {
231
+ "content": "<|equation_placeholder|>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "29": {
239
+ "content": "<|code_placeholder|>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "30": {
247
+ "content": "<|reference_placeholder|>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "31": {
255
+ "content": "<|endoftext|>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "32": {
263
+ "content": "<fim_prefix>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "33": {
271
+ "content": "<fim_middle>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "34": {
279
+ "content": "<fim_suffix>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "35": {
287
+ "content": "<fim_pad>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "36": {
295
+ "content": "<longcat_think>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": false
301
+ },
302
+ "37": {
303
+ "content": "</longcat_think>",
304
+ "lstrip": false,
305
+ "normalized": false,
306
+ "rstrip": false,
307
+ "single_word": false,
308
+ "special": false
309
+ },
310
+ "38": {
311
+ "content": "<longcat_answer>",
312
+ "lstrip": false,
313
+ "normalized": false,
314
+ "rstrip": false,
315
+ "single_word": false,
316
+ "special": false
317
+ },
318
+ "39": {
319
+ "content": "</longcat_answer>",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false,
324
+ "special": false
325
+ },
326
+ "40": {
327
+ "content": "<longcat_files>",
328
+ "lstrip": false,
329
+ "normalized": false,
330
+ "rstrip": false,
331
+ "single_word": false,
332
+ "special": false
333
+ },
334
+ "41": {
335
+ "content": "</longcat_files>",
336
+ "lstrip": false,
337
+ "normalized": false,
338
+ "rstrip": false,
339
+ "single_word": false,
340
+ "special": false
341
+ },
342
+ "42": {
343
+ "content": "<longcat_tool_call>",
344
+ "lstrip": false,
345
+ "normalized": false,
346
+ "rstrip": false,
347
+ "single_word": false,
348
+ "special": false
349
+ },
350
+ "43": {
351
+ "content": "</longcat_tool_call>",
352
+ "lstrip": false,
353
+ "normalized": false,
354
+ "rstrip": false,
355
+ "single_word": false,
356
+ "special": false
357
+ },
358
+ "44": {
359
+ "content": "<longcat_tool_declare>",
360
+ "lstrip": false,
361
+ "normalized": false,
362
+ "rstrip": false,
363
+ "single_word": false,
364
+ "special": true
365
+ },
366
+ "45": {
367
+ "content": "</longcat_tool_declare>",
368
+ "lstrip": false,
369
+ "normalized": false,
370
+ "rstrip": false,
371
+ "single_word": false,
372
+ "special": true
373
+ },
374
+ "46": {
375
+ "content": "<longcat_system>",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false,
380
+ "special": true
381
+ },
382
+ "47": {
383
+ "content": "<longcat_user>",
384
+ "lstrip": false,
385
+ "normalized": false,
386
+ "rstrip": false,
387
+ "single_word": false,
388
+ "special": true
389
+ },
390
+ "48": {
391
+ "content": "<longcat_assistant>",
392
+ "lstrip": false,
393
+ "normalized": false,
394
+ "rstrip": false,
395
+ "single_word": false,
396
+ "special": true
397
+ },
398
+ "49": {
399
+ "content": "<longcat_tool_response>",
400
+ "lstrip": false,
401
+ "normalized": false,
402
+ "rstrip": false,
403
+ "single_word": false,
404
+ "special": false
405
+ },
406
+ "50": {
407
+ "content": "</longcat_tool_response>",
408
+ "lstrip": false,
409
+ "normalized": false,
410
+ "rstrip": false,
411
+ "single_word": false,
412
+ "special": false
413
+ },
414
+ "51": {
415
+ "content": "<longcat_arg_key>",
416
+ "lstrip": false,
417
+ "normalized": false,
418
+ "rstrip": false,
419
+ "single_word": false,
420
+ "special": false
421
+ },
422
+ "52": {
423
+ "content": "</longcat_arg_key>",
424
+ "lstrip": false,
425
+ "normalized": false,
426
+ "rstrip": false,
427
+ "single_word": false,
428
+ "special": false
429
+ },
430
+ "53": {
431
+ "content": "<longcat_arg_value>",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false,
436
+ "special": false
437
+ },
438
+ "54": {
439
+ "content": "</longcat_arg_value>",
440
+ "lstrip": false,
441
+ "normalized": false,
442
+ "rstrip": false,
443
+ "single_word": false,
444
+ "special": false
445
+ },
446
+ "55": {
447
+ "content": "<mask_31>",
448
+ "lstrip": false,
449
+ "normalized": false,
450
+ "rstrip": false,
451
+ "single_word": false,
452
+ "special": true
453
+ },
454
+ "56": {
455
+ "content": "<mask_32>",
456
+ "lstrip": false,
457
+ "normalized": false,
458
+ "rstrip": false,
459
+ "single_word": false,
460
+ "special": true
461
+ },
462
+ "57": {
463
+ "content": "<mask_33>",
464
+ "lstrip": false,
465
+ "normalized": false,
466
+ "rstrip": false,
467
+ "single_word": false,
468
+ "special": true
469
+ },
470
+ "58": {
471
+ "content": "<mask_34>",
472
+ "lstrip": false,
473
+ "normalized": false,
474
+ "rstrip": false,
475
+ "single_word": false,
476
+ "special": true
477
+ },
478
+ "59": {
479
+ "content": "<mask_35>",
480
+ "lstrip": false,
481
+ "normalized": false,
482
+ "rstrip": false,
483
+ "single_word": false,
484
+ "special": true
485
+ },
486
+ "60": {
487
+ "content": "<mask_36>",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false,
492
+ "special": true
493
+ },
494
+ "61": {
495
+ "content": "<mask_37>",
496
+ "lstrip": false,
497
+ "normalized": false,
498
+ "rstrip": false,
499
+ "single_word": false,
500
+ "special": true
501
+ },
502
+ "62": {
503
+ "content": "<mask_38>",
504
+ "lstrip": false,
505
+ "normalized": false,
506
+ "rstrip": false,
507
+ "single_word": false,
508
+ "special": true
509
+ },
510
+ "63": {
511
+ "content": "<mask_39>",
512
+ "lstrip": false,
513
+ "normalized": false,
514
+ "rstrip": false,
515
+ "single_word": false,
516
+ "special": true
517
+ },
518
+ "64": {
519
+ "content": "<mask_40>",
520
+ "lstrip": false,
521
+ "normalized": false,
522
+ "rstrip": false,
523
+ "single_word": false,
524
+ "special": true
525
+ },
526
+ "65": {
527
+ "content": "<mask_41>",
528
+ "lstrip": false,
529
+ "normalized": false,
530
+ "rstrip": false,
531
+ "single_word": false,
532
+ "special": true
533
+ },
534
+ "66": {
535
+ "content": "<mask_42>",
536
+ "lstrip": false,
537
+ "normalized": false,
538
+ "rstrip": false,
539
+ "single_word": false,
540
+ "special": true
541
+ },
542
+ "67": {
543
+ "content": "<mask_43>",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false,
548
+ "special": true
549
+ },
550
+ "68": {
551
+ "content": "<mask_44>",
552
+ "lstrip": false,
553
+ "normalized": false,
554
+ "rstrip": false,
555
+ "single_word": false,
556
+ "special": true
557
+ },
558
+ "69": {
559
+ "content": "<mask_45>",
560
+ "lstrip": false,
561
+ "normalized": false,
562
+ "rstrip": false,
563
+ "single_word": false,
564
+ "special": true
565
+ },
566
+ "70": {
567
+ "content": "<mask_46>",
568
+ "lstrip": false,
569
+ "normalized": false,
570
+ "rstrip": false,
571
+ "single_word": false,
572
+ "special": true
573
+ },
574
+ "71": {
575
+ "content": "<mask_47>",
576
+ "lstrip": false,
577
+ "normalized": false,
578
+ "rstrip": false,
579
+ "single_word": false,
580
+ "special": true
581
+ },
582
+ "72": {
583
+ "content": "<mask_48>",
584
+ "lstrip": false,
585
+ "normalized": false,
586
+ "rstrip": false,
587
+ "single_word": false,
588
+ "special": true
589
+ },
590
+ "73": {
591
+ "content": "<mask_49>",
592
+ "lstrip": false,
593
+ "normalized": false,
594
+ "rstrip": false,
595
+ "single_word": false,
596
+ "special": true
597
+ },
598
+ "74": {
599
+ "content": "<mask_50>",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false,
604
+ "special": true
605
+ },
606
+ "75": {
607
+ "content": "<mask_51>",
608
+ "lstrip": false,
609
+ "normalized": false,
610
+ "rstrip": false,
611
+ "single_word": false,
612
+ "special": true
613
+ },
614
+ "76": {
615
+ "content": "<mask_52>",
616
+ "lstrip": false,
617
+ "normalized": false,
618
+ "rstrip": false,
619
+ "single_word": false,
620
+ "special": true
621
+ },
622
+ "77": {
623
+ "content": "<mask_53>",
624
+ "lstrip": false,
625
+ "normalized": false,
626
+ "rstrip": false,
627
+ "single_word": false,
628
+ "special": true
629
+ },
630
+ "78": {
631
+ "content": "<mask_54>",
632
+ "lstrip": false,
633
+ "normalized": false,
634
+ "rstrip": false,
635
+ "single_word": false,
636
+ "special": true
637
+ },
638
+ "79": {
639
+ "content": "<mask_55>",
640
+ "lstrip": false,
641
+ "normalized": false,
642
+ "rstrip": false,
643
+ "single_word": false,
644
+ "special": true
645
+ },
646
+ "80": {
647
+ "content": "<mask_56>",
648
+ "lstrip": false,
649
+ "normalized": false,
650
+ "rstrip": false,
651
+ "single_word": false,
652
+ "special": true
653
+ },
654
+ "81": {
655
+ "content": "<mask_57>",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false,
660
+ "special": true
661
+ },
662
+ "82": {
663
+ "content": "<mask_58>",
664
+ "lstrip": false,
665
+ "normalized": false,
666
+ "rstrip": false,
667
+ "single_word": false,
668
+ "special": true
669
+ },
670
+ "83": {
671
+ "content": "<mask_59>",
672
+ "lstrip": false,
673
+ "normalized": false,
674
+ "rstrip": false,
675
+ "single_word": false,
676
+ "special": true
677
+ },
678
+ "84": {
679
+ "content": "<mask_60>",
680
+ "lstrip": false,
681
+ "normalized": false,
682
+ "rstrip": false,
683
+ "single_word": false,
684
+ "special": true
685
+ },
686
+ "85": {
687
+ "content": "<mask_61>",
688
+ "lstrip": false,
689
+ "normalized": false,
690
+ "rstrip": false,
691
+ "single_word": false,
692
+ "special": true
693
+ },
694
+ "86": {
695
+ "content": "<mask_62>",
696
+ "lstrip": false,
697
+ "normalized": false,
698
+ "rstrip": false,
699
+ "single_word": false,
700
+ "special": true
701
+ },
702
+ "87": {
703
+ "content": "<mask_63>",
704
+ "lstrip": false,
705
+ "normalized": false,
706
+ "rstrip": false,
707
+ "single_word": false,
708
+ "special": true
709
+ },
710
+ "88": {
711
+ "content": "<mask_64>",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false,
716
+ "special": true
717
+ },
718
+ "89": {
719
+ "content": "<mask_65>",
720
+ "lstrip": false,
721
+ "normalized": false,
722
+ "rstrip": false,
723
+ "single_word": false,
724
+ "special": true
725
+ },
726
+ "90": {
727
+ "content": "<mask_66>",
728
+ "lstrip": false,
729
+ "normalized": false,
730
+ "rstrip": false,
731
+ "single_word": false,
732
+ "special": true
733
+ },
734
+ "91": {
735
+ "content": "<mask_67>",
736
+ "lstrip": false,
737
+ "normalized": false,
738
+ "rstrip": false,
739
+ "single_word": false,
740
+ "special": true
741
+ },
742
+ "92": {
743
+ "content": "<mask_68>",
744
+ "lstrip": false,
745
+ "normalized": false,
746
+ "rstrip": false,
747
+ "single_word": false,
748
+ "special": true
749
+ },
750
+ "93": {
751
+ "content": "<mask_69>",
752
+ "lstrip": false,
753
+ "normalized": false,
754
+ "rstrip": false,
755
+ "single_word": false,
756
+ "special": true
757
+ },
758
+ "94": {
759
+ "content": "<mask_70>",
760
+ "lstrip": false,
761
+ "normalized": false,
762
+ "rstrip": false,
763
+ "single_word": false,
764
+ "special": true
765
+ },
766
+ "95": {
767
+ "content": "<mask_71>",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false,
772
+ "special": true
773
+ },
774
+ "96": {
775
+ "content": "<mask_72>",
776
+ "lstrip": false,
777
+ "normalized": false,
778
+ "rstrip": false,
779
+ "single_word": false,
780
+ "special": true
781
+ },
782
+ "97": {
783
+ "content": "<mask_73>",
784
+ "lstrip": false,
785
+ "normalized": false,
786
+ "rstrip": false,
787
+ "single_word": false,
788
+ "special": true
789
+ },
790
+ "98": {
791
+ "content": "<mask_74>",
792
+ "lstrip": false,
793
+ "normalized": false,
794
+ "rstrip": false,
795
+ "single_word": false,
796
+ "special": true
797
+ },
798
+ "99": {
799
+ "content": "<mask_75>",
800
+ "lstrip": false,
801
+ "normalized": false,
802
+ "rstrip": false,
803
+ "single_word": false,
804
+ "special": true
805
+ },
806
+ "100": {
807
+ "content": "<mask_76>",
808
+ "lstrip": false,
809
+ "normalized": false,
810
+ "rstrip": false,
811
+ "single_word": false,
812
+ "special": true
813
+ },
814
+ "101": {
815
+ "content": "<mask_77>",
816
+ "lstrip": false,
817
+ "normalized": false,
818
+ "rstrip": false,
819
+ "single_word": false,
820
+ "special": true
821
+ },
822
+ "102": {
823
+ "content": "<mask_78>",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false,
828
+ "special": true
829
+ },
830
+ "103": {
831
+ "content": "<mask_79>",
832
+ "lstrip": false,
833
+ "normalized": false,
834
+ "rstrip": false,
835
+ "single_word": false,
836
+ "special": true
837
+ },
838
+ "104": {
839
+ "content": "<mask_80>",
840
+ "lstrip": false,
841
+ "normalized": false,
842
+ "rstrip": false,
843
+ "single_word": false,
844
+ "special": true
845
+ },
846
+ "105": {
847
+ "content": "<mask_81>",
848
+ "lstrip": false,
849
+ "normalized": false,
850
+ "rstrip": false,
851
+ "single_word": false,
852
+ "special": true
853
+ },
854
+ "106": {
855
+ "content": "<mask_82>",
856
+ "lstrip": false,
857
+ "normalized": false,
858
+ "rstrip": false,
859
+ "single_word": false,
860
+ "special": true
861
+ },
862
+ "107": {
863
+ "content": "<mask_83>",
864
+ "lstrip": false,
865
+ "normalized": false,
866
+ "rstrip": false,
867
+ "single_word": false,
868
+ "special": true
869
+ },
870
+ "108": {
871
+ "content": "<mask_84>",
872
+ "lstrip": false,
873
+ "normalized": false,
874
+ "rstrip": false,
875
+ "single_word": false,
876
+ "special": true
877
+ },
878
+ "109": {
879
+ "content": "<mask_85>",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false,
884
+ "special": true
885
+ },
886
+ "110": {
887
+ "content": "<mask_86>",
888
+ "lstrip": false,
889
+ "normalized": false,
890
+ "rstrip": false,
891
+ "single_word": false,
892
+ "special": true
893
+ },
894
+ "111": {
895
+ "content": "<mask_87>",
896
+ "lstrip": false,
897
+ "normalized": false,
898
+ "rstrip": false,
899
+ "single_word": false,
900
+ "special": true
901
+ },
902
+ "112": {
903
+ "content": "<mask_88>",
904
+ "lstrip": false,
905
+ "normalized": false,
906
+ "rstrip": false,
907
+ "single_word": false,
908
+ "special": true
909
+ },
910
+ "113": {
911
+ "content": "<mask_89>",
912
+ "lstrip": false,
913
+ "normalized": false,
914
+ "rstrip": false,
915
+ "single_word": false,
916
+ "special": true
917
+ },
918
+ "114": {
919
+ "content": "<mask_90>",
920
+ "lstrip": false,
921
+ "normalized": false,
922
+ "rstrip": false,
923
+ "single_word": false,
924
+ "special": true
925
+ },
926
+ "115": {
927
+ "content": "<mask_91>",
928
+ "lstrip": false,
929
+ "normalized": false,
930
+ "rstrip": false,
931
+ "single_word": false,
932
+ "special": true
933
+ },
934
+ "116": {
935
+ "content": "<mask_92>",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false,
940
+ "special": true
941
+ },
942
+ "117": {
943
+ "content": "<mask_93>",
944
+ "lstrip": false,
945
+ "normalized": false,
946
+ "rstrip": false,
947
+ "single_word": false,
948
+ "special": true
949
+ },
950
+ "118": {
951
+ "content": "<mask_94>",
952
+ "lstrip": false,
953
+ "normalized": false,
954
+ "rstrip": false,
955
+ "single_word": false,
956
+ "special": true
957
+ },
958
+ "119": {
959
+ "content": "<mask_95>",
960
+ "lstrip": false,
961
+ "normalized": false,
962
+ "rstrip": false,
963
+ "single_word": false,
964
+ "special": true
965
+ },
966
+ "120": {
967
+ "content": "<mask_96>",
968
+ "lstrip": false,
969
+ "normalized": false,
970
+ "rstrip": false,
971
+ "single_word": false,
972
+ "special": true
973
+ },
974
+ "121": {
975
+ "content": "<mask_97>",
976
+ "lstrip": false,
977
+ "normalized": false,
978
+ "rstrip": false,
979
+ "single_word": false,
980
+ "special": true
981
+ },
982
+ "122": {
983
+ "content": "<mask_98>",
984
+ "lstrip": false,
985
+ "normalized": false,
986
+ "rstrip": false,
987
+ "single_word": false,
988
+ "special": true
989
+ },
990
+ "123": {
991
+ "content": "<mask_99>",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false,
996
+ "special": true
997
+ },
998
+ "124": {
999
+ "content": "<mask_100>",
1000
+ "lstrip": false,
1001
+ "normalized": false,
1002
+ "rstrip": false,
1003
+ "single_word": false,
1004
+ "special": true
1005
+ },
1006
+ "125": {
1007
+ "content": "<mask_101>",
1008
+ "lstrip": false,
1009
+ "normalized": false,
1010
+ "rstrip": false,
1011
+ "single_word": false,
1012
+ "special": true
1013
+ },
1014
+ "126": {
1015
+ "content": "<mask_102>",
1016
+ "lstrip": false,
1017
+ "normalized": false,
1018
+ "rstrip": false,
1019
+ "single_word": false,
1020
+ "special": true
1021
+ },
1022
+ "127": {
1023
+ "content": "<mask_103>",
1024
+ "lstrip": false,
1025
+ "normalized": false,
1026
+ "rstrip": false,
1027
+ "single_word": false,
1028
+ "special": true
1029
+ },
1030
+ "128": {
1031
+ "content": "<mask_104>",
1032
+ "lstrip": false,
1033
+ "normalized": false,
1034
+ "rstrip": false,
1035
+ "single_word": false,
1036
+ "special": true
1037
+ },
1038
+ "129": {
1039
+ "content": "<mask_105>",
1040
+ "lstrip": false,
1041
+ "normalized": false,
1042
+ "rstrip": false,
1043
+ "single_word": false,
1044
+ "special": true
1045
+ },
1046
+ "130": {
1047
+ "content": "<mask_106>",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false,
1052
+ "special": true
1053
+ },
1054
+ "131": {
1055
+ "content": "<mask_107>",
1056
+ "lstrip": false,
1057
+ "normalized": false,
1058
+ "rstrip": false,
1059
+ "single_word": false,
1060
+ "special": true
1061
+ },
1062
+ "132": {
1063
+ "content": "<mask_108>",
1064
+ "lstrip": false,
1065
+ "normalized": false,
1066
+ "rstrip": false,
1067
+ "single_word": false,
1068
+ "special": true
1069
+ },
1070
+ "133": {
1071
+ "content": "<mask_109>",
1072
+ "lstrip": false,
1073
+ "normalized": false,
1074
+ "rstrip": false,
1075
+ "single_word": false,
1076
+ "special": true
1077
+ },
1078
+ "134": {
1079
+ "content": "<mask_110>",
1080
+ "lstrip": false,
1081
+ "normalized": false,
1082
+ "rstrip": false,
1083
+ "single_word": false,
1084
+ "special": true
1085
+ },
1086
+ "135": {
1087
+ "content": "<mask_111>",
1088
+ "lstrip": false,
1089
+ "normalized": false,
1090
+ "rstrip": false,
1091
+ "single_word": false,
1092
+ "special": true
1093
+ },
1094
+ "136": {
1095
+ "content": "<mask_112>",
1096
+ "lstrip": false,
1097
+ "normalized": false,
1098
+ "rstrip": false,
1099
+ "single_word": false,
1100
+ "special": true
1101
+ },
1102
+ "137": {
1103
+ "content": "<mask_113>",
1104
+ "lstrip": false,
1105
+ "normalized": false,
1106
+ "rstrip": false,
1107
+ "single_word": false,
1108
+ "special": true
1109
+ },
1110
+ "138": {
1111
+ "content": "<mask_114>",
1112
+ "lstrip": false,
1113
+ "normalized": false,
1114
+ "rstrip": false,
1115
+ "single_word": false,
1116
+ "special": true
1117
+ },
1118
+ "139": {
1119
+ "content": "<mask_115>",
1120
+ "lstrip": false,
1121
+ "normalized": false,
1122
+ "rstrip": false,
1123
+ "single_word": false,
1124
+ "special": true
1125
+ },
1126
+ "140": {
1127
+ "content": "<mask_116>",
1128
+ "lstrip": false,
1129
+ "normalized": false,
1130
+ "rstrip": false,
1131
+ "single_word": false,
1132
+ "special": true
1133
+ },
1134
+ "141": {
1135
+ "content": "<mask_117>",
1136
+ "lstrip": false,
1137
+ "normalized": false,
1138
+ "rstrip": false,
1139
+ "single_word": false,
1140
+ "special": true
1141
+ },
1142
+ "142": {
1143
+ "content": "<mask_118>",
1144
+ "lstrip": false,
1145
+ "normalized": false,
1146
+ "rstrip": false,
1147
+ "single_word": false,
1148
+ "special": true
1149
+ },
1150
+ "143": {
1151
+ "content": "<mask_119>",
1152
+ "lstrip": false,
1153
+ "normalized": false,
1154
+ "rstrip": false,
1155
+ "single_word": false,
1156
+ "special": true
1157
+ },
1158
+ "144": {
1159
+ "content": "<mask_120>",
1160
+ "lstrip": false,
1161
+ "normalized": false,
1162
+ "rstrip": false,
1163
+ "single_word": false,
1164
+ "special": true
1165
+ },
1166
+ "145": {
1167
+ "content": "<mask_121>",
1168
+ "lstrip": false,
1169
+ "normalized": false,
1170
+ "rstrip": false,
1171
+ "single_word": false,
1172
+ "special": true
1173
+ },
1174
+ "146": {
1175
+ "content": "<mask_122>",
1176
+ "lstrip": false,
1177
+ "normalized": false,
1178
+ "rstrip": false,
1179
+ "single_word": false,
1180
+ "special": true
1181
+ },
1182
+ "147": {
1183
+ "content": "<mask_123>",
1184
+ "lstrip": false,
1185
+ "normalized": false,
1186
+ "rstrip": false,
1187
+ "single_word": false,
1188
+ "special": true
1189
+ },
1190
+ "148": {
1191
+ "content": "<mask_124>",
1192
+ "lstrip": false,
1193
+ "normalized": false,
1194
+ "rstrip": false,
1195
+ "single_word": false,
1196
+ "special": true
1197
+ },
1198
+ "149": {
1199
+ "content": "<mask_125>",
1200
+ "lstrip": false,
1201
+ "normalized": false,
1202
+ "rstrip": false,
1203
+ "single_word": false,
1204
+ "special": true
1205
+ },
1206
+ "150": {
1207
+ "content": "<mask_126>",
1208
+ "lstrip": false,
1209
+ "normalized": false,
1210
+ "rstrip": false,
1211
+ "single_word": false,
1212
+ "special": true
1213
+ },
1214
+ "151": {
1215
+ "content": "<mask_127>",
1216
+ "lstrip": false,
1217
+ "normalized": false,
1218
+ "rstrip": false,
1219
+ "single_word": false,
1220
+ "special": true
1221
+ },
1222
+ "152": {
1223
+ "content": "<mask_128>",
1224
+ "lstrip": false,
1225
+ "normalized": false,
1226
+ "rstrip": false,
1227
+ "single_word": false,
1228
+ "special": true
1229
+ },
1230
+ "153": {
1231
+ "content": "<mask_129>",
1232
+ "lstrip": false,
1233
+ "normalized": false,
1234
+ "rstrip": false,
1235
+ "single_word": false,
1236
+ "special": true
1237
+ },
1238
+ "154": {
1239
+ "content": "<mask_130>",
1240
+ "lstrip": false,
1241
+ "normalized": false,
1242
+ "rstrip": false,
1243
+ "single_word": false,
1244
+ "special": true
1245
+ },
1246
+ "155": {
1247
+ "content": "<mask_131>",
1248
+ "lstrip": false,
1249
+ "normalized": false,
1250
+ "rstrip": false,
1251
+ "single_word": false,
1252
+ "special": true
1253
+ },
1254
+ "156": {
1255
+ "content": "<mask_132>",
1256
+ "lstrip": false,
1257
+ "normalized": false,
1258
+ "rstrip": false,
1259
+ "single_word": false,
1260
+ "special": true
1261
+ },
1262
+ "157": {
1263
+ "content": "<mask_133>",
1264
+ "lstrip": false,
1265
+ "normalized": false,
1266
+ "rstrip": false,
1267
+ "single_word": false,
1268
+ "special": true
1269
+ },
1270
+ "158": {
1271
+ "content": "<mask_134>",
1272
+ "lstrip": false,
1273
+ "normalized": false,
1274
+ "rstrip": false,
1275
+ "single_word": false,
1276
+ "special": true
1277
+ },
1278
+ "159": {
1279
+ "content": "<mask_135>",
1280
+ "lstrip": false,
1281
+ "normalized": false,
1282
+ "rstrip": false,
1283
+ "single_word": false,
1284
+ "special": true
1285
+ },
1286
+ "160": {
1287
+ "content": "<mask_136>",
1288
+ "lstrip": false,
1289
+ "normalized": false,
1290
+ "rstrip": false,
1291
+ "single_word": false,
1292
+ "special": true
1293
+ },
1294
+ "161": {
1295
+ "content": "<mask_137>",
1296
+ "lstrip": false,
1297
+ "normalized": false,
1298
+ "rstrip": false,
1299
+ "single_word": false,
1300
+ "special": true
1301
+ },
1302
+ "162": {
1303
+ "content": "<mask_138>",
1304
+ "lstrip": false,
1305
+ "normalized": false,
1306
+ "rstrip": false,
1307
+ "single_word": false,
1308
+ "special": true
1309
+ },
1310
+ "163": {
1311
+ "content": "<mask_139>",
1312
+ "lstrip": false,
1313
+ "normalized": false,
1314
+ "rstrip": false,
1315
+ "single_word": false,
1316
+ "special": true
1317
+ },
1318
+ "164": {
1319
+ "content": "<mask_140>",
1320
+ "lstrip": false,
1321
+ "normalized": false,
1322
+ "rstrip": false,
1323
+ "single_word": false,
1324
+ "special": true
1325
+ },
1326
+ "165": {
1327
+ "content": "<mask_141>",
1328
+ "lstrip": false,
1329
+ "normalized": false,
1330
+ "rstrip": false,
1331
+ "single_word": false,
1332
+ "special": true
1333
+ },
1334
+ "166": {
1335
+ "content": "<mask_142>",
1336
+ "lstrip": false,
1337
+ "normalized": false,
1338
+ "rstrip": false,
1339
+ "single_word": false,
1340
+ "special": true
1341
+ },
1342
+ "167": {
1343
+ "content": "<mask_143>",
1344
+ "lstrip": false,
1345
+ "normalized": false,
1346
+ "rstrip": false,
1347
+ "single_word": false,
1348
+ "special": true
1349
+ },
1350
+ "168": {
1351
+ "content": "<mask_144>",
1352
+ "lstrip": false,
1353
+ "normalized": false,
1354
+ "rstrip": false,
1355
+ "single_word": false,
1356
+ "special": true
1357
+ },
1358
+ "169": {
1359
+ "content": "<mask_145>",
1360
+ "lstrip": false,
1361
+ "normalized": false,
1362
+ "rstrip": false,
1363
+ "single_word": false,
1364
+ "special": true
1365
+ },
1366
+ "170": {
1367
+ "content": "<mask_146>",
1368
+ "lstrip": false,
1369
+ "normalized": false,
1370
+ "rstrip": false,
1371
+ "single_word": false,
1372
+ "special": true
1373
+ },
1374
+ "171": {
1375
+ "content": "<mask_147>",
1376
+ "lstrip": false,
1377
+ "normalized": false,
1378
+ "rstrip": false,
1379
+ "single_word": false,
1380
+ "special": true
1381
+ },
1382
+ "172": {
1383
+ "content": "<mask_148>",
1384
+ "lstrip": false,
1385
+ "normalized": false,
1386
+ "rstrip": false,
1387
+ "single_word": false,
1388
+ "special": true
1389
+ },
1390
+ "173": {
1391
+ "content": "<mask_149>",
1392
+ "lstrip": false,
1393
+ "normalized": false,
1394
+ "rstrip": false,
1395
+ "single_word": false,
1396
+ "special": true
1397
+ },
1398
+ "174": {
1399
+ "content": "<mask_150>",
1400
+ "lstrip": false,
1401
+ "normalized": false,
1402
+ "rstrip": false,
1403
+ "single_word": false,
1404
+ "special": true
1405
+ },
1406
+ "175": {
1407
+ "content": "<mask_151>",
1408
+ "lstrip": false,
1409
+ "normalized": false,
1410
+ "rstrip": false,
1411
+ "single_word": false,
1412
+ "special": true
1413
+ },
1414
+ "176": {
1415
+ "content": "<mask_152>",
1416
+ "lstrip": false,
1417
+ "normalized": false,
1418
+ "rstrip": false,
1419
+ "single_word": false,
1420
+ "special": true
1421
+ },
1422
+ "177": {
1423
+ "content": "<mask_153>",
1424
+ "lstrip": false,
1425
+ "normalized": false,
1426
+ "rstrip": false,
1427
+ "single_word": false,
1428
+ "special": true
1429
+ },
1430
+ "178": {
1431
+ "content": "<mask_154>",
1432
+ "lstrip": false,
1433
+ "normalized": false,
1434
+ "rstrip": false,
1435
+ "single_word": false,
1436
+ "special": true
1437
+ },
1438
+ "179": {
1439
+ "content": "<mask_155>",
1440
+ "lstrip": false,
1441
+ "normalized": false,
1442
+ "rstrip": false,
1443
+ "single_word": false,
1444
+ "special": true
1445
+ },
1446
+ "180": {
1447
+ "content": "<mask_156>",
1448
+ "lstrip": false,
1449
+ "normalized": false,
1450
+ "rstrip": false,
1451
+ "single_word": false,
1452
+ "special": true
1453
+ },
1454
+ "181": {
1455
+ "content": "<mask_157>",
1456
+ "lstrip": false,
1457
+ "normalized": false,
1458
+ "rstrip": false,
1459
+ "single_word": false,
1460
+ "special": true
1461
+ },
1462
+ "182": {
1463
+ "content": "<mask_158>",
1464
+ "lstrip": false,
1465
+ "normalized": false,
1466
+ "rstrip": false,
1467
+ "single_word": false,
1468
+ "special": true
1469
+ },
1470
+ "183": {
1471
+ "content": "<mask_159>",
1472
+ "lstrip": false,
1473
+ "normalized": false,
1474
+ "rstrip": false,
1475
+ "single_word": false,
1476
+ "special": true
1477
+ },
1478
+ "184": {
1479
+ "content": "<mask_160>",
1480
+ "lstrip": false,
1481
+ "normalized": false,
1482
+ "rstrip": false,
1483
+ "single_word": false,
1484
+ "special": true
1485
+ },
1486
+ "185": {
1487
+ "content": "<mask_161>",
1488
+ "lstrip": false,
1489
+ "normalized": false,
1490
+ "rstrip": false,
1491
+ "single_word": false,
1492
+ "special": true
1493
+ },
1494
+ "186": {
1495
+ "content": "<mask_162>",
1496
+ "lstrip": false,
1497
+ "normalized": false,
1498
+ "rstrip": false,
1499
+ "single_word": false,
1500
+ "special": true
1501
+ },
1502
+ "187": {
1503
+ "content": "<mask_163>",
1504
+ "lstrip": false,
1505
+ "normalized": false,
1506
+ "rstrip": false,
1507
+ "single_word": false,
1508
+ "special": true
1509
+ },
1510
+ "188": {
1511
+ "content": "<mask_164>",
1512
+ "lstrip": false,
1513
+ "normalized": false,
1514
+ "rstrip": false,
1515
+ "single_word": false,
1516
+ "special": true
1517
+ },
1518
+ "189": {
1519
+ "content": "<mask_165>",
1520
+ "lstrip": false,
1521
+ "normalized": false,
1522
+ "rstrip": false,
1523
+ "single_word": false,
1524
+ "special": true
1525
+ },
1526
+ "190": {
1527
+ "content": "<mask_166>",
1528
+ "lstrip": false,
1529
+ "normalized": false,
1530
+ "rstrip": false,
1531
+ "single_word": false,
1532
+ "special": true
1533
+ },
1534
+ "191": {
1535
+ "content": "<mask_167>",
1536
+ "lstrip": false,
1537
+ "normalized": false,
1538
+ "rstrip": false,
1539
+ "single_word": false,
1540
+ "special": true
1541
+ },
1542
+ "192": {
1543
+ "content": "<mask_168>",
1544
+ "lstrip": false,
1545
+ "normalized": false,
1546
+ "rstrip": false,
1547
+ "single_word": false,
1548
+ "special": true
1549
+ },
1550
+ "193": {
1551
+ "content": "<mask_169>",
1552
+ "lstrip": false,
1553
+ "normalized": false,
1554
+ "rstrip": false,
1555
+ "single_word": false,
1556
+ "special": true
1557
+ },
1558
+ "194": {
1559
+ "content": "<mask_170>",
1560
+ "lstrip": false,
1561
+ "normalized": false,
1562
+ "rstrip": false,
1563
+ "single_word": false,
1564
+ "special": true
1565
+ },
1566
+ "195": {
1567
+ "content": "<mask_171>",
1568
+ "lstrip": false,
1569
+ "normalized": false,
1570
+ "rstrip": false,
1571
+ "single_word": false,
1572
+ "special": true
1573
+ },
1574
+ "196": {
1575
+ "content": "<mask_172>",
1576
+ "lstrip": false,
1577
+ "normalized": false,
1578
+ "rstrip": false,
1579
+ "single_word": false,
1580
+ "special": true
1581
+ },
1582
+ "197": {
1583
+ "content": "<mask_173>",
1584
+ "lstrip": false,
1585
+ "normalized": false,
1586
+ "rstrip": false,
1587
+ "single_word": false,
1588
+ "special": true
1589
+ },
1590
+ "198": {
1591
+ "content": "<mask_174>",
1592
+ "lstrip": false,
1593
+ "normalized": false,
1594
+ "rstrip": false,
1595
+ "single_word": false,
1596
+ "special": true
1597
+ },
1598
+ "199": {
1599
+ "content": "<mask_175>",
1600
+ "lstrip": false,
1601
+ "normalized": false,
1602
+ "rstrip": false,
1603
+ "single_word": false,
1604
+ "special": true
1605
+ },
1606
+ "200": {
1607
+ "content": "<mask_176>",
1608
+ "lstrip": false,
1609
+ "normalized": false,
1610
+ "rstrip": false,
1611
+ "single_word": false,
1612
+ "special": true
1613
+ },
1614
+ "201": {
1615
+ "content": "<mask_177>",
1616
+ "lstrip": false,
1617
+ "normalized": false,
1618
+ "rstrip": false,
1619
+ "single_word": false,
1620
+ "special": true
1621
+ },
1622
+ "202": {
1623
+ "content": "<mask_178>",
1624
+ "lstrip": false,
1625
+ "normalized": false,
1626
+ "rstrip": false,
1627
+ "single_word": false,
1628
+ "special": true
1629
+ },
1630
+ "203": {
1631
+ "content": "<mask_179>",
1632
+ "lstrip": false,
1633
+ "normalized": false,
1634
+ "rstrip": false,
1635
+ "single_word": false,
1636
+ "special": true
1637
+ },
1638
+ "204": {
1639
+ "content": "<mask_180>",
1640
+ "lstrip": false,
1641
+ "normalized": false,
1642
+ "rstrip": false,
1643
+ "single_word": false,
1644
+ "special": true
1645
+ },
1646
+ "205": {
1647
+ "content": "<mask_181>",
1648
+ "lstrip": false,
1649
+ "normalized": false,
1650
+ "rstrip": false,
1651
+ "single_word": false,
1652
+ "special": true
1653
+ },
1654
+ "206": {
1655
+ "content": "<mask_182>",
1656
+ "lstrip": false,
1657
+ "normalized": false,
1658
+ "rstrip": false,
1659
+ "single_word": false,
1660
+ "special": true
1661
+ },
1662
+ "207": {
1663
+ "content": "<mask_183>",
1664
+ "lstrip": false,
1665
+ "normalized": false,
1666
+ "rstrip": false,
1667
+ "single_word": false,
1668
+ "special": true
1669
+ },
1670
+ "208": {
1671
+ "content": "<mask_184>",
1672
+ "lstrip": false,
1673
+ "normalized": false,
1674
+ "rstrip": false,
1675
+ "single_word": false,
1676
+ "special": true
1677
+ },
1678
+ "209": {
1679
+ "content": "<mask_185>",
1680
+ "lstrip": false,
1681
+ "normalized": false,
1682
+ "rstrip": false,
1683
+ "single_word": false,
1684
+ "special": true
1685
+ },
1686
+ "210": {
1687
+ "content": "<mask_186>",
1688
+ "lstrip": false,
1689
+ "normalized": false,
1690
+ "rstrip": false,
1691
+ "single_word": false,
1692
+ "special": true
1693
+ },
1694
+ "211": {
1695
+ "content": "<mask_187>",
1696
+ "lstrip": false,
1697
+ "normalized": false,
1698
+ "rstrip": false,
1699
+ "single_word": false,
1700
+ "special": true
1701
+ },
1702
+ "212": {
1703
+ "content": "<mask_188>",
1704
+ "lstrip": false,
1705
+ "normalized": false,
1706
+ "rstrip": false,
1707
+ "single_word": false,
1708
+ "special": true
1709
+ },
1710
+ "213": {
1711
+ "content": "<mask_189>",
1712
+ "lstrip": false,
1713
+ "normalized": false,
1714
+ "rstrip": false,
1715
+ "single_word": false,
1716
+ "special": true
1717
+ },
1718
+ "214": {
1719
+ "content": "<mask_190>",
1720
+ "lstrip": false,
1721
+ "normalized": false,
1722
+ "rstrip": false,
1723
+ "single_word": false,
1724
+ "special": true
1725
+ },
1726
+ "215": {
1727
+ "content": "<mask_191>",
1728
+ "lstrip": false,
1729
+ "normalized": false,
1730
+ "rstrip": false,
1731
+ "single_word": false,
1732
+ "special": true
1733
+ },
1734
+ "216": {
1735
+ "content": "<mask_192>",
1736
+ "lstrip": false,
1737
+ "normalized": false,
1738
+ "rstrip": false,
1739
+ "single_word": false,
1740
+ "special": true
1741
+ },
1742
+ "217": {
1743
+ "content": "<mask_193>",
1744
+ "lstrip": false,
1745
+ "normalized": false,
1746
+ "rstrip": false,
1747
+ "single_word": false,
1748
+ "special": true
1749
+ },
1750
+ "218": {
1751
+ "content": "<mask_194>",
1752
+ "lstrip": false,
1753
+ "normalized": false,
1754
+ "rstrip": false,
1755
+ "single_word": false,
1756
+ "special": true
1757
+ },
1758
+ "219": {
1759
+ "content": "<mask_195>",
1760
+ "lstrip": false,
1761
+ "normalized": false,
1762
+ "rstrip": false,
1763
+ "single_word": false,
1764
+ "special": true
1765
+ },
1766
+ "220": {
1767
+ "content": "<mask_196>",
1768
+ "lstrip": false,
1769
+ "normalized": false,
1770
+ "rstrip": false,
1771
+ "single_word": false,
1772
+ "special": true
1773
+ },
1774
+ "221": {
1775
+ "content": "<mask_197>",
1776
+ "lstrip": false,
1777
+ "normalized": false,
1778
+ "rstrip": false,
1779
+ "single_word": false,
1780
+ "special": true
1781
+ },
1782
+ "222": {
1783
+ "content": "<mask_198>",
1784
+ "lstrip": false,
1785
+ "normalized": false,
1786
+ "rstrip": false,
1787
+ "single_word": false,
1788
+ "special": true
1789
+ },
1790
+ "223": {
1791
+ "content": "<mask_199>",
1792
+ "lstrip": false,
1793
+ "normalized": false,
1794
+ "rstrip": false,
1795
+ "single_word": false,
1796
+ "special": true
1797
+ },
1798
+ "131072": {
1799
+ "content": "<mask_131048>",
1800
+ "lstrip": false,
1801
+ "normalized": false,
1802
+ "rstrip": false,
1803
+ "single_word": false,
1804
+ "special": true
1805
+ },
1806
+ "131073": {
1807
+ "content": "<mask_131049>",
1808
+ "lstrip": false,
1809
+ "normalized": false,
1810
+ "rstrip": false,
1811
+ "single_word": false,
1812
+ "special": true
1813
+ },
1814
+ "131074": {
1815
+ "content": "<mask_131050>",
1816
+ "lstrip": false,
1817
+ "normalized": false,
1818
+ "rstrip": false,
1819
+ "single_word": false,
1820
+ "special": true
1821
+ },
1822
+ "131075": {
1823
+ "content": "<mask_131051>",
1824
+ "lstrip": false,
1825
+ "normalized": false,
1826
+ "rstrip": false,
1827
+ "single_word": false,
1828
+ "special": true
1829
+ },
1830
+ "131076": {
1831
+ "content": "<mask_131052>",
1832
+ "lstrip": false,
1833
+ "normalized": false,
1834
+ "rstrip": false,
1835
+ "single_word": false,
1836
+ "special": true
1837
+ },
1838
+ "131077": {
1839
+ "content": "<mask_131053>",
1840
+ "lstrip": false,
1841
+ "normalized": false,
1842
+ "rstrip": false,
1843
+ "single_word": false,
1844
+ "special": true
1845
+ },
1846
+ "131078": {
1847
+ "content": "<mask_131054>",
1848
+ "lstrip": false,
1849
+ "normalized": false,
1850
+ "rstrip": false,
1851
+ "single_word": false,
1852
+ "special": true
1853
+ },
1854
+ "131079": {
1855
+ "content": "<mask_131055>",
1856
+ "lstrip": false,
1857
+ "normalized": false,
1858
+ "rstrip": false,
1859
+ "single_word": false,
1860
+ "special": true
1861
+ },
1862
+ "131080": {
1863
+ "content": "<mask_131056>",
1864
+ "lstrip": false,
1865
+ "normalized": false,
1866
+ "rstrip": false,
1867
+ "single_word": false,
1868
+ "special": true
1869
+ },
1870
+ "131081": {
1871
+ "content": "<mask_131057>",
1872
+ "lstrip": false,
1873
+ "normalized": false,
1874
+ "rstrip": false,
1875
+ "single_word": false,
1876
+ "special": true
1877
+ },
1878
+ "131082": {
1879
+ "content": "<mask_131058>",
1880
+ "lstrip": false,
1881
+ "normalized": false,
1882
+ "rstrip": false,
1883
+ "single_word": false,
1884
+ "special": true
1885
+ },
1886
+ "131083": {
1887
+ "content": "<mask_131059>",
1888
+ "lstrip": false,
1889
+ "normalized": false,
1890
+ "rstrip": false,
1891
+ "single_word": false,
1892
+ "special": true
1893
+ },
1894
+ "131084": {
1895
+ "content": "<mask_131060>",
1896
+ "lstrip": false,
1897
+ "normalized": false,
1898
+ "rstrip": false,
1899
+ "single_word": false,
1900
+ "special": true
1901
+ },
1902
+ "131085": {
1903
+ "content": "<mask_131061>",
1904
+ "lstrip": false,
1905
+ "normalized": false,
1906
+ "rstrip": false,
1907
+ "single_word": false,
1908
+ "special": true
1909
+ },
1910
+ "131086": {
1911
+ "content": "<mask_131062>",
1912
+ "lstrip": false,
1913
+ "normalized": false,
1914
+ "rstrip": false,
1915
+ "single_word": false,
1916
+ "special": true
1917
+ },
1918
+ "131087": {
1919
+ "content": "<mask_131063>",
1920
+ "lstrip": false,
1921
+ "normalized": false,
1922
+ "rstrip": false,
1923
+ "single_word": false,
1924
+ "special": true
1925
+ },
1926
+ "131088": {
1927
+ "content": "<mask_131064>",
1928
+ "lstrip": false,
1929
+ "normalized": false,
1930
+ "rstrip": false,
1931
+ "single_word": false,
1932
+ "special": true
1933
+ },
1934
+ "131089": {
1935
+ "content": "<mask_131065>",
1936
+ "lstrip": false,
1937
+ "normalized": false,
1938
+ "rstrip": false,
1939
+ "single_word": false,
1940
+ "special": true
1941
+ },
1942
+ "131090": {
1943
+ "content": "<longcat_img_token_size>",
1944
+ "lstrip": false,
1945
+ "normalized": false,
1946
+ "rstrip": false,
1947
+ "single_word": false,
1948
+ "special": true
1949
+ },
1950
+ "131091": {
1951
+ "content": "</longcat_img_token_size>",
1952
+ "lstrip": false,
1953
+ "normalized": false,
1954
+ "rstrip": false,
1955
+ "single_word": false,
1956
+ "special": true
1957
+ },
1958
+ "131092": {
1959
+ "content": "<mask_131068>",
1960
+ "lstrip": false,
1961
+ "normalized": false,
1962
+ "rstrip": false,
1963
+ "single_word": false,
1964
+ "special": true
1965
+ },
1966
+ "131093": {
1967
+ "content": "<mask_131069>",
1968
+ "lstrip": false,
1969
+ "normalized": false,
1970
+ "rstrip": false,
1971
+ "single_word": false,
1972
+ "special": true
1973
+ },
1974
+ "131094": {
1975
+ "content": "<mask_131070>",
1976
+ "lstrip": false,
1977
+ "normalized": false,
1978
+ "rstrip": false,
1979
+ "single_word": false,
1980
+ "special": true
1981
+ },
1982
+ "131095": {
1983
+ "content": "<mask_131071>",
1984
+ "lstrip": false,
1985
+ "normalized": false,
1986
+ "rstrip": false,
1987
+ "single_word": false,
1988
+ "special": true
1989
+ },
1990
+ "131096": {
1991
+ "content": "<longcat_point_start>",
1992
+ "lstrip": false,
1993
+ "normalized": false,
1994
+ "rstrip": false,
1995
+ "single_word": false,
1996
+ "special": true
1997
+ },
1998
+ "131097": {
1999
+ "content": "<longcat_point_end>",
2000
+ "lstrip": false,
2001
+ "normalized": false,
2002
+ "rstrip": false,
2003
+ "single_word": false,
2004
+ "special": true
2005
+ },
2006
+ "131098": {
2007
+ "content": "<longcat_point_delim>",
2008
+ "lstrip": false,
2009
+ "normalized": false,
2010
+ "rstrip": false,
2011
+ "single_word": false,
2012
+ "special": true
2013
+ },
2014
+ "131099": {
2015
+ "content": "<longcat_polygon_start>",
2016
+ "lstrip": false,
2017
+ "normalized": false,
2018
+ "rstrip": false,
2019
+ "single_word": false,
2020
+ "special": true
2021
+ },
2022
+ "131100": {
2023
+ "content": "<longcat_polygon_end>",
2024
+ "lstrip": false,
2025
+ "normalized": false,
2026
+ "rstrip": false,
2027
+ "single_word": false,
2028
+ "special": true
2029
+ },
2030
+ "131101": {
2031
+ "content": "<mask_131077>",
2032
+ "lstrip": false,
2033
+ "normalized": false,
2034
+ "rstrip": false,
2035
+ "single_word": false,
2036
+ "special": true
2037
+ },
2038
+ "131102": {
2039
+ "content": "<mask_131078>",
2040
+ "lstrip": false,
2041
+ "normalized": false,
2042
+ "rstrip": false,
2043
+ "single_word": false,
2044
+ "special": true
2045
+ },
2046
+ "131103": {
2047
+ "content": "<longcat_audio_start>",
2048
+ "lstrip": false,
2049
+ "normalized": false,
2050
+ "rstrip": false,
2051
+ "single_word": false,
2052
+ "special": true
2053
+ },
2054
+ "131104": {
2055
+ "content": "<longcat_audio_end>",
2056
+ "lstrip": false,
2057
+ "normalized": false,
2058
+ "rstrip": false,
2059
+ "single_word": false,
2060
+ "special": true
2061
+ },
2062
+ "131105": {
2063
+ "content": "<longcat_audio_pad>",
2064
+ "lstrip": false,
2065
+ "normalized": false,
2066
+ "rstrip": false,
2067
+ "single_word": false,
2068
+ "special": true
2069
+ },
2070
+ "131106": {
2071
+ "content": "<longcat_img_start>",
2072
+ "lstrip": false,
2073
+ "normalized": false,
2074
+ "rstrip": false,
2075
+ "single_word": false,
2076
+ "special": true
2077
+ },
2078
+ "131107": {
2079
+ "content": "<longcat_img_end>",
2080
+ "lstrip": false,
2081
+ "normalized": false,
2082
+ "rstrip": false,
2083
+ "single_word": false,
2084
+ "special": true
2085
+ },
2086
+ "131108": {
2087
+ "content": "<longcat_img_pad>",
2088
+ "lstrip": false,
2089
+ "normalized": false,
2090
+ "rstrip": false,
2091
+ "single_word": false,
2092
+ "special": true
2093
+ },
2094
+ "131109": {
2095
+ "content": "<longcat_img_newline>",
2096
+ "lstrip": false,
2097
+ "normalized": false,
2098
+ "rstrip": false,
2099
+ "single_word": false,
2100
+ "special": true
2101
+ },
2102
+ "131110": {
2103
+ "content": "<longcat_box_start>",
2104
+ "lstrip": false,
2105
+ "normalized": false,
2106
+ "rstrip": false,
2107
+ "single_word": false,
2108
+ "special": true
2109
+ },
2110
+ "131111": {
2111
+ "content": "<longcat_box_end>",
2112
+ "lstrip": false,
2113
+ "normalized": false,
2114
+ "rstrip": false,
2115
+ "single_word": false,
2116
+ "special": true
2117
+ },
2118
+ "131112": {
2119
+ "content": "<longcat_box_delim>",
2120
+ "lstrip": false,
2121
+ "normalized": false,
2122
+ "rstrip": false,
2123
+ "single_word": false,
2124
+ "special": true
2125
+ },
2126
+ "131113": {
2127
+ "content": "<longcat_ref_start>",
2128
+ "lstrip": false,
2129
+ "normalized": false,
2130
+ "rstrip": false,
2131
+ "single_word": false,
2132
+ "special": true
2133
+ },
2134
+ "131114": {
2135
+ "content": "<longcat_ref_end>",
2136
+ "lstrip": false,
2137
+ "normalized": false,
2138
+ "rstrip": false,
2139
+ "single_word": false,
2140
+ "special": true
2141
+ },
2142
+ "131115": {
2143
+ "content": "<longcat_img_delim>",
2144
+ "lstrip": false,
2145
+ "normalized": false,
2146
+ "rstrip": false,
2147
+ "single_word": false,
2148
+ "special": true
2149
+ },
2150
+ "131116": {
2151
+ "content": "<longcat_audio_delim>",
2152
+ "lstrip": false,
2153
+ "normalized": false,
2154
+ "rstrip": false,
2155
+ "single_word": false,
2156
+ "special": true
2157
+ },
2158
+ "131117": {
2159
+ "content": "<longcat_video_palce>",
2160
+ "lstrip": false,
2161
+ "normalized": false,
2162
+ "rstrip": false,
2163
+ "single_word": false,
2164
+ "special": true
2165
+ },
2166
+ "131118": {
2167
+ "content": "<longcat_video_start>",
2168
+ "lstrip": false,
2169
+ "normalized": false,
2170
+ "rstrip": false,
2171
+ "single_word": false,
2172
+ "special": true
2173
+ },
2174
+ "131119": {
2175
+ "content": "<longcat_video_end>",
2176
+ "lstrip": false,
2177
+ "normalized": false,
2178
+ "rstrip": false,
2179
+ "single_word": false,
2180
+ "special": true
2181
+ },
2182
+ "131120": {
2183
+ "content": "<longcat_audiotext_start>",
2184
+ "lstrip": false,
2185
+ "normalized": false,
2186
+ "rstrip": false,
2187
+ "single_word": false,
2188
+ "special": true
2189
+ },
2190
+ "131121": {
2191
+ "content": "<longcat_audiotext_end>",
2192
+ "lstrip": false,
2193
+ "normalized": false,
2194
+ "rstrip": false,
2195
+ "single_word": false,
2196
+ "special": true
2197
+ },
2198
+ "131122": {
2199
+ "content": "<longcat_audiotext_pad>",
2200
+ "lstrip": false,
2201
+ "normalized": false,
2202
+ "rstrip": false,
2203
+ "single_word": false,
2204
+ "special": true
2205
+ },
2206
+ "131123": {
2207
+ "content": "<longcat_audiogen_start>",
2208
+ "lstrip": false,
2209
+ "normalized": false,
2210
+ "rstrip": false,
2211
+ "single_word": false,
2212
+ "special": true
2213
+ },
2214
+ "131124": {
2215
+ "content": "<longcat_audiogen_end>",
2216
+ "lstrip": false,
2217
+ "normalized": false,
2218
+ "rstrip": false,
2219
+ "single_word": false,
2220
+ "special": true
2221
+ }
2222
+ },
2223
+ "additional_special_tokens": [
2224
+ "<mask_131048>",
2225
+ "<mask_131049>",
2226
+ "<mask_131050>",
2227
+ "<mask_131051>",
2228
+ "<mask_131052>",
2229
+ "<mask_131053>",
2230
+ "<mask_131054>",
2231
+ "<mask_131055>",
2232
+ "<mask_131056>",
2233
+ "<mask_131057>",
2234
+ "<mask_131058>",
2235
+ "<mask_131059>",
2236
+ "<mask_131060>",
2237
+ "<mask_131061>",
2238
+ "<mask_131062>",
2239
+ "<mask_131063>",
2240
+ "<mask_131064>",
2241
+ "<mask_131065>",
2242
+ "<longcat_img_token_size>",
2243
+ "</longcat_img_token_size>",
2244
+ "<mask_131068>",
2245
+ "<mask_131069>",
2246
+ "<mask_131070>",
2247
+ "<mask_131071>",
2248
+ "<longcat_point_start>",
2249
+ "<longcat_point_end>",
2250
+ "<longcat_point_delim>",
2251
+ "<longcat_polygon_start>",
2252
+ "<longcat_polygon_end>",
2253
+ "<mask_131077>",
2254
+ "<mask_131078>",
2255
+ "<longcat_audio_start>",
2256
+ "<longcat_audio_end>",
2257
+ "<longcat_audio_pad>",
2258
+ "<longcat_img_start>",
2259
+ "<longcat_img_end>",
2260
+ "<longcat_img_pad>",
2261
+ "<longcat_img_newline>",
2262
+ "<longcat_box_start>",
2263
+ "<longcat_box_end>",
2264
+ "<longcat_box_delim>",
2265
+ "<longcat_ref_start>",
2266
+ "<longcat_ref_end>",
2267
+ "<longcat_img_delim>",
2268
+ "<longcat_audio_delim>",
2269
+ "<longcat_video_palce>",
2270
+ "<longcat_video_start>",
2271
+ "<longcat_video_end>",
2272
+ "<longcat_audiotext_start>",
2273
+ "<longcat_audiotext_end>",
2274
+ "<longcat_audiotext_pad>",
2275
+ "<longcat_audiogen_start>",
2276
+ "<longcat_audiogen_end>"
2277
+ ],
2278
+ "bos_token": "<longcat_s>",
2279
+ "chat_template": "{%- set tool_choice = tool_choice | default('auto') %}\n{%- set ns = namespace(tool_types = [], last_query_index = -1, suffix_to_move = '') %}\n\n{%- if tools and tool_choice != 'none' %}\n {{- \"<longcat_tool_declare>\\n\"-}}\n {{- \"# Tools\\n\" }}\n {{- \"You have access to the following tools:\\n\\n\" }}\n {%- for tool in tools %}\n {%- if tool.type not in ns.tool_types %}\n {%- set ns.tool_types = ns.tool_types + [tool.type] %}\n {{- \"## Tool namespace: \" ~ tool.type ~ \"\\n\\n\" }}\n {%- endif %}\n {%- if tool.type == 'code_interpreter' %}\n {%- set tool = {\"type\":\"code_interpreter\",\"function\":{\"name\":\"code_interpreter_preview\",\"description\":\"The code will be executed in a stateful Jupyter notebook sandbox environment, only supports local computation, data processing, and file operations.\\nCode sandbox environment (network isolated) Any external network requests or online API calls are prohibited.\\nIf online functionality is needed, please use other permitted tools.\\nCode will respond with the output of the execution or time out after 60.0 seconds. \",\"parameters\":{\"type\":\"object\",\"properties\":{\"language\":{\"type\":\"string\",\"description\":\"The programming language of the code to be executed. Available values: python (Default), java, go, js, ts, c, c++.\"},\"code\":{\"type\":\"string\",\"description\":\"Python code to be executed must not include the following:\\n- Importing network libraries such as requests, httplib, etc.\\n- Any form of HTTP requests.\\n- External API calls.\\n- Network port operations. Example: ```python\\nimport pandas as pd\\npd.DataFrame({'A':[1,2]})\\n```\"},\"timeout\":{\"type\":\"number\",\"description\":\"The maximum execution time of the code, in seconds. Default is 60.0.\"}}},\"required\":[\"code\"]}} %}\n {%- endif %}\n {{- \"### Tool name: \" + tool.function.name + \"\\n\" }}\n {{- \"Description: \" + tool.function.description + \"\\n\\n\" }}\n {{- \"InputSchema: \" + tool.function.parameters | tojson(ensure_ascii=False) + \"\\n\\n\" }}\n {%- endfor %}\n {{- '**Note**: For each function call, output the function name and arguments within the following XML format:\\n<longcat_tool_call>{function-name}\\n<longcat_arg_key>{arg-key-1}</longcat_arg_key>\\n<longcat_arg_value>{arg-value-1}</longcat_arg_value>\\n<longcat_arg_key>{arg-key-2}</longcat_arg_key>\\n<longcat_arg_value>{arg-value-2}</longcat_arg_value>\\n...\\n</longcat_tool_call>\\n' }}\n {{- \"</longcat_tool_declare>\"-}}\n {%- for idx in range(messages|length - 1) %}\n {%- set msg = messages[idx] %}\n {%- if msg.role == 'assistant' and not msg.tool_calls %}\n {%- set ns.last_query_index = idx %}\n {%- endif %}\n {%- endfor%}\n{%- endif %}\n\n{%- for msg in messages %}\n {%- if msg.role == \"system\" %}\n {{- \"<longcat_system>\" + msg.content }}\n {%- elif msg.role == \"user\" %}\n {{- \"<longcat_user>\" }}\n {%- if msg[\"files\"] %}\n {{- '<longcat_files>\\n' ~ msg.files | tojson(indent=2) ~ '\\n</longcat_files>' }}\n {%- endif %}\n\n {%- if add_generation_prompt and loop.last and msg.content is string and msg.content.endswith(\"<longcat_img_start>\") %}\n {%- set ns.suffix_to_move = \"<longcat_img_start>\" %}\n {{- msg.content[:-19] }}\n {%- elif add_generation_prompt and loop.last and msg.content is string and msg.content.endswith(\"<longcat_audiogen_start>\") %}\n {%- set ns.suffix_to_move = \"<longcat_audiogen_start>\" %}\n {{- msg.content[:-24] }}\n {%- else %}\n {{- msg.content }}\n {%- endif %}\n\n {%- elif msg.role == \"assistant\" %}\n 
{{- \"<longcat_assistant>\" }}\n {%- if enable_thinking == true and msg.reasoning_content and ns.tool_types != [] and loop.index0 > ns.last_query_index %}\n {{- \"\\n<longcat_think>\\n\" ~ msg.reasoning_content ~ \"\\n</longcat_think>\\n\" }}\n {%- endif %}\n {%- if msg.content%}\n {{- msg.content }}\n {%- endif %}\n {%- if msg.tool_calls %}\n {%- for tool_call in msg.tool_calls -%}\n {{- \"<longcat_tool_call>\" ~ tool_call.function.name ~ \"\\n\" -}}\n {% set _args = tool_call.function.arguments %}\n {% for k, v in _args.items() %}\n {{- \"<longcat_arg_key>\" ~ k ~ \"</longcat_arg_key>\\n\" -}}\n {{- \"<longcat_arg_value>\" ~ (v if v is string else v | tojson(ensure_ascii=False)) ~ \"</longcat_arg_value>\\n\" -}}\n {% endfor %}\n {{- \"</longcat_tool_call>\\n\" }}\n {%- endfor %}\n {%- endif %}\n {{- \"</longcat_s>\" -}}\n {%- elif msg.role == \"tool\" %}\n {%- if messages[loop.index0 - 1].role != \"tool\"%}\n {{- \"<longcat_user>\" -}}\n {%- endif %}\n {{- \"<longcat_tool_response>\" ~ msg.content ~ \"</longcat_tool_response>\"-}}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {%- if enable_thinking == true %}\n {{- \" /think_on\" }}\n {%- if thinking_budget %}\n {%- if thinking_budget < 1024 %}\n {%- set thinking_budget = 1024 %}\n {%- endif%}\n {{- \"\\nthinking_budget: < \" ~ thinking_budget ~ \".\"}}\n {%- endif %}\n {{- \" <longcat_assistant><longcat_think>\\n\"}}\n {%- elif enable_thinking == false %}\n {{- \" /think_off <longcat_assistant><longcat_think>\\n\\n</longcat_think>\\n\" }}\n {%- else %}\n {{- \"<longcat_assistant>\" ~ ns.suffix_to_move }}\n {%- endif %}\n{%- endif %}",
2280
+ "clean_up_tokenization_spaces": false,
2281
+ "eos_token": "</longcat_s>",
2282
+ "model_max_length": 131072,
2283
+ "pad_token": "<longcat_pad>",
2284
+ "sp_model_kwargs": {},
2285
+ "tokenizer_class": "BloomTokenizer",
2286
+ "unk_token": "<longcat_unk>",
2287
+ "image_start_token": "<longcat_img_start>",
2288
+ "image_end_token": "<longcat_img_end>",
2289
+ "image_pad_token": "<longcat_img_pad>",
2290
+ "image_newline_token": "<longcat_img_newline>",
2291
+ "audio_start_token": "<longcat_audio_start>",
2292
+ "audio_end_token": "<longcat_audio_end>",
2293
+ "audio_pad_token": "<longcat_audio_pad>"
2294
+ }
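
The `tokenizer_config.json` added above registers LongCat's multimodal special tokens (image, audio, video, box/point/polygon markers), reserves the `<mask_*>` slots, and ships the Jinja chat template that wraps turns in `<longcat_system>`/`<longcat_user>`/`<longcat_assistant>` tags and serializes tool calls as `<longcat_tool_call>` blocks. Below is a minimal sketch of how such a config is typically consumed through Hugging Face `transformers`; the repository id, message contents, and printed output are illustrative assumptions, not part of this commit.

```python
# Minimal usage sketch (assumptions: `transformers` is installed and the repo id
# below points at this model; both are placeholders, not verified by this commit).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meituan-longcat/LongCat-Next",  # placeholder repo id, or a local checkout of this repo
    trust_remote_code=True,
)

# Special tokens declared in tokenizer_config.json resolve to fixed vocab ids.
print(tokenizer.convert_tokens_to_ids("<longcat_img_start>"))
print(tokenizer.convert_tokens_to_ids("<longcat_audio_pad>"))

# The chat_template field drives apply_chat_template; variables the template
# references (tools, enable_thinking, thinking_budget) can be passed as extra
# keyword arguments.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe this image."},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
# e.g. "<longcat_system>...<longcat_user>... /think_off <longcat_assistant>..."
print(prompt)
```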