hif1000 committed
Commit 06ea793 · verified · Parent(s): 5519a8a

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ __pycache__/modeling_superlinear_exp.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
+ __pycache__/modeling_superlinear_exp.cpython-313.pyc filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,63 @@
+ NVIDIA Open Model License Agreement
+
+ Last Modified: October 24, 2025
+
+ This NVIDIA Open Model License Agreement (the "Agreement") is a legal agreement between the Legal Entity You represent, or if no entity is identified, You and NVIDIA Corporation and its Affiliates ("NVIDIA") and governs Your use of the Models that NVIDIA provides to You under this Agreement. NVIDIA and You are each a "party" and collectively the "parties."
+
+ NVIDIA models released under this Agreement are intended to be used permissively and enable the further development of AI technologies. Subject to the terms of this Agreement, NVIDIA confirms that:
+
+ - Models are commercially usable.
+ - You are free to create and distribute Derivative Models.
+ - NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models.
+
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement.
+
+ 1. Definitions. The following definitions apply to this Agreement:
+
+ "Derivative Model" means all (a) modifications to the Model, (b) works based on the Model, and (c) any other derivative works of the Model. An output is not a Derivative Model.
+
+ "Legal Entity" means the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares, or (c) beneficial ownership of such entity.
+
+ "Model" means the machine learning model, software, checkpoints, learnt weights, algorithms, parameters, configuration files and documentation shared under this Agreement.
+
+ "NVIDIA Cosmos Model" means a multimodal Model shared under this Agreement.
+
+ "Special-Purpose Model" means a Model that is only competent in a narrow set of purpose-specific tasks and should not be used for unintended or general-purpose applications.
+
+ "You" or "Your" means an individual or Legal Entity exercising permissions granted by this Agreement.
+
+ 2. Conditions for Use, License Grant, AI Ethics and IP Ownership.
+
+ 2.1 Conditions for Use. The Model and any Derivative Model are subject to additional terms as described in Section 2 and Section 3 of this Agreement and govern Your use. If You institute copyright or patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model or a Derivative Model constitutes direct or contributory copyright or patent infringement, then any licenses granted to You under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. If You bypass, disable, reduce the efficacy of, or circumvent any technical limitation, safety guardrail or associated safety guardrail hyperparameter, encryption, security, digital rights management, or authentication mechanism (collectively "Guardrail") contained in the Model without a substantially similar Guardrail appropriate for your use case, your rights under this Agreement will automatically terminate. NVIDIA may indicate in relevant documentation that a Model is a Special-Purpose Model. NVIDIA may update this Agreement to comply with legal and regulatory requirements at any time and You agree to either comply with any updated license or cease Your copying, use, and distribution of the Model and any Derivative Model.
+
+ 2.2 License Grant. The rights granted herein are explicitly conditioned on Your full compliance with the terms of this Agreement. Subject to the terms and conditions of this Agreement, NVIDIA hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, revocable (as stated in Section 2.1) license to publicly perform, publicly display, reproduce, use, create derivative works of, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution) and import the Model.
+
+ 2.3 AI Ethics. Use of the Models under the Agreement must be consistent with NVIDIA's Trustworthy AI terms found at https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/.
+
+ 2.4 NVIDIA owns the Model and any Derivative Models created by NVIDIA. Subject to NVIDIA's underlying ownership rights in the Model or its Derivative Models, You are and will be the owner of Your Derivative Models. NVIDIA claims no ownership rights in outputs. You are responsible for outputs and their subsequent uses. Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Model and (b) no other license or right is granted to you by implication, estoppel or otherwise.
+
+ 3. Redistribution. You may reproduce and distribute copies of the Model or Derivative Models thereof in any medium, with or without modifications, provided that You meet the following conditions:
+
+ 3.1 If you distribute the Model, You must give any other recipients of the Model a copy of this Agreement and include the following attribution notice within a "Notice" text file with such copies: "Licensed by NVIDIA Corporation under the NVIDIA Open Model License";
+
+ 3.2 If you distribute or make available a NVIDIA Cosmos Model, or a product or service (including an AI model) that contains or uses a NVIDIA Cosmos Model, use a NVIDIA Cosmos Model to create a Derivative Model, or use a NVIDIA Cosmos Model or its outputs to create, train, fine tune, or otherwise improve an AI model, you will include "Built on NVIDIA Cosmos" on a related website, user interface, blogpost, about page, or product documentation; and
+
+ 3.3 You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Models as a whole, provided Your use, reproduction, and distribution of the Model otherwise complies with the conditions stated in this Agreement.
+
+ 4. Separate Components. The Models may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as an Open Source Software License or other third-party license. The components are subject to the applicable other licenses, including any proprietary notices, disclaimers, requirements and extended use rights; except that this Agreement will prevail regarding the use of third-party Open Source Software License, unless a third-party Open Source Software License requires its license terms to prevail. "Open Source Software License" means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (https://opensource.org), Free Software Foundation (https://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (https://www.spdx.org).
+
+ 5. Trademarks. This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NVIDIA, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the "Notice" text file.
+
+ 6. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, NVIDIA provides the Model on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for reviewing Model documentation, including any Special-Purpose Model limitations, and determining the appropriateness of using or redistributing the Model, Derivative Models and outputs. You assume any risks associated with Your exercise of permissions under this Agreement.
+
+ 7. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, will NVIDIA be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this Agreement or out of the use or inability to use the Model, Derivative Models or outputs (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if NVIDIA has been advised of the possibility of such damages.
+
+ 8. Indemnity. You will indemnify and hold harmless NVIDIA from and against any claim by any third party arising out of or related to your use or distribution of the Model, Derivative Models or outputs.
+
+ 9. Feedback. NVIDIA appreciates your feedback, and You agree that NVIDIA may use it without restriction or compensation to You.
+
+ 10. Governing Law. This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; except that, either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.
+
+ 11. Trade and Compliance. You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. These laws include restrictions on destinations, end-users and end-use.
+
+ Version Release Date: October 24, 2025
NOTICE ADDED
@@ -0,0 +1,9 @@
+ Licensed by NVIDIA Corporation under the NVIDIA Open Model License
+
+ This model (superlinear-exp-v0.1) is a Derivative Model based on NVIDIA Nemotron-3-Nano-30B-A3B-BF16.
+
+ Upstream model: https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+
+ Modifications by: Concavity AI (Yufeng Huang)
+ - Replaced standard attention layers with Superlinear attention layers
+ - Fine-tuned on long-context retrieval tasks
README.md CHANGED
@@ -1,6 +1,243 @@
- ---
- license: other
- license_name: nvidia-open-model-license
- license_link: >-
-   https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license
- ---
+ ---
+ license: other
+ license_name: nvidia-open-model-license
+ license_link: LICENSE
+ library_name: transformers
+ pipeline_tag: text-generation
+ tags:
+ - long-context
+ - superlinear-attention
+ - subquadratic
+ - causal-lm
+ base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+ ---
+
+ # Superlinear-Exp-v0.1
+
+ **Superlinear Multi-Step Attention**: a subquadratic attention mechanism that preserves random context access (structural non-exclusion) for extremely long sequences.
+
+ This is an experimental release demonstrating the Superlinear attention architecture integrated into a modified [nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) hybrid model.
+
+ > **WARNING (Security):** This model requires `trust_remote_code=True`, which executes Python code from this repository. Review the code before running in sensitive environments.
+
+ ## Model Description
+
+ Superlinear attention reformulates causal self-attention as a multi-step search problem:
+
+ 1. **Accumulation**: efficiently processes the sequence and produces per-position representatives (via Mamba-2 layers in the hybrid architecture).
+ 2. **Span Search**: scores a sublinear number of candidate spans using learned routing, then selects the top-k spans per query.
+ 3. **Span Attention**: computes standard token-level attention within the selected contiguous spans.
+ 4. **Combination**: produces outputs using softmax-weighted gating over the span attention outputs.
+
+ In the baseline N=2 implementation, both span search and span attention scale as **O(L^(3/2))**, enabling practical inference at multi-million-token context lengths where dense attention becomes prohibitive.
+
+ **Key property:** *random context access* (structural non-exclusion): any eligible token position can be selected by the content-dependent routing mechanism; no fixed sparsity pattern permanently excludes positions.
+
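The span-search and span-attention steps can be sketched in a few lines of NumPy. This is an illustrative toy, not the model's kernels: the real mechanism uses learned routing scores, whereas this sketch scores each candidate span by its first key, and the span size, anchor grid, and top-k value are placeholder numbers.

```python
import numpy as np

rng = np.random.default_rng(0)
L, d, top_k = 256, 32, 3
q = rng.standard_normal(d)        # a single query vector
k = rng.standard_normal((L, d))   # keys for the whole context
v = rng.standard_normal((L, d))   # values

# Span search: score ~sqrt(L) candidate anchors instead of all L positions.
span_len = int(np.sqrt(L))                 # 16 tokens per span
anchors = np.arange(0, L, span_len)        # one anchor per candidate span
scores = k[anchors] @ q                    # toy stand-in for learned routing scores
top_spans = anchors[np.argsort(scores)[-top_k:]]

# Span attention: dense softmax attention, but only inside the selected spans.
idx = np.concatenate([np.arange(s, s + span_len) for s in top_spans])
logits = k[idx] @ q / np.sqrt(d)
w = np.exp(logits - logits.max())
w /= w.sum()
out = w @ v[idx]                           # attends over 48 of 256 positions
```

Per query this touches O(sqrt(L)) anchors plus O(k·sqrt(L)) span tokens, so over L queries the total work is O(L^(3/2)); at L = 1M that is on the order of 10^9 score evaluations versus 10^12 for dense attention. Every position remains reachable because the top-k selection depends on content, not on a fixed sparsity pattern.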
+ ## Quickstart
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     "concavity-ai/superlinear-exp-v0.1",
+     trust_remote_code=True,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "concavity-ai/superlinear-exp-v0.1",
+     torch_dtype=torch.float16,
+     device_map="cuda",
+     trust_remote_code=True,
+ )
+
+ messages = [{"role": "user", "content": "Explain the Transformer architecture."}]
+ inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
+
+ output = model.generate(inputs, max_new_tokens=1000, do_sample=True, temperature=0.1, top_p=0.99)
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
+ ```
+
+ ## Dependencies
+
+ This model uses custom Python code (`trust_remote_code=True`) and CUDA extensions.
+
+ ### Recommended: follow the Superlinear repo install
+
+ The simplest supported path is to use the installation flow from the Superlinear repo (it pins a known-good CUDA toolchain and builds `mamba-ssm[causal-conv1d]` from source to avoid wheel/ABI mismatches):
+
+ https://github.com/concavity-ai/superlinear#installation
+
+ Copy/paste one-liner (from the Superlinear repo root):
+
+ ```bash
+ conda env create -f environment.yml \
+   && conda run -n superlinear pip install torch --index-url https://download.pytorch.org/whl/cu128 \
+   && conda run -n superlinear pip install -e ".[server,model]" \
+   && conda run -n superlinear bash -lc 'CUDA_HOME="$CONDA_PREFIX" pip install "mamba-ssm[causal-conv1d]" --no-build-isolation --no-cache-dir --no-binary :all:'
+ ```
+
+ ### Optional: pip-only (if `mamba-ssm` already works in your env)
+
+ If `python -c "import mamba_ssm, causal_conv1d"` already succeeds in the environment you'll run inference in, that environment already has a working PyTorch/CUDA pairing for the extension, so you should not need to reinstall PyTorch.
+
+ Install the remaining Python dependencies plus Superlinear:
+
+ ```bash
+ pip install -U "transformers<5" accelerate safetensors
+ pip install -U vllm triton
+
+ # Superlinear kernels (span-attention)
+ pip install -U git+https://github.com/concavity-ai/superlinear.git
+ ```
+
+ ### Building `mamba-ssm` from source (only if needed)
+
+ If you must build `mamba-ssm[causal-conv1d]` yourself, you need a CUDA toolkit with `nvcc` and `CUDA_HOME` pointing at it (example: `/usr/local/cuda`):
+
+ ```bash
+ CUDA_HOME=/usr/local/cuda \
+   pip install -U "mamba-ssm[causal-conv1d]" \
+   --no-build-isolation --no-cache-dir --no-binary :all:
+ ```
+
+ ## Recommended Inference Settings
+
+ For long-context inference with the Superlinear attention mechanism, use the following configuration:
+
+ ```python
+ model = AutoModelForCausalLM.from_pretrained(
+     "concavity-ai/superlinear-exp-v0.1",
+     # Attention implementation
+     _attn_implementation='block-span-gqa',
+     decode_kernel='staged-gqa',
+
+     # Performance optimizations
+     enable_cuda_graph=True,
+     enable_shared_fused_moe=True,
+
+     # Superlinear attention hyperparameters
+     span_attention_sw_index=65,         # Local window boundary index
+     span_attention_num_spans=3,         # Top-k spans per query
+     span_attention_backward_factor=3,   # Backward span extent multiplier
+     span_attention_forward_factor=1,    # Forward span extent multiplier
+     span_attention_search_power=0.55,   # Search exponent (controls anchor budget)
+     span_attention_span_power=0.55,     # Span exponent (controls span scale)
+
+     torch_dtype=torch.float16,
+     device_map="cuda",
+     trust_remote_code=True,
+ )
+ ```
+
+ ### Hyperparameter Notes
+
+ | Parameter | Description | Typical Value |
+ |-----------|-------------|---------------|
+ | `span_attention_num_spans` | Number of routed spans selected per query (top-k) | 2 or 3 |
+ | `span_attention_backward_factor` | Backward extent of each span relative to base scale | 2–4 |
+ | `span_attention_forward_factor` | Forward extent of each span relative to base scale | 0–2 |
+ | `span_attention_search_power` | Exponent controlling the number of candidate anchors | 0.5–0.667 |
+ | `span_attention_span_power` | Exponent controlling span length scaling | 0.5–0.667 |
+
+ **Sliding window length from `span_attention_sw_index`:** internally, the kernels compute the sliding-window length as:
+
+ ```text
+ window_len = floor((sw_index + 1) ** (1 / search_power)) - 1
+ ```
+
+ We parameterize the local sliding window using `sw_index` (a stride/stripe index) rather than specifying `window_len` directly. This keeps the sliding-window boundary aligned with the same index space used by span search, so span search begins immediately after the sliding-window region and no gap opens between local attention and the routed spans.
+
+ Example: with `search_power=0.55` and `sw_index=65`,
+
+ ```text
+ window_len = floor(66 ** (1 / 0.55)) - 1 = 2032
+ ```
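As a quick check, evaluating the same formula in Python reproduces the value above:

```python
import math

def window_len(sw_index: int, search_power: float) -> int:
    # window_len = floor((sw_index + 1) ** (1 / search_power)) - 1
    return math.floor((sw_index + 1) ** (1.0 / search_power)) - 1

print(window_len(65, 0.55))  # 2032
```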
+
+ ## Hardware Requirements
+
+ - **GPU:** NVIDIA GPU with sufficient VRAM (tested on a B200 with 180 GB)
+ - **KV Cache:** ~6 GB per million tokens (model-dependent)
+ - **Precision:** FP16 recommended
+
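A back-of-envelope sizing estimate, using the ~6 GB/M-token KV figure above plus FP16 weights for a ~30B-parameter model (both are rough approximations, not measured numbers):

```python
params_b = 30                # ~30B parameters
weights_gb = params_b * 2    # FP16: 2 bytes per parameter -> ~60 GB
kv_gb_per_m = 6              # KV cache: ~6 GB per million tokens
context_m = 10               # 10M-token context
total_gb = weights_gb + kv_gb_per_m * context_m
print(total_gb)  # 120 -> fits in a single 180 GB B200
```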
+ ### Measured Throughput (Single B200, Batch=1)
+
+ | Context Length | Prefill (tok/s) | Decode (tok/s) |
+ |----------------|-----------------|----------------|
+ | 1M tokens | ~20,202 | ~109 |
+ | 10M tokens | ~5,576 | ~76 |
+
+ *Your results may vary depending on hardware, batch size, and configuration.*
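At these rates, ingesting a full context takes minutes, not seconds. A rough estimate from the measured 10M-token prefill rate above (arithmetic only, on the table's numbers):

```python
tokens = 10_000_000
prefill_tok_s = 5576               # measured 10M-token prefill rate
minutes = tokens / prefill_tok_s / 60
print(round(minutes))  # ~30 minutes to prefill a 10M-token context
```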
+
+ ## Intended Use
+
+ This is an **architecture-and-systems feasibility study** release. It demonstrates that:
+
+ 1. The Superlinear attention mechanism is structurally random-context-access-preserving.
+ 2. The architecture achieves asymptotically subquadratic attention complexity.
+ 3. The resulting irregular span pattern can be implemented with practical performance at very long context lengths.
+
+ ### Limitations
+
+ - **Not a comprehensive quality study:** We do not present extensive ablations or claim state-of-the-art accuracy on benchmarks.
+ - **Limited evaluation:** Initial validation focused on the NIAH (Needle In A Haystack) retrieval task and throughput measurements.
+ - **Experimental:** This release is intended for research and experimentation, not production use.
+ - **Memory:** The full KV cache must be retained for random context access; memory usage scales with context length.
+
+ ## What's in This Repository
+
+ ```
+ ├── config.json                       # Model configuration
+ ├── generation_config.json            # Default generation settings
+ ├── tokenizer.json                    # Tokenizer
+ ├── tokenizer_config.json
+ ├── special_tokens_map.json
+ ├── chat_template.jinja               # Chat template
+ ├── configuration_superlinear_exp.py  # Custom config class
+ ├── modeling_superlinear_exp.py       # Custom model implementation
+ ├── moe.py                            # MoE components
+ ├── model-*.safetensors               # Model weights (16 shards)
+ ├── model.safetensors.index.json      # Weight index
+ ├── LICENSE                           # NVIDIA Open Model License
+ ├── NOTICE                            # Required attribution
+ └── README.md                         # This file
+ ```
+
+ ## License
+
+ ### Model Weights
+
+ This model is a derivative of [nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16) and is distributed under the **NVIDIA Open Model License Agreement**.
+
+ See [LICENSE](LICENSE) for the full license text.
+
+ Use of this model must be consistent with [NVIDIA's Trustworthy AI terms](https://www.nvidia.com/en-us/agreements/trustworthy-ai/terms/).
+
+ ### Code
+
+ The modeling code in this repository is provided for loading and running the model. For the broader Superlinear project codebase, see [github.com/concavity-ai/superlinear](https://github.com/concavity-ai/superlinear) (Apache-2.0).
+
+ ## Attribution
+
+ **Upstream Model:**
+ - NVIDIA Nemotron-3-Nano-30B-A3B ([nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16))
+
+ **Paper:**
+ ```bibtex
+ @article{huang2026superlinear,
+   title={Superlinear Multi-Step Attention},
+   author={Huang, Yufeng},
+   journal={arXiv preprint arXiv:2601.18401},
+   year={2026}
+ }
+ ```
+
+ ## Patent Notice
+
+ Patent applications have been filed related to aspects of the methods described in this work.
+
+ ## Contact
+
+ - Author: Yufeng Huang
+ - Email: yufeng@concavity.ai
+ - Organization: Concavity AI
__pycache__/configuration_superlinear_exp.cpython-312.pyc ADDED
Binary file (14.9 kB).
 
__pycache__/configuration_superlinear_exp.cpython-313.pyc ADDED
Binary file (14.9 kB).
 
__pycache__/modeling_superlinear_exp.cpython-312.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:468f989a95d55eb2d38d742756830863d7bffb25e39fb9c304a200b4d9c63b2d
+ size 164921
__pycache__/modeling_superlinear_exp.cpython-313.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dc47238f394aca2f6c7176e3fa059bc975b660170d0d7fc6e385e0d6f69750d
+ size 168645
__pycache__/moe.cpython-313.pyc ADDED
Binary file (35.1 kB).
 
chat_template.jinja ADDED
+ {% macro render_extra_keys(json_dict, handled_keys) %}
+ {%- if json_dict is mapping %}
+ {%- for json_key in json_dict if json_key not in handled_keys %}
+ {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
+ {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
+ {%- else %}
+ {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {% endmacro %}
+ {%- set enable_thinking = enable_thinking if enable_thinking is defined else True %}
+ {%- set truncate_history_thinking = truncate_history_thinking if truncate_history_thinking is defined else True %}
+
+ {%- set ns = namespace(last_user_idx = -1) %}
+ {%- set loop_messages = messages %}
+ {%- for m in loop_messages %}
+ {%- if m["role"] == "user" %}
+ {%- set ns.last_user_idx = loop.index0 %}
+ {%- endif %}
+ {%- endfor %}
+
+ {%- if messages[0]["role"] == "system" %}
+ {%- set system_message = messages[0]["content"] %}
+ {%- set loop_messages = messages[1:] %}
+ {%- else %}
+ {%- set system_message = "" %}
+ {%- set loop_messages = messages %}
+ {%- endif %}
+ {%- if not tools is defined %}
+ {%- set tools = [] %}
+ {%- endif %}
+ {# Recompute last_user_idx relative to loop_messages after handling system #}
+ {%- set ns = namespace(last_user_idx = -1) %}
+ {%- for m in loop_messages %}
+ {%- if m["role"] == "user" %}
+ {%- set ns.last_user_idx = loop.index0 %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if system_message is defined %}
+ {{- "<|im_start|>system\n" + system_message }}
+ {%- else %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- "<|im_start|>system\n" }}
+ {%- endif %}
+ {%- endif %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {%- if system_message is defined and system_message | length > 0 %}
+ {{- "\n\n" }}
+ {%- endif %}
+ {{- "# Tools\n\nYou have access to the following functions:\n\n" }}
+ {{- "<tools>" }}
+ {%- for tool in tools %}
+ {%- if tool.function is defined %}
+ {%- set tool = tool.function %}
+ {%- endif %}
+ {{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
+ {%- if tool.description is defined %}
+ {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
+ {%- endif %}
+ {{- '\n<parameters>' }}
+ {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
+ {%- for param_name, param_fields in tool.parameters.properties|items %}
+ {{- '\n<parameter>' }}
+ {{- '\n<name>' ~ param_name ~ '</name>' }}
+ {%- if param_fields.type is defined %}
+ {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
+ {%- endif %}
+ {%- if param_fields.description is defined %}
+ {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
+ {%- endif %}
+ {%- if param_fields.enum is defined %}
+ {{- '\n<enum>' ~ (param_fields.enum | tojson | safe) ~ '</enum>' }}
+ {%- endif %}
+ {%- set handled_keys = ['name', 'type', 'description', 'enum'] %}
+ {{- render_extra_keys(param_fields, handled_keys) }}
+ {{- '\n</parameter>' }}
+ {%- endfor %}
+ {%- endif %}
+ {% set handled_keys = ['type', 'properties', 'required'] %}
+ {{- render_extra_keys(tool.parameters, handled_keys) }}
+ {%- if tool.parameters is defined and tool.parameters.required is defined %}
+ {{- '\n<required>' ~ (tool.parameters.required | tojson | safe) ~ '</required>' }}
+ {%- endif %}
+ {{- '\n</parameters>' }}
+ {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
+ {{- render_extra_keys(tool, handled_keys) }}
+ {{- '\n</function>' }}
+ {%- endfor %}
+ {{- "\n</tools>" }}
+
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+ {%- endif %}
+
+
+ {%- if system_message is defined %}
+ {{- '<|im_end|>\n' }}
+ {%- else %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+
+ {%- for message in loop_messages %}
+ {%- if message.role == "assistant" %}
+ {# Add reasoning content into the content field for unified processing below. #}
+ {%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
+ {%- set content = "<think>\n" ~ message.reasoning_content ~ "\n</think>\n" ~ (message.content | default('', true)) %}
+ {%- else %}
+ {%- set content = message.content | default('', true) %}
+ {%- if content is string -%}
+ {# Allow downstream logic to take care of broken thought; only handle coherent reasoning here. #}
+ {%- if '<think>' not in content and '</think>' not in content -%}
+ {%- set content = "<think></think>" ~ content -%}
+ {%- endif -%}
+ {%- else -%}
+ {%- set content = content -%}
+ {%- endif -%}
+ {%- endif %}
+ {%- if message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
+ {# Assistant message has tool calls. #}
+ {{- '<|im_start|>assistant\n' }}
+ {%- set include_content = not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
+ {%- if content is string and content | trim | length > 0 %}
+ {%- if include_content %}
+ {{- (content | trim) ~ '\n' -}}
+ {%- else %}
+ {%- set c = (content | string) %}
+ {%- if '</think>' in c %}
+ {# Keep only content after the last closing think. Also generation prompt causes this. #}
+ {%- set c = c.split('</think>')[-1] %}
+ {%- elif '<think>' in c %}
+ {# If <think> was opened but never closed, drop the trailing think segment #}
+ {%- set c = c.split('<think>')[0] %}
+ {%- endif %}
+ {%- set c = "<think></think>" ~ c | trim %}
+ {%- if c | length > 0 %}
+ {{- c ~ '\n' -}}
+ {%- endif %}
+ {%- endif %}
+ {%- else %}
+ {{- "<think></think>" -}}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n<function=' ~ tool_call.name ~ '>\n' -}}
+ {%- if tool_call.arguments is defined %}
+ {%- for args_name, args_value in tool_call.arguments|items %}
+ {{- '<parameter=' ~ args_name ~ '>\n' -}}
+ {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+ {{- args_value ~ '\n</parameter>\n' -}}
+ {%- endfor %}
+ {%- endif %}
+ {{- '</function>\n</tool_call>\n' -}}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- else %}
+ {# Assistant message doesn't have tool calls. #}
+ {%- if not (truncate_history_thinking and loop.index0 < ns.last_user_idx) %}
+ {{- '<|im_start|>assistant\n' ~ (content | default('', true) | string | trim) ~ '<|im_end|>\n' }}
+ {%- else %}
+ {%- set c = (content | default('', true) | string) %}
+ {%- if '<think>' in c and '</think>' in c %}
+ {%- set c = "<think></think>" ~ c.split('</think>')[-1] %}
+ {%- endif %}
+ {%- set c = c | trim %}
+ {%- if c | length > 0 %}
+ {{- '<|im_start|>assistant\n' ~ c ~ '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>assistant\n<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endif %}
+ {%- elif message.role == "user" or message.role == "system" %}
+ {{- '<|im_start|>' + message.role + '\n' }}
+ {%- set content = message.content | string %}
+ {{- content }}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
+ {{- '<|im_start|>user\n' }}
+ {%- endif %}
+ {{- '<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>\n' }}
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
+ {{- '<|im_end|>\n' }}
+ {%- elif loop.last %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endfor %}
+
+ {%- if add_generation_prompt %}
199
+ {%- if enable_thinking %}
200
+ {{- '<|im_start|>assistant\n<think>\n' }}
201
+ {%- else %}
202
+ {{- '<|im_start|>assistant\n<think></think>' }}
203
+ {%- endif %}
204
+ {%- endif %}
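The history branch of the template above repeatedly applies one rule: for past assistant turns, drop the reasoning inside `<think>…</think>` and keep only what follows, re-prefixing an empty think block. A plain-Python sketch of that rule (my re-expression for illustration, not a file from this repo):

```python
def strip_history_thinking(content: str) -> str:
    """Mirror the template's history-truncation rule for one assistant turn:
    keep only text after the last closing </think>, drop an unclosed <think>
    segment, and re-prefix an empty think block."""
    if '</think>' in content:
        # Keep only content after the last closing think tag.
        content = content.split('</think>')[-1]
    elif '<think>' in content:
        # <think> was opened but never closed: drop the trailing segment.
        content = content.split('<think>')[0]
    return '<think></think>' + content.strip()
```

This keeps the rendered history short while preserving the final answers of earlier turns.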
config.json ADDED
@@ -0,0 +1,80 @@
+ {
+ "architectures": [
+ "SuperlinearExpForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_superlinear_exp.SuperlinearExpConfig",
+ "AutoModel": "modeling_superlinear_exp.SuperlinearExpForCausalLM",
+ "AutoModelForCausalLM": "modeling_superlinear_exp.SuperlinearExpForCausalLM"
+ },
+ "bos_token_id": 1,
+ "chunk_size": 128,
+ "conv_kernel": 4,
+ "decode_kernel": "staged-gqa",
+ "enable_cuda_graph": true,
+ "enable_shared_fused_moe": true,
+ "eos_token_id": 2,
+ "expand": 2,
+ "head_dim": 128,
+ "hidden_dropout": 0.0,
+ "hidden_size": 2688,
+ "hybrid_override_pattern": "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
+ "initializer_range": 0.02,
+ "intermediate_size": 1856,
+ "layer_norm_epsilon": 1e-05,
+ "mamba_head_dim": 64,
+ "mamba_hidden_act": "silu",
+ "mamba_num_heads": 64,
+ "mamba_proj_bias": false,
+ "mamba_ssm_cache_dtype": "float32",
+ "max_position_embeddings": 262144,
+ "mlp_bias": false,
+ "mlp_hidden_act": "relu2",
+ "model_type": "superlinear-exp",
+ "moe_intermediate_size": 1856,
+ "moe_shared_expert_intermediate_size": 3712,
+ "n_group": 1,
+ "n_groups": 8,
+ "n_routed_experts": 128,
+ "n_shared_experts": 1,
+ "norm_eps": 1e-05,
+ "norm_topk_prob": true,
+ "num_attention_heads": 32,
+ "num_experts_per_tok": 6,
+ "num_hidden_layers": 52,
+ "num_key_value_heads": 2,
+ "num_logits_to_keep": 1,
+ "pad_token_id": 0,
+ "partial_rotary_factor": 1.0,
+ "rescale_prenorm_residual": true,
+ "residual_in_fp32": false,
+ "rope_theta": 10000,
+ "routed_scaling_factor": 2.5,
+ "sliding_window": null,
+ "span_attention_backward_factor": 3.0,
+ "span_attention_forward_factor": 1.0,
+ "span_attention_inv_search_power_int": null,
+ "span_attention_num_spans": 3,
+ "span_attention_search_power": 0.55,
+ "span_attention_span_power": 0.55,
+ "span_attention_sw_index": 65,
+ "ssm_state_size": 128,
+ "tie_word_embeddings": false,
+ "time_step_floor": 0.0001,
+ "time_step_limit": [
+ 0.0,
+ Infinity
+ ],
+ "time_step_max": 0.1,
+ "time_step_min": 0.001,
+ "topk_group": 1,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.55.4",
+ "use_bias": false,
+ "use_conv_bias": true,
+ "use_mamba_kernels": true,
+ "vocab_size": 131072
+ }
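The `hybrid_override_pattern` string in this config encodes one character per layer, and the configuration class below decodes it layer by layer (in its `layers_block_type` property: `M` is Mamba2, `*` is attention, `-` is MLP, anything else, here `E`, is MoE). A small standalone sketch of that decoding, applied to the pattern above:

```python
# Decode the hybrid_override_pattern from config.json into per-layer block
# types, mirroring the layers_block_type property of the configuration class.
from collections import Counter

def layers_block_type(pattern: str) -> list[str]:
    mapping = {"M": "mamba", "*": "attention", "-": "mlp"}
    # Any character outside the mapping (e.g. 'E') is treated as an MoE layer.
    return [mapping.get(ch, "moe") for ch in pattern]

pattern = "MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME"
blocks = layers_block_type(pattern)
print(Counter(blocks))
```

The pattern is 52 characters long, matching `num_hidden_layers`, and alternates Mamba2 and MoE layers with attention interleaved every few layers.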
configuration_superlinear_exp.py ADDED
@@ -0,0 +1,341 @@
+ # coding=utf-8
+ # Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """SuperlinearExp model configuration"""
+
+ import math
+ import re
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class SuperlinearExpConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`SuperlinearExpModel`]. It is used to instantiate a
+ SuperlinearExp model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the SuperlinearExp-v0.1 model.
+ [todo](todo)
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 131072):
+ Vocabulary size of the SuperlinearExp model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`SuperlinearExpModel`]
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+ model has an output word embedding layer.
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 21504):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 52):
+ Number of hidden layers in the Transformer encoder.
+ hybrid_override_pattern (`str`, *optional*, defaults to `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+ The pattern of the hybrid model. The pattern is a string with one character per layer: M: Mamba2, *: Attention, -: MLP
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ head_dim (`int`, *optional*, defaults to 128):
+ Dimension of each attention head.
+ num_key_value_heads (`int`, *optional*, defaults to 8):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used.
+ mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+ The non-linear activation function in the MLP layers.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use bias in attention layers.
+ mlp_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use bias in MLP layers.
+ use_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use bias in the model.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+ Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+ Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+ integer value, only last `num_logits_to_keep` logits will be calculated.
+ pad_token_id (`int`, *optional*, defaults to 0):
+ The id of the padding token.
+ bos_token_id (`int`, *optional*, defaults to 1):
+ The id of the "beginning-of-sequence" token.
+ eos_token_id (`int`, *optional*, defaults to 2):
+ The id of the "end-of-sequence" token.
+ sliding_window (`int`, *optional*, defaults to None):
+ Sliding window attention window size.
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
+ The maximum sequence length that this model might ever be used with.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ hidden_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the hidden states.
+ use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+ Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
+ `causal-conv1d` are installed, and the mamba modules are running on a CUDA device.
+ ssm_state_size (`int`, *optional*, defaults to 128):
+ The dimension of the mamba state space latents.
+ mamba_num_heads (`int`, *optional*, defaults to 128):
+ Number of heads in Mamba layers.
+ mamba_n_groups (`int`, *optional*, defaults to 8):
+ Number of groups in Mamba layers.
+ mamba_head_dim (`int`, *optional*, defaults to 64):
+ Dimension of each Mamba head.
+ mamba_d_conv (`int`, *optional*, defaults to 4):
+ The size of the mamba convolution kernel.
+ mamba_expand (`int`, *optional*, defaults to 2):
+ Expanding factor used to determine the mamba intermediate size.
+ mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+ The non-linear activation function in the Mamba layers.
+ mamba_dt_min (`float`, *optional*, defaults to 0.001):
+ Minimum value for the time step in Mamba.
+ mamba_dt_max (`float`, *optional*, defaults to 0.1):
+ Maximum value for the time step in Mamba.
+ mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+ Limits for the time step in Mamba.
+ mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+ Floor value for time step initialization in Mamba.
+ mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to use bias in the convolution layer of the mamba mixer block.
+ mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use bias in the input and output projections of the mamba mixer block.
+ mamba_chunk_size (`int`, *optional*, defaults to 128):
+ Size of chunks for Mamba processing.
+ rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the pre-normalization residual connections.
+ """
+
+ model_type = "superlinear-exp"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=131072,
+ tie_word_embeddings=False,
+ hidden_size=4096,
+ intermediate_size=21504,
+ num_hidden_layers=52,
+ hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+ num_attention_heads=32,
+ head_dim=128,
+ num_key_value_heads=8, # nemo: num_query_groups
+ mlp_hidden_act="relu2",
+ attention_bias=False,
+ mlp_bias=False,
+ use_bias=False,
+ initializer_range=0.02, # nemo: init_method_std
+ layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon
+ residual_in_fp32=False, # Megatron Core default value
+ use_cache=True,
+ num_logits_to_keep=1,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ sliding_window=None,
+ max_position_embeddings=4096,
+ attention_dropout=0.0,
+ hidden_dropout=0.0, # * ADDED
+ use_mamba_kernels=True,
+ ssm_state_size=128, # mamba_state_size
+ mamba_num_heads=128,
+ mamba_n_groups=8, # nemo: mamba_ssm_ngroups = num_heads
+ mamba_head_dim=64,
+ mamba_d_conv=4,
+ mamba_expand=2,
+ mamba_hidden_act="silu",
+ mamba_dt_min=0.001,
+ mamba_dt_max=0.1,
+ mamba_dt_limit=(0.0, float("inf")),
+ mamba_dt_init_floor=1e-4,
+ mamba_conv_bias=True,
+ mamba_proj_bias=False,
+ mamba_chunk_size=128,
+ rescale_prenorm_residual=True,
+ span_attention_sw_index=65,
+ span_attention_num_spans=3,
+ span_attention_backward_factor: float = 3.0,
+ span_attention_forward_factor: float = 1.0,
+ span_attention_span_power: float = 0.55,
+ span_attention_search_power: float | None = 0.55,
+ span_attention_inv_search_power_int: int | None = None,
+ decode_kernel="staged-gqa",
+ enable_cuda_graph: bool = True,
+ enable_shared_fused_moe: bool = True,
+ n_routed_experts=8,
+ n_shared_experts=1,
+ moe_intermediate_size=7688,
+ moe_shared_expert_intermediate_size=7688,
+ num_experts_per_tok=2,
+ routed_scaling_factor=1.0,
+ n_group=1,
+ topk_group=1,
+ norm_topk_prob=True,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.tie_word_embeddings = tie_word_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.hybrid_override_pattern = hybrid_override_pattern
+ self.num_attention_heads = num_attention_heads
+ self.head_dim = head_dim
+ self.sliding_window = sliding_window
+ self.max_position_embeddings = max_position_embeddings
+ self.attention_dropout = attention_dropout
+ self.hidden_dropout = hidden_dropout
+
+ # Span attention configuration
+ self.span_attention_sw_index = span_attention_sw_index
+ self.span_attention_num_spans = span_attention_num_spans
+ self.span_attention_backward_factor = float(span_attention_backward_factor)
+ self.span_attention_forward_factor = float(span_attention_forward_factor)
+ self.span_attention_span_power = float(span_attention_span_power)
+ if not math.isfinite(self.span_attention_backward_factor) or self.span_attention_backward_factor < 0.0:
+ raise ValueError(
+ "span_attention_backward_factor must be finite and >= 0 "
+ f"(got {self.span_attention_backward_factor})"
+ )
+ if not math.isfinite(self.span_attention_forward_factor) or self.span_attention_forward_factor < 0.0:
+ raise ValueError(
+ "span_attention_forward_factor must be finite and >= 0 "
+ f"(got {self.span_attention_forward_factor})"
+ )
+ if (self.span_attention_backward_factor + self.span_attention_forward_factor) <= 0.0:
+ raise ValueError(
+ "span_attention_backward_factor + span_attention_forward_factor must be > 0 "
+ f"(got {self.span_attention_backward_factor + self.span_attention_forward_factor})"
+ )
+ if not math.isfinite(self.span_attention_span_power) or not (0.0 < self.span_attention_span_power < 1.0):
+ raise ValueError(
+ "span_attention_span_power must be finite and in (0, 1) "
+ f"(got {self.span_attention_span_power})"
+ )
+
+ # Stripe power parameters (search stripes + sliding window width).
+ if (span_attention_inv_search_power_int is None) == (span_attention_search_power is None):
+ raise ValueError(
+ "Provide exactly one of span_attention_inv_search_power_int or span_attention_search_power"
+ )
+ if span_attention_inv_search_power_int is not None:
+ inv_n = int(span_attention_inv_search_power_int)
+ if inv_n not in (2, 3, 4, 5, 6):
+ raise ValueError(
+ "span_attention_inv_search_power_int must be one of (2, 3, 4, 5, 6) "
+ f"(got {span_attention_inv_search_power_int})"
+ )
+ self.span_attention_inv_search_power_int = inv_n
+ self.span_attention_search_power = None
+ derived_p = 1.0 / float(inv_n)
+ else:
+ p = float(span_attention_search_power)
+ if not math.isfinite(p) or not (0.0 < p < 1.0):
+ raise ValueError(
+ "span_attention_search_power must be finite and in (0, 1) "
+ f"(got {span_attention_search_power})"
+ )
+ self.span_attention_inv_search_power_int = None
+ self.span_attention_search_power = p
+ derived_p = p
+
+ # Critical coverage constraint (see 47.1_generalized_stripe_power_design.md):
+ # span_power >= 1 - search_power.
+ if self.span_attention_span_power + derived_p < 1.0:
+ raise ValueError(
+ "span_attention_span_power must satisfy span_power >= 1 - search_power "
+ f"(got span_power={self.span_attention_span_power}, search_power={derived_p})"
+ )
+
+ if decode_kernel not in (None, "staged", "staged-gqa"):
+ raise ValueError(
+ f"Invalid decode_kernel={decode_kernel!r}; expected one of None, 'staged', 'staged-gqa'."
+ )
+ self.decode_kernel = decode_kernel
+ self.enable_cuda_graph = enable_cuda_graph
+ self.enable_shared_fused_moe = enable_shared_fused_moe
+
+ # Validate hybrid_override_pattern
+ # M: Mamba2, *: Attention, -: MLP, E: MoE
+ assert len(self.hybrid_override_pattern) == self.num_hidden_layers, "hybrid_override_pattern must have the same length as num_hidden_layers"
+ assert re.match(r"^[M*E-]+$", self.hybrid_override_pattern), "hybrid_override_pattern must only contain characters 'M', '*', '-', or 'E'"
+
+ # for backward compatibility
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.mlp_hidden_act = mlp_hidden_act
+ self.attention_bias = attention_bias
+ self.mlp_bias = mlp_bias
+ self.use_bias = use_bias
+ self.initializer_range = initializer_range
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.residual_in_fp32 = residual_in_fp32
+
+ self.use_cache = use_cache
+ self.num_logits_to_keep = num_logits_to_keep
+
+ self.use_mamba_kernels = use_mamba_kernels
+ self.n_groups = mamba_n_groups
+ self.mamba_head_dim = mamba_head_dim
+ self.ssm_state_size = ssm_state_size
+ self.mamba_num_heads = mamba_num_heads
+ self.conv_kernel = mamba_d_conv
+ self.expand = mamba_expand
+ self.mamba_hidden_act = mamba_hidden_act
+ self.time_step_min = mamba_dt_min
+ self.time_step_max = mamba_dt_max
+ self.time_step_limit = mamba_dt_limit
+ self.time_step_floor = mamba_dt_init_floor
+ self.use_conv_bias = mamba_conv_bias
+ self.mamba_proj_bias = mamba_proj_bias
+ self.chunk_size = mamba_chunk_size
+ self.rescale_prenorm_residual = rescale_prenorm_residual
+ self.n_routed_experts = n_routed_experts
+ self.n_shared_experts = n_shared_experts
+ self.moe_intermediate_size = moe_intermediate_size
+ self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
+ self.num_experts_per_tok = num_experts_per_tok
+ self.routed_scaling_factor = routed_scaling_factor
+ self.n_group = n_group
+ self.topk_group = topk_group
+ self.norm_topk_prob = norm_topk_prob
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ @property
+ def layers_block_type(self):
+ return [
+ "mamba" if self.hybrid_override_pattern[i] == "M" else
+ "attention" if self.hybrid_override_pattern[i] == "*" else
+ "mlp" if self.hybrid_override_pattern[i] == "-" else "moe"
+ for i in range(self.num_hidden_layers)]
generation_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": [
+ 2,
+ 11
+ ],
+ "pad_token_id": 0,
+ "transformers_version": "4.57.3"
+ }
model-00001-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97abe5c9503b8f60e502d509898c658fb8c366b32308463e6fc2cd22d9533973
+ size 4654236816
model-00002-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:593e019ec60819249448c91e647abff72d9171a212d1664c0957dd9b2cb94bad
+ size 4136509104
model-00003-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:595c430f59919f62e1c3ba70596e853f004bb20710cbbe5b679ba8b22ef4c720
+ size 3949593672
model-00004-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4fb6e9c083f14e67e0a8c6f1c90dd73d77fd0dadbc3a76203ead3f5e3b67cf91
+ size 4213999864
model-00005-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40bc0337b6b2f619e9641278ed97058cc099ca001d4d3ff80de0d97a886b5411
+ size 3949593672
model-00006-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6104484169ae2ffa3b919187a176e762414dfe48a6d858461db935278441ffcd
+ size 4136509104
model-00007-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0bb006d8d803aa869ad584a8c6e5926b398f0c04ddc1740d842d3d6a369f8ee
+ size 3872102896
model-00008-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:485cf181874c6b9cb817407bab34b8334e5515329323f9683d7e93b25679c331
+ size 4136509096
model-00009-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b49942cf27c63bec56f5484b113c50442735a3c5885890b43f54cb0509ee555
+ size 3949593672
model-00010-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cd5c8dfe3bba15d056458fca0413511f576c3fae10b9f2daae88d247b17db1b
+ size 4145181008
model-00011-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71e77335c4a7af5d8859a2108039a4f30bebcaf278ebd4b56f9dd8f403495abb
+ size 4018412536
model-00012-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad603897bb6b7c951bd69e9f89009521909163c8a6195c4fdab00b545fd99c03
+ size 4067690240
model-00013-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e79a65b74d9ad6261748131aa86a7d84833bdf5d7609816219e4c7d5bd4c6ec0
+ size 3949593672
model-00014-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f97e980088587a0b273d4709aad56b9e33b31e6887b483941f67fa06c0a2ee4
+ size 4059018320
model-00015-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:684690b0a3770d79e2466e26fc5d75930572dea5e3b1e42e3aeaff49bcbb5566
+ size 3949593656
model-00016-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:96d2eda64709c79a4d25ebc2d96d8abe118bbda0150f7874add27cf9695db7f2
+ size 2099910896
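The LFS pointers above record each shard's on-disk byte size. Summing them should land slightly above the index's `"total_size"` of 63288001152 bytes, since each safetensors shard also carries a small JSON header in addition to tensor data. A quick check (sizes copied from the pointers above):

```python
# Sum the 16 shard sizes from the LFS pointers and compare against the
# tensor-data total recorded in model.safetensors.index.json.
shard_sizes = [
    4654236816, 4136509104, 3949593672, 4213999864,
    3949593672, 4136509104, 3872102896, 4136509096,
    3949593672, 4145181008, 4018412536, 4067690240,
    3949593672, 4059018320, 3949593656, 2099910896,
]
total_bytes = sum(shard_sizes)
print(f"{total_bytes / 1e9:.1f} GB across {len(shard_sizes)} shards")
```

The small surplus over `total_size` is the per-shard header overhead.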
model.safetensors.index.json ADDED
@@ -0,0 +1,414 @@
+ {
+ "metadata": {
+ "total_size": 63288001152
+ },
+ "weight_map": {
+ "backbone.embeddings.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.A_log": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.D": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.conv1d.bias": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.conv1d.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.dt_bias": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.in_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.norm.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.mixer.out_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.0.norm.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.mixer.experts.down_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.mixer.experts.up_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.mixer.gate.e_score_correction_bias": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.mixer.gate.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.mixer.shared_experts.down_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.mixer.shared_experts.up_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.1.norm.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.10.mixer.experts.down_proj.weight": "model-00001-of-00016.safetensors",
+ "backbone.layers.10.mixer.experts.up_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.10.mixer.gate.e_score_correction_bias": "model-00002-of-00016.safetensors",
+ "backbone.layers.10.mixer.gate.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.10.mixer.shared_experts.down_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.10.mixer.shared_experts.up_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.10.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.A_log": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.D": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.conv1d.bias": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.conv1d.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.dt_bias": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.in_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.mixer.out_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.11.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.12.mixer.k_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.12.mixer.o_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.12.mixer.q_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.12.mixer.s_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.12.mixer.v_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.12.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.mixer.experts.down_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.mixer.experts.up_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.mixer.gate.e_score_correction_bias": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.mixer.gate.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.mixer.shared_experts.down_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.mixer.shared_experts.up_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.13.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.A_log": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.D": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.conv1d.bias": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.conv1d.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.dt_bias": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.in_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.mixer.out_proj.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.14.norm.weight": "model-00002-of-00016.safetensors",
+ "backbone.layers.15.mixer.experts.down_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.15.mixer.experts.up_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.15.mixer.gate.e_score_correction_bias": "model-00003-of-00016.safetensors",
+ "backbone.layers.15.mixer.gate.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.15.mixer.shared_experts.down_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.15.mixer.shared_experts.up_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.15.norm.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.A_log": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.D": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.conv1d.bias": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.conv1d.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.dt_bias": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.in_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.norm.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.mixer.out_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.16.norm.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.17.mixer.experts.down_proj.weight": "model-00003-of-00016.safetensors",
+ "backbone.layers.17.mixer.experts.up_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.17.mixer.gate.e_score_correction_bias": "model-00004-of-00016.safetensors",
+ "backbone.layers.17.mixer.gate.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.17.mixer.shared_experts.down_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.17.mixer.shared_experts.up_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.17.norm.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.A_log": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.D": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.conv1d.bias": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.conv1d.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.dt_bias": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.in_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.norm.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.mixer.out_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.18.norm.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.19.mixer.k_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.19.mixer.o_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.19.mixer.q_proj.weight": "model-00004-of-00016.safetensors",
+ "backbone.layers.19.mixer.s_proj.weight": "model-00004-of-00016.safetensors",
97
+ "backbone.layers.19.mixer.v_proj.weight": "model-00004-of-00016.safetensors",
98
+ "backbone.layers.19.norm.weight": "model-00004-of-00016.safetensors",
99
+ "backbone.layers.2.mixer.A_log": "model-00004-of-00016.safetensors",
100
+ "backbone.layers.2.mixer.D": "model-00004-of-00016.safetensors",
101
+ "backbone.layers.2.mixer.conv1d.bias": "model-00004-of-00016.safetensors",
102
+ "backbone.layers.2.mixer.conv1d.weight": "model-00004-of-00016.safetensors",
103
+ "backbone.layers.2.mixer.dt_bias": "model-00004-of-00016.safetensors",
104
+ "backbone.layers.2.mixer.in_proj.weight": "model-00004-of-00016.safetensors",
105
+ "backbone.layers.2.mixer.norm.weight": "model-00004-of-00016.safetensors",
106
+ "backbone.layers.2.mixer.out_proj.weight": "model-00004-of-00016.safetensors",
107
+ "backbone.layers.2.norm.weight": "model-00004-of-00016.safetensors",
108
+ "backbone.layers.20.mixer.experts.down_proj.weight": "model-00004-of-00016.safetensors",
109
+ "backbone.layers.20.mixer.experts.up_proj.weight": "model-00004-of-00016.safetensors",
110
+ "backbone.layers.20.mixer.gate.e_score_correction_bias": "model-00004-of-00016.safetensors",
111
+ "backbone.layers.20.mixer.gate.weight": "model-00004-of-00016.safetensors",
112
+ "backbone.layers.20.mixer.shared_experts.down_proj.weight": "model-00004-of-00016.safetensors",
113
+ "backbone.layers.20.mixer.shared_experts.up_proj.weight": "model-00004-of-00016.safetensors",
114
+ "backbone.layers.20.norm.weight": "model-00004-of-00016.safetensors",
115
+ "backbone.layers.21.mixer.A_log": "model-00004-of-00016.safetensors",
116
+ "backbone.layers.21.mixer.D": "model-00004-of-00016.safetensors",
117
+ "backbone.layers.21.mixer.conv1d.bias": "model-00004-of-00016.safetensors",
118
+ "backbone.layers.21.mixer.conv1d.weight": "model-00004-of-00016.safetensors",
119
+ "backbone.layers.21.mixer.dt_bias": "model-00004-of-00016.safetensors",
120
+ "backbone.layers.21.mixer.in_proj.weight": "model-00004-of-00016.safetensors",
121
+ "backbone.layers.21.mixer.norm.weight": "model-00004-of-00016.safetensors",
122
+ "backbone.layers.21.mixer.out_proj.weight": "model-00004-of-00016.safetensors",
123
+ "backbone.layers.21.norm.weight": "model-00004-of-00016.safetensors",
124
+ "backbone.layers.22.mixer.experts.down_proj.weight": "model-00005-of-00016.safetensors",
125
+ "backbone.layers.22.mixer.experts.up_proj.weight": "model-00005-of-00016.safetensors",
126
+ "backbone.layers.22.mixer.gate.e_score_correction_bias": "model-00005-of-00016.safetensors",
127
+ "backbone.layers.22.mixer.gate.weight": "model-00005-of-00016.safetensors",
128
+ "backbone.layers.22.mixer.shared_experts.down_proj.weight": "model-00005-of-00016.safetensors",
129
+ "backbone.layers.22.mixer.shared_experts.up_proj.weight": "model-00005-of-00016.safetensors",
130
+ "backbone.layers.22.norm.weight": "model-00005-of-00016.safetensors",
131
+ "backbone.layers.23.mixer.A_log": "model-00005-of-00016.safetensors",
132
+ "backbone.layers.23.mixer.D": "model-00005-of-00016.safetensors",
133
+ "backbone.layers.23.mixer.conv1d.bias": "model-00005-of-00016.safetensors",
134
+ "backbone.layers.23.mixer.conv1d.weight": "model-00005-of-00016.safetensors",
135
+ "backbone.layers.23.mixer.dt_bias": "model-00005-of-00016.safetensors",
136
+ "backbone.layers.23.mixer.in_proj.weight": "model-00005-of-00016.safetensors",
137
+ "backbone.layers.23.mixer.norm.weight": "model-00005-of-00016.safetensors",
138
+ "backbone.layers.23.mixer.out_proj.weight": "model-00005-of-00016.safetensors",
139
+ "backbone.layers.23.norm.weight": "model-00005-of-00016.safetensors",
140
+ "backbone.layers.24.mixer.experts.down_proj.weight": "model-00005-of-00016.safetensors",
141
+ "backbone.layers.24.mixer.experts.up_proj.weight": "model-00006-of-00016.safetensors",
142
+ "backbone.layers.24.mixer.gate.e_score_correction_bias": "model-00006-of-00016.safetensors",
143
+ "backbone.layers.24.mixer.gate.weight": "model-00006-of-00016.safetensors",
144
+ "backbone.layers.24.mixer.shared_experts.down_proj.weight": "model-00006-of-00016.safetensors",
145
+ "backbone.layers.24.mixer.shared_experts.up_proj.weight": "model-00006-of-00016.safetensors",
146
+ "backbone.layers.24.norm.weight": "model-00006-of-00016.safetensors",
147
+ "backbone.layers.25.mixer.A_log": "model-00006-of-00016.safetensors",
148
+ "backbone.layers.25.mixer.D": "model-00006-of-00016.safetensors",
149
+ "backbone.layers.25.mixer.conv1d.bias": "model-00006-of-00016.safetensors",
150
+ "backbone.layers.25.mixer.conv1d.weight": "model-00006-of-00016.safetensors",
151
+ "backbone.layers.25.mixer.dt_bias": "model-00006-of-00016.safetensors",
152
+ "backbone.layers.25.mixer.in_proj.weight": "model-00006-of-00016.safetensors",
153
+ "backbone.layers.25.mixer.norm.weight": "model-00006-of-00016.safetensors",
154
+ "backbone.layers.25.mixer.out_proj.weight": "model-00006-of-00016.safetensors",
155
+ "backbone.layers.25.norm.weight": "model-00006-of-00016.safetensors",
156
+ "backbone.layers.26.mixer.k_proj.weight": "model-00006-of-00016.safetensors",
157
+ "backbone.layers.26.mixer.o_proj.weight": "model-00006-of-00016.safetensors",
158
+ "backbone.layers.26.mixer.q_proj.weight": "model-00006-of-00016.safetensors",
159
+ "backbone.layers.26.mixer.s_proj.weight": "model-00006-of-00016.safetensors",
160
+ "backbone.layers.26.mixer.v_proj.weight": "model-00006-of-00016.safetensors",
161
+ "backbone.layers.26.norm.weight": "model-00006-of-00016.safetensors",
162
+ "backbone.layers.27.mixer.experts.down_proj.weight": "model-00006-of-00016.safetensors",
163
+ "backbone.layers.27.mixer.experts.up_proj.weight": "model-00006-of-00016.safetensors",
164
+ "backbone.layers.27.mixer.gate.e_score_correction_bias": "model-00006-of-00016.safetensors",
165
+ "backbone.layers.27.mixer.gate.weight": "model-00006-of-00016.safetensors",
166
+ "backbone.layers.27.mixer.shared_experts.down_proj.weight": "model-00006-of-00016.safetensors",
167
+ "backbone.layers.27.mixer.shared_experts.up_proj.weight": "model-00006-of-00016.safetensors",
168
+ "backbone.layers.27.norm.weight": "model-00006-of-00016.safetensors",
169
+ "backbone.layers.28.mixer.A_log": "model-00006-of-00016.safetensors",
170
+ "backbone.layers.28.mixer.D": "model-00006-of-00016.safetensors",
171
+ "backbone.layers.28.mixer.conv1d.bias": "model-00006-of-00016.safetensors",
172
+ "backbone.layers.28.mixer.conv1d.weight": "model-00006-of-00016.safetensors",
173
+ "backbone.layers.28.mixer.dt_bias": "model-00006-of-00016.safetensors",
174
+ "backbone.layers.28.mixer.in_proj.weight": "model-00006-of-00016.safetensors",
175
+ "backbone.layers.28.mixer.norm.weight": "model-00006-of-00016.safetensors",
176
+ "backbone.layers.28.mixer.out_proj.weight": "model-00006-of-00016.safetensors",
177
+ "backbone.layers.28.norm.weight": "model-00006-of-00016.safetensors",
178
+ "backbone.layers.29.mixer.experts.down_proj.weight": "model-00007-of-00016.safetensors",
179
+ "backbone.layers.29.mixer.experts.up_proj.weight": "model-00007-of-00016.safetensors",
180
+ "backbone.layers.29.mixer.gate.e_score_correction_bias": "model-00007-of-00016.safetensors",
181
+ "backbone.layers.29.mixer.gate.weight": "model-00007-of-00016.safetensors",
182
+ "backbone.layers.29.mixer.shared_experts.down_proj.weight": "model-00007-of-00016.safetensors",
183
+ "backbone.layers.29.mixer.shared_experts.up_proj.weight": "model-00007-of-00016.safetensors",
184
+ "backbone.layers.29.norm.weight": "model-00007-of-00016.safetensors",
185
+ "backbone.layers.3.mixer.experts.down_proj.weight": "model-00007-of-00016.safetensors",
186
+ "backbone.layers.3.mixer.experts.up_proj.weight": "model-00008-of-00016.safetensors",
187
+ "backbone.layers.3.mixer.gate.e_score_correction_bias": "model-00008-of-00016.safetensors",
188
+ "backbone.layers.3.mixer.gate.weight": "model-00008-of-00016.safetensors",
189
+ "backbone.layers.3.mixer.shared_experts.down_proj.weight": "model-00008-of-00016.safetensors",
190
+ "backbone.layers.3.mixer.shared_experts.up_proj.weight": "model-00008-of-00016.safetensors",
191
+ "backbone.layers.3.norm.weight": "model-00008-of-00016.safetensors",
192
+ "backbone.layers.30.mixer.A_log": "model-00008-of-00016.safetensors",
193
+ "backbone.layers.30.mixer.D": "model-00008-of-00016.safetensors",
194
+ "backbone.layers.30.mixer.conv1d.bias": "model-00008-of-00016.safetensors",
195
+ "backbone.layers.30.mixer.conv1d.weight": "model-00008-of-00016.safetensors",
196
+ "backbone.layers.30.mixer.dt_bias": "model-00008-of-00016.safetensors",
197
+ "backbone.layers.30.mixer.in_proj.weight": "model-00008-of-00016.safetensors",
198
+ "backbone.layers.30.mixer.norm.weight": "model-00008-of-00016.safetensors",
199
+ "backbone.layers.30.mixer.out_proj.weight": "model-00008-of-00016.safetensors",
200
+ "backbone.layers.30.norm.weight": "model-00008-of-00016.safetensors",
201
+ "backbone.layers.31.mixer.experts.down_proj.weight": "model-00008-of-00016.safetensors",
202
+ "backbone.layers.31.mixer.experts.up_proj.weight": "model-00008-of-00016.safetensors",
203
+ "backbone.layers.31.mixer.gate.e_score_correction_bias": "model-00008-of-00016.safetensors",
204
+ "backbone.layers.31.mixer.gate.weight": "model-00008-of-00016.safetensors",
205
+ "backbone.layers.31.mixer.shared_experts.down_proj.weight": "model-00008-of-00016.safetensors",
206
+ "backbone.layers.31.mixer.shared_experts.up_proj.weight": "model-00008-of-00016.safetensors",
207
+ "backbone.layers.31.norm.weight": "model-00008-of-00016.safetensors",
208
+ "backbone.layers.32.mixer.A_log": "model-00008-of-00016.safetensors",
209
+ "backbone.layers.32.mixer.D": "model-00008-of-00016.safetensors",
210
+ "backbone.layers.32.mixer.conv1d.bias": "model-00008-of-00016.safetensors",
211
+ "backbone.layers.32.mixer.conv1d.weight": "model-00008-of-00016.safetensors",
212
+ "backbone.layers.32.mixer.dt_bias": "model-00008-of-00016.safetensors",
213
+ "backbone.layers.32.mixer.in_proj.weight": "model-00008-of-00016.safetensors",
214
+ "backbone.layers.32.mixer.norm.weight": "model-00008-of-00016.safetensors",
215
+ "backbone.layers.32.mixer.out_proj.weight": "model-00008-of-00016.safetensors",
216
+ "backbone.layers.32.norm.weight": "model-00008-of-00016.safetensors",
217
+ "backbone.layers.33.mixer.k_proj.weight": "model-00008-of-00016.safetensors",
218
+ "backbone.layers.33.mixer.o_proj.weight": "model-00008-of-00016.safetensors",
219
+ "backbone.layers.33.mixer.q_proj.weight": "model-00008-of-00016.safetensors",
220
+ "backbone.layers.33.mixer.s_proj.weight": "model-00008-of-00016.safetensors",
221
+ "backbone.layers.33.mixer.v_proj.weight": "model-00008-of-00016.safetensors",
222
+ "backbone.layers.33.norm.weight": "model-00008-of-00016.safetensors",
223
+ "backbone.layers.34.mixer.experts.down_proj.weight": "model-00009-of-00016.safetensors",
224
+ "backbone.layers.34.mixer.experts.up_proj.weight": "model-00009-of-00016.safetensors",
225
+ "backbone.layers.34.mixer.gate.e_score_correction_bias": "model-00009-of-00016.safetensors",
226
+ "backbone.layers.34.mixer.gate.weight": "model-00009-of-00016.safetensors",
227
+ "backbone.layers.34.mixer.shared_experts.down_proj.weight": "model-00009-of-00016.safetensors",
228
+ "backbone.layers.34.mixer.shared_experts.up_proj.weight": "model-00009-of-00016.safetensors",
229
+ "backbone.layers.34.norm.weight": "model-00009-of-00016.safetensors",
230
+ "backbone.layers.35.mixer.A_log": "model-00009-of-00016.safetensors",
231
+ "backbone.layers.35.mixer.D": "model-00009-of-00016.safetensors",
232
+ "backbone.layers.35.mixer.conv1d.bias": "model-00009-of-00016.safetensors",
233
+ "backbone.layers.35.mixer.conv1d.weight": "model-00009-of-00016.safetensors",
234
+ "backbone.layers.35.mixer.dt_bias": "model-00009-of-00016.safetensors",
235
+ "backbone.layers.35.mixer.in_proj.weight": "model-00009-of-00016.safetensors",
236
+ "backbone.layers.35.mixer.norm.weight": "model-00009-of-00016.safetensors",
237
+ "backbone.layers.35.mixer.out_proj.weight": "model-00009-of-00016.safetensors",
238
+ "backbone.layers.35.norm.weight": "model-00009-of-00016.safetensors",
239
+ "backbone.layers.36.mixer.experts.down_proj.weight": "model-00009-of-00016.safetensors",
240
+ "backbone.layers.36.mixer.experts.up_proj.weight": "model-00010-of-00016.safetensors",
241
+ "backbone.layers.36.mixer.gate.e_score_correction_bias": "model-00010-of-00016.safetensors",
242
+ "backbone.layers.36.mixer.gate.weight": "model-00010-of-00016.safetensors",
243
+ "backbone.layers.36.mixer.shared_experts.down_proj.weight": "model-00010-of-00016.safetensors",
244
+ "backbone.layers.36.mixer.shared_experts.up_proj.weight": "model-00010-of-00016.safetensors",
245
+ "backbone.layers.36.norm.weight": "model-00010-of-00016.safetensors",
246
+ "backbone.layers.37.mixer.A_log": "model-00010-of-00016.safetensors",
247
+ "backbone.layers.37.mixer.D": "model-00010-of-00016.safetensors",
248
+ "backbone.layers.37.mixer.conv1d.bias": "model-00010-of-00016.safetensors",
249
+ "backbone.layers.37.mixer.conv1d.weight": "model-00010-of-00016.safetensors",
250
+ "backbone.layers.37.mixer.dt_bias": "model-00010-of-00016.safetensors",
251
+ "backbone.layers.37.mixer.in_proj.weight": "model-00010-of-00016.safetensors",
252
+ "backbone.layers.37.mixer.norm.weight": "model-00010-of-00016.safetensors",
253
+ "backbone.layers.37.mixer.out_proj.weight": "model-00010-of-00016.safetensors",
254
+ "backbone.layers.37.norm.weight": "model-00010-of-00016.safetensors",
255
+ "backbone.layers.38.mixer.experts.down_proj.weight": "model-00010-of-00016.safetensors",
256
+ "backbone.layers.38.mixer.experts.up_proj.weight": "model-00010-of-00016.safetensors",
257
+ "backbone.layers.38.mixer.gate.e_score_correction_bias": "model-00010-of-00016.safetensors",
258
+ "backbone.layers.38.mixer.gate.weight": "model-00010-of-00016.safetensors",
259
+ "backbone.layers.38.mixer.shared_experts.down_proj.weight": "model-00010-of-00016.safetensors",
260
+ "backbone.layers.38.mixer.shared_experts.up_proj.weight": "model-00010-of-00016.safetensors",
261
+ "backbone.layers.38.norm.weight": "model-00010-of-00016.safetensors",
262
+ "backbone.layers.39.mixer.A_log": "model-00010-of-00016.safetensors",
263
+ "backbone.layers.39.mixer.D": "model-00010-of-00016.safetensors",
264
+ "backbone.layers.39.mixer.conv1d.bias": "model-00010-of-00016.safetensors",
265
+ "backbone.layers.39.mixer.conv1d.weight": "model-00010-of-00016.safetensors",
266
+ "backbone.layers.39.mixer.dt_bias": "model-00010-of-00016.safetensors",
267
+ "backbone.layers.39.mixer.in_proj.weight": "model-00010-of-00016.safetensors",
268
+ "backbone.layers.39.mixer.norm.weight": "model-00010-of-00016.safetensors",
269
+ "backbone.layers.39.mixer.out_proj.weight": "model-00010-of-00016.safetensors",
270
+ "backbone.layers.39.norm.weight": "model-00010-of-00016.safetensors",
271
+ "backbone.layers.4.mixer.A_log": "model-00010-of-00016.safetensors",
272
+ "backbone.layers.4.mixer.D": "model-00010-of-00016.safetensors",
273
+ "backbone.layers.4.mixer.conv1d.bias": "model-00010-of-00016.safetensors",
274
+ "backbone.layers.4.mixer.conv1d.weight": "model-00010-of-00016.safetensors",
275
+ "backbone.layers.4.mixer.dt_bias": "model-00010-of-00016.safetensors",
276
+ "backbone.layers.4.mixer.in_proj.weight": "model-00010-of-00016.safetensors",
277
+ "backbone.layers.4.mixer.norm.weight": "model-00010-of-00016.safetensors",
278
+ "backbone.layers.4.mixer.out_proj.weight": "model-00010-of-00016.safetensors",
279
+ "backbone.layers.4.norm.weight": "model-00010-of-00016.safetensors",
280
+ "backbone.layers.40.mixer.experts.down_proj.weight": "model-00011-of-00016.safetensors",
281
+ "backbone.layers.40.mixer.experts.up_proj.weight": "model-00011-of-00016.safetensors",
282
+ "backbone.layers.40.mixer.gate.e_score_correction_bias": "model-00011-of-00016.safetensors",
283
+ "backbone.layers.40.mixer.gate.weight": "model-00011-of-00016.safetensors",
284
+ "backbone.layers.40.mixer.shared_experts.down_proj.weight": "model-00011-of-00016.safetensors",
285
+ "backbone.layers.40.mixer.shared_experts.up_proj.weight": "model-00011-of-00016.safetensors",
286
+ "backbone.layers.40.norm.weight": "model-00011-of-00016.safetensors",
287
+ "backbone.layers.41.mixer.A_log": "model-00011-of-00016.safetensors",
288
+ "backbone.layers.41.mixer.D": "model-00011-of-00016.safetensors",
289
+ "backbone.layers.41.mixer.conv1d.bias": "model-00011-of-00016.safetensors",
290
+ "backbone.layers.41.mixer.conv1d.weight": "model-00011-of-00016.safetensors",
291
+ "backbone.layers.41.mixer.dt_bias": "model-00011-of-00016.safetensors",
292
+ "backbone.layers.41.mixer.in_proj.weight": "model-00011-of-00016.safetensors",
293
+ "backbone.layers.41.mixer.norm.weight": "model-00011-of-00016.safetensors",
294
+ "backbone.layers.41.mixer.out_proj.weight": "model-00011-of-00016.safetensors",
295
+ "backbone.layers.41.norm.weight": "model-00011-of-00016.safetensors",
296
+ "backbone.layers.42.mixer.k_proj.weight": "model-00011-of-00016.safetensors",
297
+ "backbone.layers.42.mixer.o_proj.weight": "model-00011-of-00016.safetensors",
298
+ "backbone.layers.42.mixer.q_proj.weight": "model-00011-of-00016.safetensors",
299
+ "backbone.layers.42.mixer.s_proj.weight": "model-00011-of-00016.safetensors",
300
+ "backbone.layers.42.mixer.v_proj.weight": "model-00011-of-00016.safetensors",
301
+ "backbone.layers.42.norm.weight": "model-00011-of-00016.safetensors",
302
+ "backbone.layers.43.mixer.experts.down_proj.weight": "model-00011-of-00016.safetensors",
303
+ "backbone.layers.43.mixer.experts.up_proj.weight": "model-00012-of-00016.safetensors",
304
+ "backbone.layers.43.mixer.gate.e_score_correction_bias": "model-00012-of-00016.safetensors",
305
+ "backbone.layers.43.mixer.gate.weight": "model-00012-of-00016.safetensors",
306
+ "backbone.layers.43.mixer.shared_experts.down_proj.weight": "model-00012-of-00016.safetensors",
307
+ "backbone.layers.43.mixer.shared_experts.up_proj.weight": "model-00012-of-00016.safetensors",
308
+ "backbone.layers.43.norm.weight": "model-00012-of-00016.safetensors",
309
+ "backbone.layers.44.mixer.A_log": "model-00012-of-00016.safetensors",
310
+ "backbone.layers.44.mixer.D": "model-00012-of-00016.safetensors",
311
+ "backbone.layers.44.mixer.conv1d.bias": "model-00012-of-00016.safetensors",
312
+ "backbone.layers.44.mixer.conv1d.weight": "model-00012-of-00016.safetensors",
313
+ "backbone.layers.44.mixer.dt_bias": "model-00012-of-00016.safetensors",
314
+ "backbone.layers.44.mixer.in_proj.weight": "model-00012-of-00016.safetensors",
315
+ "backbone.layers.44.mixer.norm.weight": "model-00012-of-00016.safetensors",
316
+ "backbone.layers.44.mixer.out_proj.weight": "model-00012-of-00016.safetensors",
317
+ "backbone.layers.44.norm.weight": "model-00012-of-00016.safetensors",
318
+ "backbone.layers.45.mixer.experts.down_proj.weight": "model-00012-of-00016.safetensors",
319
+ "backbone.layers.45.mixer.experts.up_proj.weight": "model-00012-of-00016.safetensors",
320
+ "backbone.layers.45.mixer.gate.e_score_correction_bias": "model-00012-of-00016.safetensors",
321
+ "backbone.layers.45.mixer.gate.weight": "model-00012-of-00016.safetensors",
322
+ "backbone.layers.45.mixer.shared_experts.down_proj.weight": "model-00012-of-00016.safetensors",
323
+ "backbone.layers.45.mixer.shared_experts.up_proj.weight": "model-00012-of-00016.safetensors",
324
+ "backbone.layers.45.norm.weight": "model-00012-of-00016.safetensors",
325
+ "backbone.layers.46.mixer.A_log": "model-00012-of-00016.safetensors",
326
+ "backbone.layers.46.mixer.D": "model-00012-of-00016.safetensors",
327
+ "backbone.layers.46.mixer.conv1d.bias": "model-00012-of-00016.safetensors",
328
+ "backbone.layers.46.mixer.conv1d.weight": "model-00012-of-00016.safetensors",
329
+ "backbone.layers.46.mixer.dt_bias": "model-00012-of-00016.safetensors",
330
+ "backbone.layers.46.mixer.in_proj.weight": "model-00012-of-00016.safetensors",
331
+ "backbone.layers.46.mixer.norm.weight": "model-00012-of-00016.safetensors",
332
+ "backbone.layers.46.mixer.out_proj.weight": "model-00012-of-00016.safetensors",
333
+ "backbone.layers.46.norm.weight": "model-00012-of-00016.safetensors",
334
+ "backbone.layers.47.mixer.experts.down_proj.weight": "model-00013-of-00016.safetensors",
335
+ "backbone.layers.47.mixer.experts.up_proj.weight": "model-00013-of-00016.safetensors",
336
+ "backbone.layers.47.mixer.gate.e_score_correction_bias": "model-00013-of-00016.safetensors",
337
+ "backbone.layers.47.mixer.gate.weight": "model-00013-of-00016.safetensors",
338
+ "backbone.layers.47.mixer.shared_experts.down_proj.weight": "model-00013-of-00016.safetensors",
339
+ "backbone.layers.47.mixer.shared_experts.up_proj.weight": "model-00013-of-00016.safetensors",
340
+ "backbone.layers.47.norm.weight": "model-00013-of-00016.safetensors",
341
+ "backbone.layers.48.mixer.A_log": "model-00013-of-00016.safetensors",
342
+ "backbone.layers.48.mixer.D": "model-00013-of-00016.safetensors",
343
+ "backbone.layers.48.mixer.conv1d.bias": "model-00013-of-00016.safetensors",
344
+ "backbone.layers.48.mixer.conv1d.weight": "model-00013-of-00016.safetensors",
345
+ "backbone.layers.48.mixer.dt_bias": "model-00013-of-00016.safetensors",
346
+ "backbone.layers.48.mixer.in_proj.weight": "model-00013-of-00016.safetensors",
347
+ "backbone.layers.48.mixer.norm.weight": "model-00013-of-00016.safetensors",
348
+ "backbone.layers.48.mixer.out_proj.weight": "model-00013-of-00016.safetensors",
349
+ "backbone.layers.48.norm.weight": "model-00013-of-00016.safetensors",
350
+ "backbone.layers.49.mixer.experts.down_proj.weight": "model-00013-of-00016.safetensors",
351
+ "backbone.layers.49.mixer.experts.up_proj.weight": "model-00014-of-00016.safetensors",
352
+ "backbone.layers.49.mixer.gate.e_score_correction_bias": "model-00014-of-00016.safetensors",
353
+ "backbone.layers.49.mixer.gate.weight": "model-00014-of-00016.safetensors",
354
+ "backbone.layers.49.mixer.shared_experts.down_proj.weight": "model-00014-of-00016.safetensors",
355
+ "backbone.layers.49.mixer.shared_experts.up_proj.weight": "model-00014-of-00016.safetensors",
356
+ "backbone.layers.49.norm.weight": "model-00014-of-00016.safetensors",
357
+ "backbone.layers.5.mixer.k_proj.weight": "model-00014-of-00016.safetensors",
358
+ "backbone.layers.5.mixer.o_proj.weight": "model-00014-of-00016.safetensors",
359
+ "backbone.layers.5.mixer.q_proj.weight": "model-00014-of-00016.safetensors",
360
+ "backbone.layers.5.mixer.s_proj.weight": "model-00014-of-00016.safetensors",
361
+ "backbone.layers.5.mixer.v_proj.weight": "model-00014-of-00016.safetensors",
362
+ "backbone.layers.5.norm.weight": "model-00014-of-00016.safetensors",
363
+ "backbone.layers.50.mixer.A_log": "model-00014-of-00016.safetensors",
364
+ "backbone.layers.50.mixer.D": "model-00014-of-00016.safetensors",
365
+ "backbone.layers.50.mixer.conv1d.bias": "model-00014-of-00016.safetensors",
366
+ "backbone.layers.50.mixer.conv1d.weight": "model-00014-of-00016.safetensors",
367
+ "backbone.layers.50.mixer.dt_bias": "model-00014-of-00016.safetensors",
368
+ "backbone.layers.50.mixer.in_proj.weight": "model-00014-of-00016.safetensors",
369
+ "backbone.layers.50.mixer.norm.weight": "model-00014-of-00016.safetensors",
370
+ "backbone.layers.50.mixer.out_proj.weight": "model-00014-of-00016.safetensors",
371
+ "backbone.layers.50.norm.weight": "model-00014-of-00016.safetensors",
372
+ "backbone.layers.51.mixer.experts.down_proj.weight": "model-00014-of-00016.safetensors",
373
+ "backbone.layers.51.mixer.experts.up_proj.weight": "model-00014-of-00016.safetensors",
374
+ "backbone.layers.51.mixer.gate.e_score_correction_bias": "model-00014-of-00016.safetensors",
375
+ "backbone.layers.51.mixer.gate.weight": "model-00014-of-00016.safetensors",
376
+ "backbone.layers.51.mixer.shared_experts.down_proj.weight": "model-00014-of-00016.safetensors",
377
+ "backbone.layers.51.mixer.shared_experts.up_proj.weight": "model-00014-of-00016.safetensors",
378
+ "backbone.layers.51.norm.weight": "model-00014-of-00016.safetensors",
379
+ "backbone.layers.6.mixer.experts.down_proj.weight": "model-00015-of-00016.safetensors",
380
+ "backbone.layers.6.mixer.experts.up_proj.weight": "model-00015-of-00016.safetensors",
381
+ "backbone.layers.6.mixer.gate.e_score_correction_bias": "model-00015-of-00016.safetensors",
382
+ "backbone.layers.6.mixer.gate.weight": "model-00015-of-00016.safetensors",
383
+ "backbone.layers.6.mixer.shared_experts.down_proj.weight": "model-00015-of-00016.safetensors",
384
+ "backbone.layers.6.mixer.shared_experts.up_proj.weight": "model-00015-of-00016.safetensors",
385
+ "backbone.layers.6.norm.weight": "model-00015-of-00016.safetensors",
386
+ "backbone.layers.7.mixer.A_log": "model-00015-of-00016.safetensors",
387
+ "backbone.layers.7.mixer.D": "model-00015-of-00016.safetensors",
388
+ "backbone.layers.7.mixer.conv1d.bias": "model-00015-of-00016.safetensors",
389
+ "backbone.layers.7.mixer.conv1d.weight": "model-00015-of-00016.safetensors",
390
+ "backbone.layers.7.mixer.dt_bias": "model-00015-of-00016.safetensors",
391
+ "backbone.layers.7.mixer.in_proj.weight": "model-00015-of-00016.safetensors",
392
+ "backbone.layers.7.mixer.norm.weight": "model-00015-of-00016.safetensors",
393
+ "backbone.layers.7.mixer.out_proj.weight": "model-00015-of-00016.safetensors",
394
+ "backbone.layers.7.norm.weight": "model-00015-of-00016.safetensors",
395
+ "backbone.layers.8.mixer.experts.down_proj.weight": "model-00015-of-00016.safetensors",
396
+ "backbone.layers.8.mixer.experts.up_proj.weight": "model-00016-of-00016.safetensors",
397
+ "backbone.layers.8.mixer.gate.e_score_correction_bias": "model-00016-of-00016.safetensors",
398
+ "backbone.layers.8.mixer.gate.weight": "model-00016-of-00016.safetensors",
399
+ "backbone.layers.8.mixer.shared_experts.down_proj.weight": "model-00016-of-00016.safetensors",
400
+ "backbone.layers.8.mixer.shared_experts.up_proj.weight": "model-00016-of-00016.safetensors",
401
+ "backbone.layers.8.norm.weight": "model-00016-of-00016.safetensors",
402
+ "backbone.layers.9.mixer.A_log": "model-00016-of-00016.safetensors",
403
+ "backbone.layers.9.mixer.D": "model-00016-of-00016.safetensors",
404
+ "backbone.layers.9.mixer.conv1d.bias": "model-00016-of-00016.safetensors",
405
+ "backbone.layers.9.mixer.conv1d.weight": "model-00016-of-00016.safetensors",
406
+ "backbone.layers.9.mixer.dt_bias": "model-00016-of-00016.safetensors",
407
+ "backbone.layers.9.mixer.in_proj.weight": "model-00016-of-00016.safetensors",
408
+ "backbone.layers.9.mixer.norm.weight": "model-00016-of-00016.safetensors",
409
+ "backbone.layers.9.mixer.out_proj.weight": "model-00016-of-00016.safetensors",
410
+ "backbone.layers.9.norm.weight": "model-00016-of-00016.safetensors",
411
+ "backbone.norm_f.weight": "model-00016-of-00016.safetensors",
412
+ "lm_head.weight": "model-00016-of-00016.safetensors"
413
+ }
414
+ }
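For context, the mapping above is the `weight_map` of a sharded safetensors index (`model.safetensors.index.json`): each parameter name points at the shard file that stores it, so a loader only opens the shards it needs. A minimal, hypothetical sketch (not code from this repository) of resolving a parameter to its shard:

```python
import json

# Tiny excerpt of a weight_map like the one above, inlined for illustration.
index_json = """{
  "weight_map": {
    "backbone.norm_f.weight": "model-00016-of-00016.safetensors",
    "lm_head.weight": "model-00016-of-00016.safetensors"
  }
}"""


def shard_for(param_name: str, weight_map: dict) -> str:
    """Return the shard filename that stores `param_name`."""
    if param_name not in weight_map:
        raise KeyError(f"{param_name!r} is not listed in the index")
    return weight_map[param_name]


weight_map = json.loads(index_json)["weight_map"]
print(shard_for("lm_head.weight", weight_map))  # model-00016-of-00016.safetensors
```

A real loader would group the requested parameters by shard and memory-map each shard file once, rather than resolving names one at a time.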
modeling_superlinear_exp.py ADDED
The diff for this file is too large to render. See raw diff
 
moe.py ADDED
@@ -0,0 +1,890 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import math
5
+ import os
6
+ from typing import Any
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ try: # pragma: no cover
12
+ import triton
13
+ import triton.language as tl
14
+ except Exception: # pragma: no cover
15
+ triton = None
16
+ tl = None
17
+
18
+ # Eagerly import vllm._moe_C to ensure fused MoE kernels are available
19
+ # This must happen before shared_fused_moe_is_available() is called
20
+ try: # pragma: no cover
21
+ import vllm._moe_C # noqa: F401
22
+ except Exception: # pragma: no cover
23
+ pass
24
+
25
+
26
+ def _cdiv(a: int, b: int) -> int:
27
+ return (a + b - 1) // b
28
+
29
+
30
+ def _round_up(a: int, b: int) -> int:
31
+ return _cdiv(a, b) * b
32
+
+
+ @functools.lru_cache(maxsize=1)
+ def _ensure_moe_kernels_loaded() -> bool:
+     if hasattr(torch.ops, "_moe_C"):
+         return True
+     try:  # pragma: no cover
+         # vLLM installs the MoE kernels under this module name.
+         import vllm._moe_C  # noqa: F401
+         return hasattr(torch.ops, "_moe_C")
+     except Exception:
+         return False
+
+
+ @functools.lru_cache(maxsize=1)
+ def shared_fused_moe_is_available() -> bool:
+     if triton is None or tl is None:
+         return False
+     if not _ensure_moe_kernels_loaded():
+         return False
+     return all(
+         hasattr(torch.ops._moe_C, attr)
+         for attr in ("moe_align_block_size", "moe_sum")
+     )
+
+
+ def _moe_align_block_size(
+     topk_ids: torch.Tensor,
+     block_size: int,
+     num_experts: int,
+     *,
+     pad_sorted_ids: bool = False,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """Compute (sorted_token_ids, expert_ids, num_tokens_post_padded) using vLLM's `_moe_C` kernels."""
+     if not shared_fused_moe_is_available():
+         raise RuntimeError(
+             "Shared fused MoE is not available (missing triton and/or vllm._moe_C)."
+         )
+     if topk_ids.dtype != torch.int32:
+         topk_ids = topk_ids.to(torch.int32)
+
+     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+     if pad_sorted_ids:
+         max_num_tokens_padded = _round_up(max_num_tokens_padded, block_size)
+     if topk_ids.numel() < num_experts:
+         max_num_tokens_padded = min(topk_ids.numel() * block_size, max_num_tokens_padded)
+
+     sorted_token_ids = torch.empty(
+         (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+     )
+     max_num_m_blocks = _cdiv(max_num_tokens_padded, block_size)
+     expert_ids = torch.empty(
+         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+     )
+     num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device=topk_ids.device)
+
+     torch.ops._moe_C.moe_align_block_size(
+         topk_ids,
+         num_experts,
+         block_size,
+         sorted_token_ids,
+         expert_ids,
+         num_tokens_post_pad,
+         None,  # maybe_expert_map (added in newer vllm versions)
+     )
+     return sorted_token_ids, expert_ids, num_tokens_post_pad
+
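The layout `_moe_align_block_size` produces can be modelled in plain Python: flattened (token, k) slots are grouped by expert, and each expert's group is padded to a multiple of `block_size` with a sentinel index equal to the total slot count (padded slots are later masked out by `token_mask` in the GEMM kernel). This is a toy model of the assumed semantics, not the vLLM CUDA implementation:

```python
def align_block_size(topk_ids, block_size, num_experts):
    """Toy model: group flattened (token, k) slot indices by expert and pad
    each group to a multiple of block_size with the sentinel len(flat)."""
    flat = [(e, t) for t, row in enumerate(topk_ids) for e in row]
    sentinel = len(flat)  # out-of-range index; masked out downstream
    sorted_ids, expert_ids = [], []
    for e in range(num_experts):
        slots = [idx for idx, (ee, _) in enumerate(flat) if ee == e]
        if not slots:
            continue
        padded = slots + [sentinel] * (-len(slots) % block_size)
        sorted_ids.extend(padded)
        # One expert id per M-block of the padded layout.
        expert_ids.extend([e] * (len(padded) // block_size))
    return sorted_ids, expert_ids, len(sorted_ids)

# Token 0 routes to experts 0 and 1; token 1 routes to expert 0 twice.
print(align_block_size([[0, 1], [0, 0]], 4, 2))
# ([0, 2, 3, 4, 1, 4, 4, 4], [0, 1], 8)
```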
+
+ def _get_default_config(
+     M: int,
+     E: int,
+     N: int,
+     K: int,
+     topk: int,
+ ) -> dict[str, int]:
+     # Heuristic default configs adapted from vLLM.
+     if M <= E:
+         return {
+             "BLOCK_SIZE_M": 16,
+             "BLOCK_SIZE_N": 32,
+             "BLOCK_SIZE_K": 64,
+             "GROUP_SIZE_M": 1,
+         }
+     return {
+         "BLOCK_SIZE_M": 64,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 32,
+         "GROUP_SIZE_M": 8,
+     }
+
+
+ if triton is not None and tl is not None:
+
+     @triton.jit
+     def _write_zeros_to_output(
+         c_ptr,
+         stride_cm,
+         stride_cn,
+         pid_n,
+         N,
+         offs_token,
+         token_mask,
+         BLOCK_SIZE_M: tl.constexpr,
+         BLOCK_SIZE_N: tl.constexpr,
+         compute_type: tl.constexpr,
+     ):
+         accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type)
+         offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+         c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+         c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+         tl.store(c_ptrs, accumulator, mask=c_mask)
+
+     @triton.jit
+     def _fused_moe_kernel(
+         # Pointers to matrices
+         a_ptr,
+         b_ptr,
+         c_ptr,
+         topk_weights_ptr,
+         sorted_token_ids_ptr,
+         expert_ids_ptr,
+         num_tokens_post_padded_ptr,
+         # Matrix dimensions
+         N,
+         K,
+         EM,
+         num_valid_tokens,
+         # Strides
+         stride_am,
+         stride_ak,
+         stride_be,
+         stride_bk,
+         stride_bn,
+         stride_cm,
+         stride_cn,
+         # Meta-parameters
+         BLOCK_SIZE_M: tl.constexpr,
+         BLOCK_SIZE_N: tl.constexpr,
+         BLOCK_SIZE_K: tl.constexpr,
+         GROUP_SIZE_M: tl.constexpr,
+         MUL_ROUTED_WEIGHT: tl.constexpr,
+         top_k: tl.constexpr,
+         compute_type: tl.constexpr,
+     ):
+         # Grouped ordering to promote L2 data reuse.
+         pid = tl.program_id(axis=0)
+         num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
+         num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+         num_pid_in_group = GROUP_SIZE_M * num_pid_n
+         group_id = pid // num_pid_in_group
+         first_pid_m = group_id * GROUP_SIZE_M
+         group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+         pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+         pid_n = (pid % num_pid_in_group) // group_size_m
+
+         num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
+         if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
+             return
+
+         offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+         offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
+         token_mask = offs_token < num_valid_tokens
+
+         off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
+         if off_experts == -1:
+             _write_zeros_to_output(
+                 c_ptr,
+                 stride_cm,
+                 stride_cn,
+                 pid_n,
+                 N,
+                 offs_token,
+                 token_mask,
+                 BLOCK_SIZE_M,
+                 BLOCK_SIZE_N,
+                 compute_type,
+             )
+             return
+
+         offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+         offs_k = tl.arange(0, BLOCK_SIZE_K)
+         a_ptrs = a_ptr + (
+             offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+         )
+         b_ptrs = (
+             b_ptr
+             + off_experts * stride_be
+             + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+         )
+
+         accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+         for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+             a = tl.load(
+                 a_ptrs,
+                 mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+                 other=0.0,
+             )
+             b = tl.load(
+                 b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0
+             )
+             # Disable TF32 for numerical precision (matches PyTorch's default behavior).
+             accumulator += tl.dot(a, b, allow_tf32=False)
+             a_ptrs += BLOCK_SIZE_K * stride_ak
+             b_ptrs += BLOCK_SIZE_K * stride_bk
+
+         if MUL_ROUTED_WEIGHT:
+             moe_weight = tl.load(
+                 topk_weights_ptr + offs_token, mask=token_mask, other=0
+             )
+             accumulator = accumulator * moe_weight[:, None]
+         accumulator = accumulator.to(compute_type)
+
+         offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+         c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+         c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+         tl.store(c_ptrs, accumulator, mask=c_mask)
+
+ else:  # pragma: no cover
+
+     def _fused_moe_kernel(*args, **kwargs):  # type: ignore[no-redef]
+         raise RuntimeError("Triton is not available; cannot use fused MoE.")
+
+
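The grouped program-id mapping at the top of `_fused_moe_kernel` (the `GROUP_SIZE_M` arithmetic) can be reproduced in plain Python to see the launch order it induces: consecutive `pid`s sweep down a group of `GROUP_SIZE_M` M-blocks before moving across N, which keeps the same B tiles hot in L2. This sketch only mirrors the index arithmetic, not the GEMM:

```python
def grouped_order(num_pid_m, num_pid_n, group_size_m):
    # Mirrors the pid -> (pid_m, pid_n) mapping used in _fused_moe_kernel.
    order = []
    num_pid_in_group = group_size_m * num_pid_n
    for pid in range(num_pid_m * num_pid_n):
        group_id = pid // num_pid_in_group
        first_pid_m = group_id * group_size_m
        size_m = min(num_pid_m - first_pid_m, group_size_m)
        pid_m = first_pid_m + ((pid % num_pid_in_group) % size_m)
        pid_n = (pid % num_pid_in_group) // size_m
        order.append((pid_m, pid_n))
    return order

# With 4 M-blocks, 2 N-blocks, groups of 2: the first group visits
# M-blocks {0, 1} for every N-block before touching M-blocks {2, 3}.
print(grouped_order(4, 2, 2))
# [(0, 0), (1, 0), (0, 1), (1, 1), (2, 0), (3, 0), (2, 1), (3, 1)]
```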
+ def _invoke_fused_moe_kernel(
+     *,
+     A: torch.Tensor,
+     B: torch.Tensor,
+     C: torch.Tensor,
+     topk_weights: torch.Tensor | None,
+     sorted_token_ids: torch.Tensor,
+     expert_ids: torch.Tensor,
+     num_tokens_post_padded: torch.Tensor,
+     mul_routed_weight: bool,
+     top_k: int,
+     config: dict[str, Any],
+     compute_type: Any,
+ ) -> None:
+     assert triton is not None and tl is not None
+     assert topk_weights is not None or not mul_routed_weight
+     assert sorted_token_ids.stride(0) == 1
+
+     M = A.size(0)
+     num_tokens = M * top_k
+     EM = sorted_token_ids.size(0)
+     grid = lambda META: (
+         triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]),
+     )
+
+     config = dict(config)
+     _fused_moe_kernel[grid](
+         A,
+         B,
+         C,
+         topk_weights,
+         sorted_token_ids,
+         expert_ids,
+         num_tokens_post_padded,
+         B.size(1),
+         B.size(2),
+         EM,
+         num_tokens,
+         A.stride(0),
+         A.stride(1),
+         B.stride(0),
+         B.stride(2),
+         B.stride(1),
+         C.stride(1),
+         C.stride(2),
+         MUL_ROUTED_WEIGHT=mul_routed_weight,
+         top_k=top_k,
+         compute_type=compute_type,
+         **config,
+     )
+
+
+ def fused_experts_moe(
+     hidden_states: torch.Tensor,
+     w1: torch.Tensor,
+     w2: torch.Tensor,
+     topk_weights: torch.Tensor,
+     topk_ids: torch.Tensor,
+     *,
+     activation: str,
+     inplace: bool = False,
+     apply_router_weight_on_input: bool = False,
+ ) -> torch.Tensor:
+     """
+     Fused MoE expert compute (2-layer MLP) using Triton grouped GEMMs + vLLM's `_moe_C` align/sum kernels.
+
+     This function is intentionally minimal: it supports *non-gated* activations
+     (`*_no_mul`), which is the SuperlinearExp MoE case here.
+     """
+     if torch.is_grad_enabled() and any(
+         t.requires_grad for t in (hidden_states, w1, w2, topk_weights)
+     ):
+         return _FusedExpertsMoE.apply(
+             hidden_states,
+             w1,
+             w2,
+             topk_weights,
+             topk_ids,
+             activation,
+             apply_router_weight_on_input,
+         )
+     if hidden_states.numel() == 0:
+         return hidden_states
+     if hidden_states.dim() != 2:
+         raise ValueError(f"Expected [tokens, hidden], got {tuple(hidden_states.shape)}")
+     return _fused_experts_moe_forward(
+         hidden_states,
+         w1,
+         w2,
+         topk_weights,
+         topk_ids,
+         activation=activation,
+         inplace=inplace,
+         apply_router_weight_on_input=apply_router_weight_on_input,
+     )
+
+
+ def _activation_forward(x: torch.Tensor, activation: str) -> torch.Tensor:
+     if activation == "relu2_no_mul":
+         return torch.square(F.relu(x))
+     if activation == "silu_no_mul":
+         return F.silu(x)
+     if activation == "gelu_no_mul":
+         return F.gelu(x)
+     raise ValueError(f"Unsupported fused MoE activation: {activation}")
+
+
+ def _activation_backward(x_fp32: torch.Tensor, activation: str) -> torch.Tensor:
+     if activation == "relu2_no_mul":
+         return (x_fp32 > 0).to(x_fp32.dtype) * (2.0 * x_fp32)
+     if activation == "silu_no_mul":
+         sig = torch.sigmoid(x_fp32)
+         return sig * (1.0 + x_fp32 * (1.0 - sig))
+     if activation == "gelu_no_mul":
+         inv_sqrt2 = 1.0 / math.sqrt(2.0)
+         inv_sqrt2pi = 1.0 / math.sqrt(2.0 * math.pi)
+         cdf = 0.5 * (1.0 + torch.erf(x_fp32 * inv_sqrt2))
+         pdf = torch.exp(-0.5 * x_fp32 * x_fp32) * inv_sqrt2pi
+         return cdf + x_fp32 * pdf
+     raise ValueError(f"Unsupported fused MoE activation: {activation}")
+
+
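The closed-form derivatives in `_activation_backward` can be spot-checked against a central finite difference. For the SiLU case, d/dx [x·σ(x)] = σ(x)·(1 + x·(1 − σ(x))); the same check works scalar-by-scalar in plain Python:

```python
import math

def silu(x):
    # SiLU / swish: x * sigmoid(x).
    return x / (1.0 + math.exp(-x))

def silu_grad(x):
    # Closed form matching _activation_backward("silu_no_mul").
    sig = 1.0 / (1.0 + math.exp(-x))
    return sig * (1.0 + x * (1.0 - sig))

h = 1e-6
for x in (-2.0, -0.5, 0.0, 0.7, 3.0):
    fd = (silu(x + h) - silu(x - h)) / (2 * h)  # central difference
    assert abs(fd - silu_grad(x)) < 1e-6
print("silu derivative matches finite difference")
```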
+ def _eager_experts_moe_forward(
+     hidden_states: torch.Tensor,
+     w1: torch.Tensor,
+     w2: torch.Tensor,
+     topk_weights: torch.Tensor,
+     topk_ids: torch.Tensor,
+     *,
+     activation: str,
+     apply_router_weight_on_input: bool,
+ ) -> torch.Tensor:
+     if hidden_states.numel() == 0:
+         return hidden_states
+     if hidden_states.dim() != 2:
+         raise ValueError(f"Expected [tokens, hidden], got {tuple(hidden_states.shape)}")
+
+     num_tokens, hidden_size = hidden_states.shape
+     num_experts, intermediate_size, hidden_size_w1 = w1.shape
+     num_experts_w2, hidden_size_w2, intermediate_size_w2 = w2.shape
+     if hidden_size_w1 != hidden_size:
+         raise ValueError(f"Hidden size mismatch: {hidden_size} != {hidden_size_w1} (w1 in_features)")
+     if num_experts_w2 != num_experts:
+         raise ValueError(f"Expert count mismatch: {num_experts} != {num_experts_w2} (w2)")
+     if hidden_size_w2 != hidden_size:
+         raise ValueError(f"Hidden size mismatch: {hidden_size} != {hidden_size_w2} (w2 out_features)")
+     if intermediate_size_w2 != intermediate_size:
+         raise ValueError(f"Intermediate size mismatch: {intermediate_size} != {intermediate_size_w2} (w2 in_features)")
+     if topk_ids.shape != topk_weights.shape:
+         raise ValueError("topk_ids/topk_weights shape mismatch")
+
+     topk = topk_ids.size(1)
+     out = torch.zeros((num_tokens, hidden_size), device=hidden_states.device, dtype=torch.float32)
+
+     CHUNK_SIZE = int(os.getenv("FUSED_MOE_CHUNK_SIZE", str(16 * 1024)))
+     for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+         begin = chunk * CHUNK_SIZE
+         end = min((chunk + 1) * CHUNK_SIZE, num_tokens)
+         x = hidden_states[begin:end]
+         if x.numel() == 0:
+             break
+
+         m = x.size(0)
+         topk_ids_chunk = topk_ids[begin:end].reshape(-1).to(torch.long)
+         topk_weights_chunk = topk_weights[begin:end].reshape(-1).to(torch.float32)
+
+         token_ids = (
+             torch.arange(m, device=x.device, dtype=torch.long)
+             .repeat_interleave(topk)
+         )
+         k_ids = torch.arange(topk, device=x.device, dtype=torch.long).repeat(m)
+
+         sort_order = torch.argsort(topk_ids_chunk)
+         expert_ids_sorted = topk_ids_chunk[sort_order]
+         token_ids_sorted = token_ids[sort_order]
+         k_ids_sorted = k_ids[sort_order]
+         weights_sorted = topk_weights_chunk[sort_order]
+
+         unique_experts, counts = torch.unique_consecutive(expert_ids_sorted, return_counts=True)
+         out_chunk = torch.zeros((m, hidden_size), device=x.device, dtype=torch.float32)
+
+         offset = 0
+         for expert_idx, count in zip(unique_experts.tolist(), counts.tolist()):
+             tokens = token_ids_sorted[offset : offset + count]
+             weights = weights_sorted[offset : offset + count]
+             offset += count
+             if count == 0:
+                 continue
+
+             x_e = x.index_select(0, tokens)
+             u0 = F.linear(x_e, w1[expert_idx])
+             if apply_router_weight_on_input:
+                 u = u0 * weights.to(u0.dtype).unsqueeze(-1)
+             else:
+                 u = u0
+             a = _activation_forward(u, activation)
+             v = F.linear(a, w2[expert_idx])
+             if not apply_router_weight_on_input:
+                 v = v * weights.to(v.dtype).unsqueeze(-1)
+             out_chunk.index_add_(0, tokens, v.to(torch.float32))
+
+         out[begin:end] = out_chunk
+
+     return out.to(hidden_states.dtype)
+
+
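The `range(num_tokens // CHUNK_SIZE + 1)` loop used here covers every token exactly once, including a ragged final chunk, with the `if x.numel() == 0: break` guard handling the case where `num_tokens` is an exact multiple of the chunk size. A stand-alone model of just the chunk boundaries:

```python
def chunk_bounds(num_tokens, chunk_size):
    # Reproduces the chunking pattern of the MoE forward/backward loops.
    bounds = []
    for chunk in range(num_tokens // chunk_size + 1):
        begin = chunk * chunk_size
        end = min((chunk + 1) * chunk_size, num_tokens)
        if begin >= end:  # empty trailing chunk when evenly divisible
            break
        bounds.append((begin, end))
    return bounds

print(chunk_bounds(10, 4))  # [(0, 4), (4, 8), (8, 10)]
print(chunk_bounds(8, 4))   # [(0, 4), (4, 8)]
```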
+ def _fused_experts_moe_forward(
+     hidden_states: torch.Tensor,
+     w1: torch.Tensor,
+     w2: torch.Tensor,
+     topk_weights: torch.Tensor,
+     topk_ids: torch.Tensor,
+     *,
+     activation: str,
+     inplace: bool = False,
+     apply_router_weight_on_input: bool = False,
+ ) -> torch.Tensor:
+     if not shared_fused_moe_is_available():
+         return _eager_experts_moe_forward(
+             hidden_states,
+             w1,
+             w2,
+             topk_weights,
+             topk_ids,
+             activation=activation,
+             apply_router_weight_on_input=apply_router_weight_on_input,
+         )
+     if not hidden_states.is_cuda:
+         return _eager_experts_moe_forward(
+             hidden_states,
+             w1,
+             w2,
+             topk_weights,
+             topk_ids,
+             activation=activation,
+             apply_router_weight_on_input=apply_router_weight_on_input,
+         )
+
+     # Constraints similar to vLLM's fused kernels.
+     if not hidden_states.is_contiguous():
+         hidden_states = hidden_states.contiguous()
+     if w1.stride(-1) != 1 or w2.stride(-1) != 1:
+         raise ValueError("Expert weights must be contiguous in the last dimension.")
+
+     # Shapes.
+     num_tokens = hidden_states.size(0)
+     num_experts, n1, k1 = w1.size()
+     _, k2, n2 = w2.size()
+     if hidden_states.size(1) != k1:
+         raise ValueError(
+             f"Hidden size mismatch: {hidden_states.size(1)} != {k1} (w1 in_features)"
+         )
+     if n2 != n1:
+         raise ValueError(f"Intermediate size mismatch: {n2} != {n1}")
+     if topk_ids.shape != topk_weights.shape:
+         raise ValueError("topk_ids/topk_weights shape mismatch")
+
+     topk = topk_ids.size(1)
+     CHUNK_SIZE = int(os.getenv("FUSED_MOE_CHUNK_SIZE", str(16 * 1024)))
+     M = min(num_tokens, CHUNK_SIZE)
+
+     config = _get_default_config(M=M, E=num_experts, N=n1, K=k1, topk=topk)
+
+     if hidden_states.dtype == torch.bfloat16:
+         compute_type = tl.bfloat16
+     elif hidden_states.dtype == torch.float16:
+         compute_type = tl.float16
+     elif hidden_states.dtype == torch.float32:
+         compute_type = tl.float32
+     else:
+         raise ValueError(f"Unsupported dtype: {hidden_states.dtype}")
+
+     # Accumulate in float32 for numerical precision (matches eager path behavior).
+     # The output will be converted back to the original dtype at the end.
+     original_dtype = hidden_states.dtype
+     out = torch.zeros(
+         (num_tokens, hidden_states.size(1)),
+         device=hidden_states.device,
+         dtype=torch.float32,
+     )
+
+     # Cache buffers sized to the largest chunk.
+     # IMPORTANT: up_out and down_out must NOT overlap in memory!
+     # The down projection kernel reads from up_out while writing to down_out.
+     # If they share memory, the kernel will corrupt its input as it writes output.
+     up_out = torch.empty(
+         (M, topk, n1), device=hidden_states.device, dtype=hidden_states.dtype
+     )
+     down_out = torch.empty(
+         (M, topk, k2), device=hidden_states.device, dtype=hidden_states.dtype
+     )
+
+     for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+         begin = chunk * CHUNK_SIZE
+         end = min((chunk + 1) * CHUNK_SIZE, num_tokens)
+         curr_hidden = hidden_states[begin:end]
+         tokens_in_chunk = curr_hidden.size(0)
+         if tokens_in_chunk == 0:
+             break
+
+         if tokens_in_chunk != M:
+             up_out = up_out[:tokens_in_chunk]
+             down_out = down_out[:tokens_in_chunk]
+             config = _get_default_config(M=tokens_in_chunk, E=num_experts, N=n1, K=k1, topk=topk)
+
+         curr_topk_ids = topk_ids[begin:end].to(torch.int32).contiguous()
+         curr_topk_weights = topk_weights[begin:end].to(torch.float32).contiguous()
+
+         sorted_token_ids, expert_ids, num_tokens_post_padded = _moe_align_block_size(
+             curr_topk_ids,
+             config["BLOCK_SIZE_M"],
+             num_experts,
+         )
+
+         # 1) Up projection: [tokens, hidden] -> [tokens * topk, intermediate]
+         _invoke_fused_moe_kernel(
+             A=curr_hidden,
+             B=w1,
+             C=up_out,
+             topk_weights=curr_topk_weights if apply_router_weight_on_input else None,
+             sorted_token_ids=sorted_token_ids,
+             expert_ids=expert_ids,
+             num_tokens_post_padded=num_tokens_post_padded,
+             mul_routed_weight=apply_router_weight_on_input,
+             top_k=topk,
+             config=config,
+             compute_type=compute_type,
+         )
+
+         # 2) Activation (in-place on up_out to avoid allocating an extra buffer).
+         if activation == "relu2_no_mul":
+             x = up_out.view(-1, n1)
+             x.relu_()
+             x.square_()
+         elif activation == "silu_no_mul":
+             x = up_out.view(-1, n1)
+             x.copy_(F.silu(x))
+         elif activation == "gelu_no_mul":
+             x = up_out.view(-1, n1)
+             x.copy_(F.gelu(x))
+         else:
+             raise ValueError(f"Unsupported fused MoE activation: {activation}")
+
+         # 3) Down projection: [tokens * topk, intermediate] -> [tokens * topk, hidden]
+         _invoke_fused_moe_kernel(
+             A=up_out.view(-1, n1),
+             B=w2,
+             C=down_out,
+             topk_weights=None if apply_router_weight_on_input else curr_topk_weights,
+             sorted_token_ids=sorted_token_ids,
+             expert_ids=expert_ids,
+             num_tokens_post_padded=num_tokens_post_padded,
+             mul_routed_weight=not apply_router_weight_on_input,
+             top_k=1,
+             config=config,
+             compute_type=compute_type,
+         )
+
+         # Convert down_out to float32 to match out's dtype for moe_sum.
+         torch.ops._moe_C.moe_sum(
+             down_out.to(torch.float32),
+             out[begin:end],
+         )
+
+     # Convert back to original dtype after accumulation in float32.
+     return out.to(original_dtype)
+
+
+ class _FusedExpertsMoE(torch.autograd.Function):
+     @staticmethod
+     def forward(
+         ctx,
+         hidden_states: torch.Tensor,
+         w1: torch.Tensor,
+         w2: torch.Tensor,
+         topk_weights: torch.Tensor,
+         topk_ids: torch.Tensor,
+         activation: str,
+         apply_router_weight_on_input: bool,
+     ) -> torch.Tensor:
+         ctx.activation = activation
+         ctx.apply_router_weight_on_input = apply_router_weight_on_input
+         ctx.save_for_backward(hidden_states, w1, w2, topk_weights, topk_ids)
+         return _fused_experts_moe_forward(
+             hidden_states,
+             w1,
+             w2,
+             topk_weights,
+             topk_ids,
+             activation=activation,
+             inplace=False,
+             apply_router_weight_on_input=apply_router_weight_on_input,
+         )
+
+     @staticmethod
+     def backward(ctx, grad_out: torch.Tensor):
+         (
+             hidden_states,
+             w1,
+             w2,
+             topk_weights,
+             topk_ids,
+         ) = ctx.saved_tensors
+         activation: str = ctx.activation
+         apply_router_weight_on_input: bool = ctx.apply_router_weight_on_input
+
+         need_hidden, need_w1, need_w2, need_topk_w = ctx.needs_input_grad[:4]
+
+         grad_hidden = torch.zeros_like(hidden_states) if need_hidden else None
+         grad_w1 = torch.zeros_like(w1) if need_w1 else None
+         grad_w2 = torch.zeros_like(w2) if need_w2 else None
+         grad_topk_weights = torch.zeros_like(topk_weights) if need_topk_w else None
+
+         if hidden_states.numel() == 0:
+             return grad_hidden, grad_w1, grad_w2, grad_topk_weights, None, None, None
+
+         num_tokens = hidden_states.size(0)
+         topk = topk_ids.size(1)
+         num_experts = w1.size(0)
+
+         CHUNK_SIZE = int(os.getenv("FUSED_MOE_CHUNK_SIZE", str(16 * 1024)))
+         max_padded_tokens_per_expert = int(
+             os.getenv("FUSED_MOE_BACKWARD_MAX_PADDED_TOKENS_PER_EXPERT", "2048")
+         )
+         for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+             begin = chunk * CHUNK_SIZE
+             end = min((chunk + 1) * CHUNK_SIZE, num_tokens)
+             x = hidden_states[begin:end]
+             if x.numel() == 0:
+                 break
+
+             m = x.size(0)
+             token_ids = (
+                 torch.arange(m, device=x.device, dtype=torch.long)
+                 .repeat_interleave(topk)
+             )
+             k_ids = torch.arange(topk, device=x.device, dtype=torch.long).repeat(m)
+             expert_ids = topk_ids[begin:end].reshape(-1).to(torch.long)
+             weights_fp32 = topk_weights[begin:end].reshape(-1).to(torch.float32)
+
+             sort_order = torch.argsort(expert_ids)
+             expert_ids_sorted = expert_ids[sort_order]
+             token_ids_sorted = token_ids[sort_order]
+             k_ids_sorted = k_ids[sort_order]
+             weights_sorted = weights_fp32[sort_order]
+
+             counts_per_expert = torch.bincount(expert_ids_sorted, minlength=num_experts)
+             max_count = int(counts_per_expert.max().item())
+
+             use_vectorized = (
+                 max_count > 0 and max_count <= max_padded_tokens_per_expert
+             )
+
+             if use_vectorized:
+                 hidden_size = x.size(1)
+                 offsets = torch.cumsum(counts_per_expert, 0) - counts_per_expert
+                 pos_in_expert = torch.arange(
+                     expert_ids_sorted.numel(), device=x.device, dtype=torch.long
+                 ) - offsets[expert_ids_sorted]
+                 flat = expert_ids_sorted * max_count + pos_in_expert
+
+                 x_pad = torch.zeros(
+                     (num_experts, max_count, hidden_size),
+                     device=x.device,
+                     dtype=x.dtype,
+                 )
+                 x_pad.view(num_experts * max_count, hidden_size)[flat] = x[
+                     token_ids_sorted
+                 ]
+
+                 gy_pad = torch.zeros(
+                     (num_experts, max_count, hidden_size),
+                     device=x.device,
+                     dtype=torch.float32,
+                 )
+                 gy_pad.view(num_experts * max_count, hidden_size)[flat] = grad_out[
+                     begin:end
+                 ][token_ids_sorted].to(torch.float32)
+
+                 w_pad = torch.zeros(
+                     (num_experts, max_count),
+                     device=x.device,
+                     dtype=torch.float32,
+                 )
+                 w_pad.view(num_experts * max_count)[flat] = weights_sorted
+
+                 u0 = torch.einsum("emh,eih->emi", x_pad, w1)
+                 if apply_router_weight_on_input:
+                     u = u0 * w_pad.to(u0.dtype).unsqueeze(-1)
+                 else:
+                     u = u0
+                 a = _activation_forward(u, activation)
+
+                 tmp = torch.einsum("emh,ehi->emi", gy_pad.to(a.dtype), w2)
+                 tmp_fp32 = tmp.to(torch.float32)
+
+                 if need_w2:
+                     if apply_router_weight_on_input:
+                         grad_v = gy_pad.to(a.dtype)
+                     else:
+                         grad_v = (gy_pad * w_pad.unsqueeze(-1)).to(a.dtype)
+                     grad_w2_chunk = torch.einsum("emh,emi->ehi", grad_v, a)
+                     assert grad_w2 is not None
+                     grad_w2.add_(grad_w2_chunk.to(grad_w2.dtype))
+
+                 gA_fp32 = tmp_fp32
+                 if not apply_router_weight_on_input:
+                     gA_fp32 = gA_fp32 * w_pad.unsqueeze(-1)
+
+                 du_fp32 = _activation_backward(u.to(torch.float32), activation)
+                 gU_fp32 = gA_fp32 * du_fp32
+
+                 if apply_router_weight_on_input:
+                     if need_topk_w:
+                         grad_w_fp32 = torch.sum(
+                             gU_fp32 * u0.to(torch.float32),
+                             dim=-1,
+                         )
+                     gU0_fp32 = gU_fp32 * w_pad.unsqueeze(-1)
+                 else:
+                     if need_topk_w:
+                         grad_w_fp32 = torch.sum(
+                             a.to(torch.float32) * tmp_fp32,
+                             dim=-1,
+                         )
+                     gU0_fp32 = gU_fp32
+
+                 gU0 = gU0_fp32.to(x.dtype)
+
+                 if need_w1:
+                     grad_w1_chunk = torch.einsum("emi,emh->eih", gU0, x_pad)
+                     assert grad_w1 is not None
+                     grad_w1.add_(grad_w1_chunk.to(grad_w1.dtype))
+
+                 if need_hidden:
+                     grad_x_pad = torch.einsum("emi,eih->emh", gU0, w1)
+                     grad_x_assign = grad_x_pad.view(
+                         num_experts * max_count, hidden_size
+                     )[flat]
+                     grad_x_chunk = torch.zeros(
+                         (m, hidden_size), device=x.device, dtype=x.dtype
+                     )
+                     grad_x_chunk.index_add_(0, token_ids_sorted, grad_x_assign)
+                     assert grad_hidden is not None
+                     grad_hidden[begin:end].copy_(grad_x_chunk)
+
+                 if need_topk_w:
+                     grad_w_assign = grad_w_fp32.view(num_experts * max_count)[flat]
+                     grad_topk_chunk = torch.zeros(
+                         (m, topk), device=x.device, dtype=topk_weights.dtype
+                     )
+                     grad_topk_chunk[token_ids_sorted, k_ids_sorted] = grad_w_assign.to(
+                         grad_topk_chunk.dtype
+                     )
+                     assert grad_topk_weights is not None
+                     grad_topk_weights[begin:end].copy_(grad_topk_chunk)
+
+                 continue
+
+             unique_experts, counts = torch.unique_consecutive(
+                 expert_ids_sorted, return_counts=True
+             )
+
+             grad_x_chunk = torch.zeros((m, x.size(1)), device=x.device, dtype=x.dtype) if need_hidden else None
+             grad_topk_chunk = torch.zeros((m, topk), device=x.device, dtype=topk_weights.dtype) if need_topk_w else None
+
+             offset = 0
+             for expert_idx, count in zip(unique_experts.tolist(), counts.tolist()):
+                 tokens = token_ids_sorted[offset : offset + count]
+                 ks = k_ids_sorted[offset : offset + count]
+                 w = weights_sorted[offset : offset + count]
+                 offset += count
+                 if count == 0:
+                     continue
+
+                 x_e = x.index_select(0, tokens)
+                 w1_e = w1[expert_idx]
+                 w2_e = w2[expert_idx]
+
+                 u0 = F.linear(x_e, w1_e)
+                 if apply_router_weight_on_input:
+                     u = u0 * w.to(u0.dtype).unsqueeze(-1)
+                 else:
+                     u = u0
+                 a = _activation_forward(u, activation)
+
+                 grad_y_fp32 = grad_out[begin:end].index_select(0, tokens).to(torch.float32)
+                 if apply_router_weight_on_input:
+                     grad_v_fp32 = grad_y_fp32
+                 else:
+                     grad_v_fp32 = grad_y_fp32 * w.unsqueeze(-1)
+
+                 grad_v = grad_v_fp32.to(a.dtype)
+
+                 if need_w2:
+                     grad_w2_e = torch.matmul(grad_v.transpose(0, 1), a)
+                     assert grad_w2 is not None
+                     grad_w2[expert_idx].add_(grad_w2_e.to(grad_w2.dtype))
+
+                 gA = torch.matmul(grad_v, w2_e)
+                 du_fp32 = _activation_backward(u.to(torch.float32), activation)
+                 gU_fp32 = gA.to(torch.float32) * du_fp32
+
+                 if apply_router_weight_on_input:
+                     grad_w_fp32 = torch.sum(gU_fp32 * u0.to(torch.float32), dim=-1)
+                     gU0_fp32 = gU_fp32 * w.unsqueeze(-1)
+                 else:
+                     gy_for_w = grad_y_fp32.to(a.dtype)
+                     tmp = torch.matmul(gy_for_w, w2_e).to(torch.float32)
+                     grad_w_fp32 = torch.sum(a.to(torch.float32) * tmp, dim=-1)
+                     gU0_fp32 = gU_fp32
+
+                 if need_topk_w:
+                     assert grad_topk_chunk is not None
+                     grad_topk_chunk[tokens, ks] = grad_w_fp32.to(grad_topk_chunk.dtype)
+
+                 gU0 = gU0_fp32.to(x_e.dtype)
+
+                 if need_w1:
+                     grad_w1_e = torch.matmul(gU0.transpose(0, 1), x_e)
+                     assert grad_w1 is not None
+                     grad_w1[expert_idx].add_(grad_w1_e.to(grad_w1.dtype))
+
+                 if need_hidden:
+                     assert grad_x_chunk is not None
+                     grad_x_e = torch.matmul(gU0, w1_e)
+                     grad_x_chunk.index_add_(0, tokens, grad_x_e)
+
+             if need_hidden:
+                 assert grad_hidden is not None and grad_x_chunk is not None
+                 grad_hidden[begin:end].copy_(grad_x_chunk)
+             if need_topk_w:
+                 assert grad_topk_weights is not None and grad_topk_chunk is not None
+                 grad_topk_weights[begin:end].copy_(grad_topk_chunk)
+
+         return grad_hidden, grad_w1, grad_w2, grad_topk_weights, None, None, None
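The vectorized backward path scatters each sorted (token, k) assignment into a dense `[num_experts, max_count]` padded layout via `flat = expert * max_count + pos_in_expert`, where `pos_in_expert` is the assignment's rank within its expert's contiguous run. The index arithmetic can be sketched without torch (a model of the assumed layout, not the tensor code itself):

```python
def padded_flat_indices(expert_ids_sorted, num_experts, max_count):
    # expert_ids_sorted must be sorted by expert (as after torch.argsort).
    # Each assignment lands at expert * max_count + its rank within that expert.
    counts = [0] * num_experts
    flat = []
    for e in expert_ids_sorted:
        flat.append(e * max_count + counts[e])
        counts[e] += 1
    return flat

# Experts 0 and 2 receive 2 and 3 assignments; expert 1 gets only padding rows.
print(padded_flat_indices([0, 0, 2, 2, 2], num_experts=3, max_count=3))
# [0, 1, 6, 7, 8]
```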
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:623c34567aebb18582765289fbe23d901c62704d6518d71866e0e58db892b5b7
+ size 17077484
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff