ulmentflam committed on
Commit 13a30fe · verified · 1 Parent(s): a0ce55b

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,72 @@
+ QWERKY AI DISTILLED MODEL LICENSE AGREEMENT
+
+ This model is a distilled version created by QWERKY AI, Inc. and is subject to dual attribution requirements.
+
+ ================================================================================
+ ATTRIBUTION REQUIREMENTS
+ ================================================================================
+
+ This model is:
+ 1. Derived from Meta's Llama 3.1 model and subject to the Llama 3.1 Community License Agreement
+ 2. Distilled and optimized by QWERKY AI, Inc.
+
+ When using or redistributing this model, you must provide attribution to BOTH:
+ - Meta Platforms, Inc. for the original Llama 3.1 model
+ - QWERKY AI, Inc. for the distillation and optimization
+
+ Suggested attribution format:
+ "This model is based on Meta's Llama 3.1, distilled and optimized by QWERKY AI, Inc."
+
+ ================================================================================
+ ORIGINAL LLAMA 3.1 LICENSE TERMS
+ ================================================================================
+
+ This model inherits all terms and conditions of the Llama 3.1 Community License Agreement dated July 23, 2024, including but not limited to:
+
+ 1. USAGE RESTRICTIONS: If you have more than 700 million monthly active users, you must request a license from Meta.
+
+ 2. PROHIBITED USES: You may not use this model to:
+ - Violate laws or regulations
+ - Engage in harmful, abusive, or discriminatory activities
+ - Generate misinformation or harmful content
+
+ 3. DISTRIBUTION: Any redistribution must include:
+ - This complete license
+ - Attribution to both Meta and QWERKY AI
+ - The same use restrictions
+
+ The full Llama 3.1 Community License Agreement is incorporated by reference and included in the LLAMA_3.1_LICENSE.txt file in this repository. It is also available at: https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/LICENSE
+
+ ================================================================================
+ QWERKY AI ADDITIONAL TERMS
+ ================================================================================
+
+ In addition to the Llama 3.1 license terms, users must:
+
+ 1. ATTRIBUTION: Include clear attribution to QWERKY AI, Inc. in any:
+ - Academic papers or research
+ - Commercial products or services
+ - Public demonstrations or benchmarks
+ - Derivative works or fine-tuned versions
+
+ 2. QWERKY BRANDING: Do not imply endorsement by QWERKY AI without written permission.
+
+ 3. PERFORMANCE CLAIMS: When citing performance metrics, clearly indicate:
+ - That this is a distilled version
+ - That any benchmarks are specific to this distilled model
+ - That QWERKY AI's optimization techniques were applied
+
+ ================================================================================
+ WARRANTY DISCLAIMER
+ ================================================================================
+
+ THIS MODEL IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED.
+ NEITHER META PLATFORMS, INC. NOR QWERKY AI, INC. MAKES ANY WARRANTIES REGARDING
+ THE MODEL'S PERFORMANCE, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE.
+
+ ================================================================================
+
+ By using this model, you agree to all terms above.
+
+ Copyright (c) Meta Platforms, Inc. (Original Llama 3.1 Model)
+ Copyright (c) QWERKY AI, Inc. (Distillation and Optimization)
LLAMA_3.1_LICENSE.txt ADDED
@@ -0,0 +1,48 @@
+ LLAMA 3.1 COMMUNITY LICENSE AGREEMENT
+
+ Llama 3.1 Version Release Date: July 23, 2024
+
+ “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
+
+ “Documentation” means the specifications, manuals and documentation accompanying Llama 3.1 distributed by Meta at https://llama.meta.com/doc/overview.
+
+ “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+
+ “Llama 3.1” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads.
+
+ “Llama Materials” means, collectively, Meta’s proprietary Llama 3.1 and Documentation (and any portion thereof) made available under this Agreement.
+
+ “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+
+ By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
+
+ 1. License Rights and Redistribution.
+
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
+
+ b. Redistribution and Use.
+
+ i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service (including another AI model) that contains any of them, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Llama” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials or any outputs or results of the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama” at the beginning of any such AI model name.
+
+ ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
+
+ iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Llama 3.1 is licensed under the Llama 3.1 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
+
+ iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3_1/use-policy), which is hereby incorporated by reference into this Agreement.
+
+ 2. Additional Commercial Terms. If, on the Llama 3.1 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
+
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
+
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+
+ 5. Intellectual Property.
+
+ a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/). All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
+
+ b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+
+ c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Llama 3.1 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
+
+ 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
+
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
README.md ADDED
@@ -0,0 +1,61 @@
+ ---
+ license: other
+ tags:
+ - qwerky
+ - mamba
+ - hybrid
+ - causal-lm
+ - text-generation
+ language:
+ - en
+ library_name: transformers
+ pipeline_tag: text-generation
+ ---
+
+ # QwerkyLlamaMambaHybrid
+
+ A hybrid Mamba-Transformer causal language model from Qwerky AI, distilled from Meta's Llama 3.1 (see the LICENSE file for attribution requirements).
+
+ ## Requirements
+
+ - CUDA-compatible GPU
+ - Python 3.10+ (the custom modeling code uses `int | None` annotations)
+ - PyTorch 2.0+
+ - transformers, safetensors, mamba-ssm, causal-conv1d, flash-attn
+
+ ## Installation
+
+ ```bash
+ pip install transformers torch safetensors
+ pip install flash-attn mamba-ssm causal-conv1d --no-build-isolation
+ ```
+
+ ## Usage
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ tokenizer = AutoTokenizer.from_pretrained("QwerkyAI/Qwick-8B-Instruct")
+ model = AutoModelForCausalLM.from_pretrained(
+     "QwerkyAI/Qwick-8B-Instruct",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True,  # required: the architecture is defined in this repo
+ )
+
+ inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
+ outputs = model.generate(**inputs, max_new_tokens=50)
+ print(tokenizer.decode(outputs[0]))
+ ```
+
+ ## Model Files
+
+ - `config.json` - Model configuration with `auto_map`
+ - `modeling_qwerky_llama_mamba_hybrid.py` - Custom modeling class
+ - `configuration_qwerky_llama_mamba_hybrid.py` - Custom configuration class
+ - `model-0000*-of-00004.safetensors` / `model.safetensors.index.json` - Sharded model weights
+
+ ## License
+
+ See the LICENSE file for details.
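
For chat-formatted prompts, the bundled `chat_template.jinja` (added below) can be applied through the tokenizer. A minimal sketch continuing from the README's usage snippet, assuming the template is picked up by `AutoTokenizer` as usual:

```python
messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant header so the model answers
    return_tensors="pt",
).to("cuda")
outputs = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```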
chat_template.jinja ADDED
@@ -0,0 +1,5 @@
+ {% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+
+ '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
+
+ ' }}{% endif %}
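
Rendered, this template wraps each turn in Llama 3.1-style headers and prepends `bos_token` to the first message. A quick way to inspect the exact prompt string, sketched here by rendering the file with `jinja2` directly (the BOS string is an assumption taken from the Llama 3.1 tokenizer):

```python
from jinja2 import Template

template = Template(open("chat_template.jinja").read())
prompt = template.render(
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    bos_token="<|begin_of_text|>",  # assumption: Llama 3.1 BOS token
    add_generation_prompt=True,
)
print(prompt)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Hello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```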
config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "architectures": [
+     "QwerkyLlamaMambaHybridForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "dtype": "bfloat16",
+   "eos_token_id": [
+     128001,
+     128008,
+     128009
+   ],
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "mamba_version": "Mamba1",
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "qwerky_llama_mamba_hybrid",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "factor": 8.0,
+     "high_freq_factor": 4.0,
+     "low_freq_factor": 1.0,
+     "original_max_position_embeddings": 8192,
+     "rope_type": "llama3"
+   },
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.57.6",
+   "use_cache": true,
+   "vocab_size": 128256,
+   "auto_map": {
+     "AutoConfig": "configuration_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridConfig",
+     "AutoModelForCausalLM": "modeling_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridForCausalLM"
+   },
+   "d_model": 4096,
+   "d_inner": 4096,
+   "d_xb": 1024,
+   "ssm_cfg": {
+     "expand": 1
+   },
+   "attn_layers": [
+     3,
+     8,
+     13,
+     18,
+     23,
+     27
+   ]
+ }
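
The `attn_layers` field is what makes this a hybrid: the six listed indices get full attention blocks, and every other layer gets a Mamba block, which is why the weight map further down alternates between `mha.*` and `mamba.*` parameter names. A minimal sketch of that mapping, using the values from this `config.json`:

```python
# Layer indices from config.json; every other layer is a Mamba block.
attn_layers = {3, 8, 13, 18, 23, 27}
num_hidden_layers = 32

for i in range(num_hidden_layers):
    kind = "mha" if i in attn_layers else "mamba"
    print(f"model.layers.{i}.{kind}")
```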
configuration_qwerky_llama_mamba_hybrid.py ADDED
@@ -0,0 +1,142 @@
+ # Copyright (c) 2025, Qwerky AI, Inc. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """QwerkyLlamaMambaHybrid model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class QwerkyLlamaMambaHybridConfig(PretrainedConfig):
+     r"""
+     Configuration class for the QwerkyLlamaMambaHybrid model. Consolidates the transformer and Mamba configs.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the model.
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the model.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer.
+         num_key_value_heads (`int`, *optional*):
+             Number of key-value heads for grouped-query attention. Defaults to `num_attention_heads`.
+         hidden_act (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the MLP layers.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation for weight initialization.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the RMS normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether to return the last key/value attentions.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie input and output word embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`dict`, *optional*):
+             RoPE scaling configuration.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             Dropout ratio for attention probabilities.
+         d_model (`int`, *optional*):
+             Model dimension for Mamba layers. Defaults to `hidden_size`.
+         d_inner (`int`, *optional*):
+             Inner dimension for Mamba layers. Defaults to `intermediate_size`.
+         d_xb (`int`, *optional*, defaults to 2560):
+             Dimension of the Mamba xB projection.
+         ssm_cfg (`dict`, *optional*, defaults to `{}`):
+             State space model configuration dictionary.
+         attn_layers (`List[int]`, *optional*, defaults to `[]`):
+             List of layer indices that use attention instead of Mamba.
+     """
+
+     model_type = "qwerky_llama_mamba_hybrid"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size: int = 32000,
+         hidden_size: int = 4096,
+         intermediate_size: int = 11008,
+         num_hidden_layers: int = 32,
+         num_attention_heads: int = 32,
+         num_key_value_heads: int | None = None,
+         hidden_act: str = "silu",
+         max_position_embeddings: int = 2048,
+         initializer_range: float = 0.02,
+         rms_norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         pad_token_id: int = 0,
+         bos_token_id: int = 1,
+         eos_token_id: int = 2,
+         tie_word_embeddings: bool = False,
+         rope_theta: float = 10000.0,
+         rope_scaling: dict | None = None,
+         attention_dropout: float = 0.0,
+         d_model: int | None = None,
+         d_inner: int | None = None,
+         d_xb: int = 2560,
+         ssm_cfg: dict | None = None,
+         attn_layers: list[int] | None = None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = (
+             num_key_value_heads
+             if num_key_value_heads is not None
+             else num_attention_heads
+         )
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_dropout = attention_dropout
+
+         # Mamba-specific parameters
+         self.d_model = d_model if d_model is not None else hidden_size
+         self.d_inner = d_inner if d_inner is not None else intermediate_size
+         self.d_xb = d_xb
+         self.ssm_cfg = ssm_cfg if ssm_cfg is not None else {}
+         self.attn_layers = attn_layers if attn_layers is not None else []
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+         # Set auto_map so AutoConfig/AutoModelForCausalLM resolve the custom classes
+         if "auto_map" not in kwargs:
+             self.auto_map = {
+                 "AutoConfig": "configuration_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridConfig",
+                 "AutoModelForCausalLM": "modeling_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridForCausalLM",
+             }
+
+         # Set architectures field
+         if "architectures" not in kwargs:
+             self.architectures = ["QwerkyLlamaMambaHybridForCausalLM"]
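
When the checkpoint is loaded from the Hub, `AutoConfig` resolves the `auto_map` entry above to this class. A small sketch, assuming the repo id from the README:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "QwerkyAI/Qwick-8B-Instruct",
    trust_remote_code=True,  # needed to import the custom config class
)
print(config.model_type)   # qwerky_llama_mamba_hybrid
print(config.attn_layers)  # [3, 8, 13, 18, 23, 27]
```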
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0289497a76fd7c60c1d67ac6ad5f15292a6bed639b3e40eaae580f578dfe45aa
+ size 4889922520
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27ea066e23b4f1a490acc597035c267088ecbea615a66f0cf8358f68b8785a8b
+ size 4900631584
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:365b2ecd3837b7a5dda64f03f879f1e7fce707f9e8c35878ea47cb3bf19ed125
+ size 4905031752
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01d67026c28c25d7303baa326f7fd19f8a881ed0380152bfdd51c5a2f93bfcdf
+ size 2351345248
model.safetensors.index.json ADDED
@@ -0,0 +1,390 @@
+ {
+   "metadata": {
+     "total_size": 17046888448
+   },
+   "weight_map": {
+     "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mha.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mha.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mha.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mha.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mha.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mha.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mha.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mha.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mha.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mha.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mha.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mha.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.A_log": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.D": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.conv1d.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.conv1d.bias": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.in_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.dt_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.dt_proj.bias": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.out_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.A_log": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.D": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.conv1d.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.conv1d.bias": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.in_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.dt_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.dt_proj.bias": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.out_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.norm.weight": "model-00004-of-00004.safetensors",
+     "lm_head.weight": "model-00004-of-00004.safetensors"
+   }
+ }
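
The index maps every tensor to one of the four shards above; `metadata.total_size` counts tensor bytes only, so it need not equal the sum of the shard file sizes (each shard also carries its own JSON header). A small sanity-check sketch for a local download:

```python
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Count how many tensors live in each shard file.
for shard, n in sorted(Counter(index["weight_map"].values()).items()):
    print(f"{shard}: {n} tensors")
print("declared tensor bytes:", index["metadata"]["total_size"])
```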
modeling_qwerky_llama_mamba_hybrid.py ADDED
@@ -0,0 +1,768 @@
+ # Copyright (c) 2026, Qwerky AI Inc. All rights reserved.
+ #
+ # Licensed under the Qwerky Distilled Model License Agreement (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # See the LICENSE file in this repository
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """PyTorch QwerkyLlamaMambaHybrid model for inference."""
+
+ import math
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange, repeat
+ from mamba_ssm.modules.mha import MHA
+ from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+ from mamba_ssm.ops.triton.layer_norm import RMSNorm
+ from mamba_ssm.utils.generation import GenerationMixin as MambaGenerationMixin
+ from torch.nn import CrossEntropyLoss
+ from transformers.activations import ACT2FN
+ from transformers.modeling_outputs import CausalLMOutput
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.utils import logging
+
+ try:
+     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+ except ImportError:
+     causal_conv1d_fn, causal_conv1d_update = None, None
+
+ try:
+     from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+ except ImportError:
+     selective_state_update = None
+
+ from .configuration_qwerky_llama_mamba_hybrid import QwerkyLlamaMambaHybridConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """Repeat KV heads to match number of attention heads."""
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+     hidden_states = hidden_states[:, :, None, :, :].expand(
+         batch, num_key_value_heads, n_rep, slen, head_dim
+     )
+     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+ class Mamba(nn.Module):
+     """Mamba SSM layer implementation."""
+
+     def __init__(
+         self,
+         d_model,
+         d_inner,
+         d_xb,
+         d_state=16,
+         d_conv=4,
+         expand=2,
+         dt_rank="auto",
+         dt_min=0.001,
+         dt_max=0.1,
+         dt_init="random",
+         dt_scale=1.0,
+         dt_init_floor=1e-4,
+         repeat_kv_before_conv=True,
+         conv_bias=True,
+         out_proj_bias=False,
+         use_fast_path=True,
+         layer_idx=None,
+         device=None,
+         dtype=None,
+         **kwargs,
+     ):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.d_model = d_model
+         self.d_xb = d_xb
+         self.d_state = d_state
+         self.d_conv = d_conv
+         self.expand = expand
+         self.d_inner = (
+             d_inner if d_inner is not None else int(self.expand * self.d_model)
+         )
+         self.dt_rank: int = (
+             math.ceil(self.d_model / 16) if dt_rank == "auto" else int(dt_rank)
+         )
+         self.use_fast_path = use_fast_path
+         self.layer_idx = layer_idx
+         self.repeat_kv_before_conv = repeat_kv_before_conv
+
+         conv_dim = self.d_inner if self.repeat_kv_before_conv else self.d_xb
+         self.conv1d = nn.Conv1d(
+             in_channels=conv_dim,
+             out_channels=conv_dim,
+             bias=conv_bias,
+             kernel_size=d_conv,
+             groups=conv_dim,
+             padding=d_conv - 1,
+             **factory_kwargs,
+         )
+
+         self.activation = "silu"
+         self.act = nn.SiLU()
+
+         self.num_xb_head = self.d_xb // self.d_state
+         self.num_C_head = self.d_inner // self.d_state
+         self.repeat_group = self.num_C_head // self.num_xb_head
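+         # GQA-style head sharing: x and B are produced in the narrower d_xb space
+         # (num_xb_head heads of size d_state) and repeated repeat_group times to
+         # match the num_C_head heads of the d_inner-wide z/C streams.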
+
+         self.in_proj = nn.Linear(
+             self.d_model,
+             2 * self.d_xb + 2 * self.d_inner + self.dt_rank,
+             bias=False,
+             **factory_kwargs,
+         )
+         self.dt_proj = nn.Linear(
+             self.dt_rank, self.d_inner, bias=True, **factory_kwargs
+         )
+
+         # Initialize dt projection
+         dt_init_std = self.dt_rank**-0.5 * dt_scale
+         if dt_init == "constant":
+             nn.init.constant_(self.dt_proj.weight, dt_init_std)
+         elif dt_init == "random":
+             nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
+
+         # Initialize dt bias
+         dt = torch.exp(
+             torch.rand(self.d_inner, **factory_kwargs)
+             * (math.log(dt_max) - math.log(dt_min))
+             + math.log(dt_min)
+         ).clamp(min=dt_init_floor)
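+         # Invert softplus so that softplus(dt_proj.bias) reproduces dt at init
+         # (the selective scan applies softplus to dt + delta_bias).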
+         inv_dt = dt + torch.log(-torch.expm1(-dt))
+         with torch.no_grad():
+             self.dt_proj.bias.copy_(inv_dt)
+         self.dt_proj.bias._no_reinit = True  # type: ignore[attr-defined]
+
+         # S4D real initialization
+         A = repeat(
+             torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
+             "n -> d n",
+             d=self.d_inner,
+         ).contiguous()
+         A_log = torch.log(A)
+         self.A_log = nn.Parameter(A_log)
+         self.A_log._no_weight_decay = True  # type: ignore[attr-defined]
+
+         self.D = nn.Parameter(torch.ones(self.d_inner, device=device))
+         self.D._no_weight_decay = True  # type: ignore[attr-defined]
+
+         self.out_proj = nn.Linear(
+             self.d_inner, self.d_model, bias=out_proj_bias, **factory_kwargs
+         )
+
+     def forward(self, hidden_states, inference_params=None):
+         batch, seqlen, dim = hidden_states.shape
+
+         conv_state, ssm_state = None, None
+         if inference_params is not None:
+             conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
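+             # seqlen_offset > 0 means we are decoding one token at a time,
+             # so take the cached-state step path instead of the full scan.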
+             if inference_params.seqlen_offset > 0:
+                 out, _, _ = self.step(hidden_states, conv_state, ssm_state)
+                 return out
+
+         A = -torch.exp(self.A_log.float())
+
+         if not hidden_states.is_contiguous():
+             hidden_states = hidden_states.contiguous()
+
+         zxbcdt = self.in_proj(hidden_states)
+         z, x, B, C, dt = torch.split(
+             zxbcdt,
+             [self.d_inner, self.d_xb, self.d_xb, self.d_inner, self.dt_rank],
+             dim=-1,
+         )
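+         # Split layout: z is the SiLU gate (d_inner), x and B are the narrow
+         # d_xb streams, C is the readout (d_inner), dt is the low-rank timestep.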
+
+         x = rearrange(x, "b l d -> b d l")
+         z = rearrange(z, "b l d -> b d l")
+
+         B = rearrange(
+             B, "b l (n_group dstate) -> b n_group l dstate", dstate=self.d_state
+         )
+         B = repeat_kv(B, self.repeat_group)
+         B = rearrange(B, "b n_group l dstate -> b n_group dstate l").contiguous()
+         C = rearrange(
+             C, "b l (n_group dstate) -> b n_group dstate l", dstate=self.d_state
+         ).contiguous()
+
+         dt = self.dt_proj(dt)
+         dt = rearrange(dt, "b l d -> b d l")
+
+         if self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) l -> b n_group l dstate", dstate=self.d_state
+             )
+             x = repeat_kv(x, self.repeat_group)
+             x = rearrange(x, "b n_group l dstate -> b (n_group dstate) l")
+
+         # Conv state update
+         if conv_state is not None:
+             if x.shape[-1] >= self.d_conv:
+                 conv_state.copy_(x[:, :, -self.d_conv :])
+             else:
+                 conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0)))
+
+         if causal_conv1d_fn is None:
+             x = self.act(self.conv1d(x)[..., :seqlen])
+         else:
+             x = causal_conv1d_fn(
+                 x=x,
+                 weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
+                 bias=self.conv1d.bias,
+                 activation=self.activation,
+             )
+
+         if not self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) l -> b n_group l dstate", dstate=self.d_state
+             )
+             x = repeat_kv(x, self.repeat_group)
+             x = rearrange(x, "b n_group l dstate -> b (n_group dstate) l")
+
+         return_last_state = ssm_state is not None
+         y = selective_scan_fn(
+             x,
+             dt,
+             A,
+             B,
+             C,
+             self.D.float(),
+             z=z,
+             delta_bias=self.dt_proj.bias.float(),
+             delta_softplus=True,
+             return_last_state=return_last_state,
+         )
+         if return_last_state:
+             y, last_state = y
+             ssm_state.copy_(
+                 rearrange(last_state, "b (h d) n -> b h d n", h=self.num_C_head)
+             )
+
+         y = rearrange(y, "b d l -> b l d")
+         return self.out_proj(y)
+
+     def step(self, hidden_states, conv_state, ssm_state):
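+         """Single-token decode step; updates conv_state and ssm_state in place."""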
+         dtype = hidden_states.dtype
+         hidden_states_input = hidden_states.squeeze(1)
+         A = -torch.exp(self.A_log.float())
+
+         zxbcdt = self.in_proj(hidden_states_input)
+         z, x, B, C, dt = torch.split(
+             zxbcdt,
+             [self.d_inner, self.d_xb, self.d_xb, self.d_inner, self.dt_rank],
+             dim=-1,
+         )
+
+         B = rearrange(B, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state)
+         B = torch.repeat_interleave(B, dim=1, repeats=self.repeat_group)
+         C = rearrange(
+             C, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state
+         ).contiguous()
+         dt = self.dt_proj(dt)
+
+         if self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state
+             )
+             x = torch.repeat_interleave(x, dim=1, repeats=self.repeat_group)
+             x = rearrange(x, "b n_group dstate -> b (n_group dstate)")
+
+         if causal_conv1d_update is None:
+             conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1))
+             conv_state[:, :, -1] = x
+             x = torch.sum(
+                 conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1
+             )
+             if self.conv1d.bias is not None:
+                 x = x + self.conv1d.bias
+             x = self.act(x).to(dtype=dtype)
+         else:
+             x = causal_conv1d_update(
+                 x,
+                 conv_state,
+                 rearrange(self.conv1d.weight, "d 1 w -> d w"),
+                 self.conv1d.bias,
+                 self.activation,
+             )
+
+         if not self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state
+             )
+             x = torch.repeat_interleave(x, dim=1, repeats=self.repeat_group)
+             x = rearrange(x, "b n_group dstate -> b (n_group dstate)")
+
+         x = rearrange(x, "b (h d) -> b h d", h=self.num_C_head)
+         dt = rearrange(dt, "b (h d) -> b h d", h=self.num_C_head)
+         A = rearrange(A, "(h d) n -> h d n", h=self.num_C_head)
+         D = rearrange(self.D, "(h d) -> h d", h=self.num_C_head)
+         z = rearrange(z, "b (h d) -> b h d", h=self.num_C_head)
+         dt_bias = rearrange(self.dt_proj.bias, "(h d) -> h d", h=self.num_C_head)
+
+         if selective_state_update is None:
+             raise RuntimeError(
+                 "selective_state_update is not available. "
+                 "Please install mamba-ssm with CUDA support: "
+                 "pip install mamba-ssm causal-conv1d --no-build-isolation"
+             )
+         y = selective_state_update(
+             ssm_state, x, dt, A, B, C, D, z=z, dt_bias=dt_bias, dt_softplus=True
+         )
+         y = rearrange(y, "b h d -> b (h d)")
+         return self.out_proj(y).unsqueeze(1), conv_state, ssm_state
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         device = self.out_proj.weight.device
+         conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
+         conv_dim = self.d_inner if self.repeat_kv_before_conv else self.d_xb
+         conv_state = torch.zeros(
+             batch_size, conv_dim, self.d_conv, device=device, dtype=conv_dtype
+         )
+         ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
+         ssm_state = torch.zeros(
+             batch_size,
+             self.num_C_head,
+             self.d_inner // self.num_C_head,
+             self.d_state,
+             device=device,
+             dtype=ssm_dtype,
+         )
+         return conv_state, ssm_state
+
+     def _get_states_from_cache(
+         self, inference_params, batch_size, initialize_states=False
+     ):
+         if self.layer_idx not in inference_params.key_value_memory_dict:
+             conv_state, ssm_state = self.allocate_inference_cache(batch_size, 0)
+             inference_params.key_value_memory_dict[self.layer_idx] = (
+                 conv_state,
+                 ssm_state,
+             )
+         else:
+             conv_state, ssm_state = inference_params.key_value_memory_dict[
+                 self.layer_idx
+             ]
+             if initialize_states:
+                 conv_state.zero_()
+                 ssm_state.zero_()
+         return conv_state, ssm_state
+
+
+ class MLP(nn.Module):
+     def __init__(self, d_model, intermediate_size, hidden_act, device=None, dtype=None):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.gate_proj = nn.Linear(
+             d_model, intermediate_size, bias=False, **factory_kwargs
+         )
+         self.up_proj = nn.Linear(
+             d_model, intermediate_size, bias=False, **factory_kwargs
+         )
+         self.down_proj = nn.Linear(
+             intermediate_size, d_model, bias=False, **factory_kwargs
+         )
+         self.act_fn = ACT2FN[hidden_act]
+
+     def forward(self, x):
+         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+ class MHADecoderLayer(nn.Module):
+     def __init__(
+         self, config: QwerkyLlamaMambaHybridConfig, layer_idx: int, device=None, dtype=None
+     ):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.layer_idx = layer_idx
+         self.mha = MHA(
+             embed_dim=config.hidden_size,
+             num_heads=config.num_attention_heads,
+             num_heads_kv=config.num_key_value_heads,
+             layer_idx=layer_idx,
+             mlp_dim=0,
+             qkv_proj_bias=False,
+             out_proj_bias=False,
+             rotary_emb_dim=config.hidden_size // config.num_attention_heads,
+             rotary_emb_base=config.rope_theta,
+             causal=True,
+             device=device,
+             dtype=dtype,
+         )
+         self.mlp = MLP(
+             config.hidden_size,
+             config.intermediate_size,
+             config.hidden_act,
+             **factory_kwargs,
+         )
+         self.input_layernorm = RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps, **factory_kwargs
+         )
+         self.post_attention_layernorm = RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps, **factory_kwargs
+         )
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.mha.allocate_inference_cache(
+             batch_size, max_seqlen, dtype=dtype, **kwargs
+         )
+
+     def forward(self, hidden_states: torch.Tensor, inference_params=None, **kwargs):
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.mha(hidden_states, inference_params)
+         hidden_states = residual + hidden_states
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         return residual + hidden_states
+
+
+ class MambaDecoderLayer(nn.Module):
+     def __init__(
+         self, config: QwerkyLlamaMambaHybridConfig, layer_idx: int, device=None, dtype=None
+     ):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.layer_idx = layer_idx
+         self.mamba = Mamba(
+             d_model=config.d_model,
+             d_inner=config.d_inner,
+             d_xb=config.d_xb,
+             layer_idx=layer_idx,
+             **config.ssm_cfg,
+             **factory_kwargs,
+         )
+         self.mlp = MLP(
+             config.d_model,
+             config.intermediate_size,
+             config.hidden_act,
+             **factory_kwargs,
+         )
+         self.input_layernorm = RMSNorm(
+             config.d_model, eps=config.rms_norm_eps, **factory_kwargs
+         )
+         self.post_attention_layernorm = RMSNorm(
+             config.d_model, eps=config.rms_norm_eps, **factory_kwargs
+         )
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.mamba.allocate_inference_cache(
+             batch_size, max_seqlen, dtype=dtype, **kwargs
+         )
+
+     def forward(self, hidden_states: torch.Tensor, inference_params=None, **kwargs):
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.mamba(hidden_states, inference_params=inference_params)
+         hidden_states = residual + hidden_states
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         return residual + hidden_states
+
+
+ def merge_projections_for_layers(checkpoint, layer_indices):
+     """Merge q_proj, k_proj, v_proj into in_proj for attention layers."""
+     for layer_idx in layer_indices:
+         q_key = f"model.layers.{layer_idx}.self_attn.q_proj.weight"
+         k_key = f"model.layers.{layer_idx}.self_attn.k_proj.weight"
+         v_key = f"model.layers.{layer_idx}.self_attn.v_proj.weight"
+         o_key = f"model.layers.{layer_idx}.self_attn.o_proj.weight"
+
+         if all(k in checkpoint for k in [q_key, k_key, v_key]):
+             in_proj_weight = torch.cat(
+                 [checkpoint[q_key], checkpoint[k_key], checkpoint[v_key]], dim=0
+             )
+             checkpoint[f"model.layers.{layer_idx}.mha.in_proj.weight"] = in_proj_weight
+             del checkpoint[q_key], checkpoint[k_key], checkpoint[v_key]
+
+         if o_key in checkpoint:
+             checkpoint[f"model.layers.{layer_idx}.mha.out_proj.weight"] = checkpoint[
+                 o_key
+             ]
+             del checkpoint[o_key]
+     return checkpoint
+
+
+ class QwerkyLlamaMambaHybridPreTrainedModel(PreTrainedModel):
+     config_class = QwerkyLlamaMambaHybridConfig
+     base_model_prefix = "model"
+     supports_gradient_checkpointing = False
+     _no_split_modules = ["MambaDecoderLayer", "MHADecoderLayer"]
+     _supports_flash_attn_2 = True
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+ class QwerkyLlamaMambaHybridModel(QwerkyLlamaMambaHybridPreTrainedModel):
+     def __init__(self, config: QwerkyLlamaMambaHybridConfig, **kwargs):
+         super().__init__(config, **kwargs)
+         self.config = config
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.layers = nn.ModuleList(
+             [
+                 MHADecoderLayer(config, i)
+                 if i in config.attn_layers
+                 else MambaDecoderLayer(config, i)
+                 for i in range(config.num_hidden_layers)
+             ]
+         )
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self._register_load_state_dict_pre_hook(self.load_hook)
+         self.post_init()
+
+     def load_hook(self, state_dict, prefix, *args):
+         if self.config.attn_layers:
+             merge_projections_for_layers(state_dict, self.config.attn_layers)
+
+     def get_input_embeddings(self):
+         return self.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.embed_tokens = value
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         inference_params=None,
+         num_last_tokens: int = 0,
+         **kwargs,
+     ):
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("Cannot specify both input_ids and inputs_embeds")
+         if input_ids is None and inputs_embeds is None:
+             raise ValueError("Must specify either input_ids or inputs_embeds")
+
+         hidden_states = (
+             inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
+         )
+         if not hidden_states.is_contiguous():
+             hidden_states = hidden_states.contiguous()
+
+         for layer in self.layers:
+             hidden_states = layer(
+                 hidden_states, inference_params=inference_params, **kwargs
+             )
+
+         hidden_states = self.norm(hidden_states)
+         if num_last_tokens > 0:
+             hidden_states = hidden_states[:, -num_last_tokens:]
+         return hidden_states
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return {
+             i: layer.allocate_inference_cache(
+                 batch_size, max_seqlen, dtype=dtype, **kwargs
+             )
+             for i, layer in enumerate(self.layers)
+         }
+
+
+ class QwerkyLlamaMambaHybridForCausalLM(
+     MambaGenerationMixin, QwerkyLlamaMambaHybridPreTrainedModel
+ ):
+     _tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config: QwerkyLlamaMambaHybridConfig, **kwargs):
+         super().__init__(config, **kwargs)
+         self.model = QwerkyLlamaMambaHybridModel(config, **kwargs)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         if config.tie_word_embeddings:
+             self.lm_head.weight = self.model.embed_tokens.weight
+         self._cached_device = None
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.get_input_embeddings()
+
+     def set_input_embeddings(self, value):
+         self.model.set_input_embeddings(value)
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         labels: torch.LongTensor | None = None,
+         inference_params=None,
+         num_last_tokens: int = 0,
+         **kwargs,
+     ) -> tuple | CausalLMOutput:
+         # Optimize TTFT: Only compute last token logits during prefill
+         is_prefill = (
+             labels is None
+             and inference_params is not None
+             and getattr(inference_params, "seqlen_offset", 0) == 0
+             and num_last_tokens == 0
+         )
+         if is_prefill:
+             num_last_tokens = 1
+
+         hidden_states = self.model(
+             input_ids=input_ids,
+             inputs_embeds=inputs_embeds,
+             inference_params=inference_params,
+             num_last_tokens=num_last_tokens,
+             **kwargs,
+         )
+         logits = self.lm_head(hidden_states)
+
+         loss = None
+         if labels is not None:
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             loss = CrossEntropyLoss()(
+                 shift_logits.view(-1, self.config.vocab_size),
+                 shift_labels.view(-1).to(shift_logits.device),
+             )
+
+         return CausalLMOutput(loss=loss, logits=logits)
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.model.allocate_inference_cache(
+             batch_size, max_seqlen, dtype=dtype, **kwargs
+         )
+
+     def generate(
+         self,
+         input_ids,
+         max_length=1024,
+         top_k=50,
+         top_p=1.0,
+         min_p=0.0,
+         temperature=1.0,
+         repetition_penalty=1.1,
+         return_dict_in_generate=False,
+         output_scores=False,
+         **kwargs,
+     ):
+         """
+         Generate sequences using the model.
+
+         Accepts the common Transformers generation arguments, including:
+         - do_sample, temperature, top_k, top_p, min_p, repetition_penalty
+         - attention_mask, pad_token_id, eos_token_id
+         - max_new_tokens
+
+         Options this model does not support (e.g. num_beams, use_cache,
+         stopping_criteria) are silently dropped before delegating to
+         MambaGenerationMixin.
+         """
+         # Ensure input_ids is properly shaped (2D: batch_size, seq_len)
+         if input_ids.dim() == 1:
+             input_ids = input_ids.unsqueeze(0)
+
+         # Ensure input_ids is on the correct device and dtype for generation
+         device = next(self.parameters()).device
+         if input_ids.device != device:
+             input_ids = input_ids.to(device)
+         if input_ids.dtype != torch.long:
+             input_ids = input_ids.long()
+
+         max_new_tokens = kwargs.pop("max_new_tokens", None)
+         if max_new_tokens is not None:
+             max_length = max_new_tokens + input_ids.shape[1]
+
+         do_sample = kwargs.pop("do_sample", True)
+         if not do_sample:
+             top_k, top_p, min_p = 1, 0.0, 0.0
+
+         cg = kwargs.pop("cg", True)
+
+         eos_token_id = kwargs.pop("eos_token_id", self.config.eos_token_id)
+         if eos_token_id is not None:
+             if isinstance(eos_token_id, (list, tuple)):
+                 eos_token_id = torch.tensor(
+                     eos_token_id, dtype=torch.long, device=device
+                 )
+             else:
+                 eos_token_id = torch.tensor(
+                     [eos_token_id], dtype=torch.long, device=device
+                 )
+
+         attention_mask = kwargs.pop("attention_mask", None)
+         pad_token_id = kwargs.pop(
+             "pad_token_id", getattr(self.config, "pad_token_id", None)
+         )
+
+         # Handle attention_mask by filtering input_ids if provided.
+         # MambaGenerationMixin doesn't support attention_mask, so we filter instead.
+         if attention_mask is not None:
+             seq_lengths = attention_mask.sum(dim=1)
+             max_seq_len = seq_lengths.max().item()
+             min_seq_len = seq_lengths.min().item()
+             original_seq_len = input_ids.shape[1]
+
+             if min_seq_len == max_seq_len and max_seq_len <= original_seq_len:
+                 input_ids = input_ids[:, :max_seq_len].contiguous()
+             else:
+                 batch_size = input_ids.shape[0]
+                 dtype = input_ids.dtype
+                 pad_value = pad_token_id if pad_token_id is not None else 0
+
+                 input_ids_filtered = torch.full(
+                     (batch_size, max_seq_len), pad_value, dtype=dtype, device=device
+                 )
+
+                 copy_len = min(max_seq_len, original_seq_len)
+                 if copy_len > 0:
+                     valid_mask = torch.arange(copy_len, device=device).unsqueeze(
+                         0
+                     ) < seq_lengths.unsqueeze(1)
+                     input_ids_slice = input_ids[:, :copy_len].contiguous()
+                     input_ids_filtered_slice = input_ids_filtered[:, :copy_len]
+                     input_ids_filtered[:, :copy_len] = torch.where(
+                         valid_mask, input_ids_slice, input_ids_filtered_slice
+                     )
+
+                 input_ids = input_ids_filtered.contiguous()
+
+         repetition_penalty = kwargs.pop("repetition_penalty", repetition_penalty)
+
+         # Remove unsupported kwargs
+         for key in [
+             "use_cache",
+             "no_repeat_ngram_size",
+             "length_penalty",
+             "num_return_sequences",
+             "num_beams",
+             "low_memory",
+             "stopping_criteria",
+         ]:
+             kwargs.pop(key, None)
+
+         return super().generate(
+             input_ids=input_ids,
+             max_length=max_length,
+             cg=cg,
+             top_k=top_k,
+             top_p=top_p,
+             min_p=min_p,
+             temperature=temperature,
+             repetition_penalty=repetition_penalty,
+             return_dict_in_generate=return_dict_in_generate,
+             output_scores=output_scores,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
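
A minimal usage sketch, assuming this repository is checked out to a hypothetical local directory "./qwerky-hybrid" and that mamba-ssm and causal-conv1d are installed on a CUDA machine:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "./qwerky-hybrid"  # hypothetical local checkout of this repository
    tokenizer = AutoTokenizer.from_pretrained(path)
    # trust_remote_code=True loads modeling_qwerky_llama_mamba_hybrid.py above
    model = AutoModelForCausalLM.from_pretrained(
        path, trust_remote_code=True, torch_dtype=torch.bfloat16
    ).cuda()

    inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
    out = model.generate(inputs.input_ids, max_new_tokens=32, temperature=0.7)
    print(tokenizer.decode(out[0], skip_special_tokens=True))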
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<|begin_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|eot_id|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|eot_id|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+ size 17209920
tokenizer_config.json ADDED
@@ -0,0 +1,2064 @@
+ {
+   "added_tokens_decoder": {
+     "128000": {
+       "content": "<|begin_of_text|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128001": {
+       "content": "<|end_of_text|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128002": {
+       "content": "<|reserved_special_token_0|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128003": {
+       "content": "<|reserved_special_token_1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128004": {
+       "content": "<|finetune_right_pad_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128005": {
+       "content": "<|reserved_special_token_2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128006": {
+       "content": "<|start_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128007": {
+       "content": "<|end_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128008": {
+       "content": "<|eom_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128009": {
+       "content": "<|eot_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128010": {
+       "content": "<|python_tag|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128011": {
+       "content": "<|reserved_special_token_3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128012": {
+       "content": "<|reserved_special_token_4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128013": {
+       "content": "<|reserved_special_token_5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128014": {
+       "content": "<|reserved_special_token_6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128015": {
+       "content": "<|reserved_special_token_7|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128016": {
+       "content": "<|reserved_special_token_8|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128017": {
+       "content": "<|reserved_special_token_9|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128018": {
+       "content": "<|reserved_special_token_10|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128019": {
+       "content": "<|reserved_special_token_11|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128020": {
+       "content": "<|reserved_special_token_12|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128021": {
+       "content": "<|reserved_special_token_13|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128022": {
+       "content": "<|reserved_special_token_14|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128023": {
+       "content": "<|reserved_special_token_15|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128024": {
+       "content": "<|reserved_special_token_16|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128025": {
+       "content": "<|reserved_special_token_17|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128026": {
+       "content": "<|reserved_special_token_18|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128027": {
+       "content": "<|reserved_special_token_19|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128028": {
+       "content": "<|reserved_special_token_20|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128029": {
+       "content": "<|reserved_special_token_21|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128030": {
+       "content": "<|reserved_special_token_22|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128031": {
+       "content": "<|reserved_special_token_23|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128032": {
+       "content": "<|reserved_special_token_24|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128033": {
+       "content": "<|reserved_special_token_25|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128034": {
+       "content": "<|reserved_special_token_26|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128035": {
+       "content": "<|reserved_special_token_27|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128036": {
+       "content": "<|reserved_special_token_28|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128037": {
+       "content": "<|reserved_special_token_29|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128038": {
+       "content": "<|reserved_special_token_30|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128039": {
+       "content": "<|reserved_special_token_31|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128040": {
+       "content": "<|reserved_special_token_32|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128041": {
+       "content": "<|reserved_special_token_33|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128042": {
+       "content": "<|reserved_special_token_34|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128043": {
+       "content": "<|reserved_special_token_35|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128044": {
+       "content": "<|reserved_special_token_36|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128045": {
+       "content": "<|reserved_special_token_37|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128046": {
+       "content": "<|reserved_special_token_38|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128047": {
+       "content": "<|reserved_special_token_39|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128048": {
+       "content": "<|reserved_special_token_40|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128049": {
+       "content": "<|reserved_special_token_41|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128050": {
+       "content": "<|reserved_special_token_42|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128051": {
+       "content": "<|reserved_special_token_43|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128052": {
+       "content": "<|reserved_special_token_44|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128053": {
+       "content": "<|reserved_special_token_45|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128054": {
+       "content": "<|reserved_special_token_46|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128055": {
+       "content": "<|reserved_special_token_47|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128056": {
+       "content": "<|reserved_special_token_48|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128057": {
+       "content": "<|reserved_special_token_49|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128058": {
+       "content": "<|reserved_special_token_50|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128059": {
+       "content": "<|reserved_special_token_51|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128060": {
+       "content": "<|reserved_special_token_52|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128061": {
+       "content": "<|reserved_special_token_53|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128062": {
+       "content": "<|reserved_special_token_54|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128063": {
+       "content": "<|reserved_special_token_55|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128064": {
+       "content": "<|reserved_special_token_56|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128065": {
+       "content": "<|reserved_special_token_57|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128066": {
+       "content": "<|reserved_special_token_58|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128067": {
+       "content": "<|reserved_special_token_59|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128068": {
+       "content": "<|reserved_special_token_60|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128069": {
+       "content": "<|reserved_special_token_61|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128070": {
+       "content": "<|reserved_special_token_62|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128071": {
+       "content": "<|reserved_special_token_63|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128072": {
+       "content": "<|reserved_special_token_64|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128073": {
+       "content": "<|reserved_special_token_65|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128074": {
+       "content": "<|reserved_special_token_66|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128075": {
+       "content": "<|reserved_special_token_67|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128076": {
+       "content": "<|reserved_special_token_68|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128077": {
+       "content": "<|reserved_special_token_69|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128078": {
+       "content": "<|reserved_special_token_70|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128079": {
+       "content": "<|reserved_special_token_71|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128080": {
+       "content": "<|reserved_special_token_72|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128081": {
+       "content": "<|reserved_special_token_73|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128082": {
+       "content": "<|reserved_special_token_74|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128083": {
+       "content": "<|reserved_special_token_75|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128084": {
+       "content": "<|reserved_special_token_76|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128085": {
+       "content": "<|reserved_special_token_77|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128086": {
+       "content": "<|reserved_special_token_78|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128087": {
+       "content": "<|reserved_special_token_79|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128088": {
+       "content": "<|reserved_special_token_80|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128089": {
+       "content": "<|reserved_special_token_81|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128090": {
+       "content": "<|reserved_special_token_82|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128091": {
+       "content": "<|reserved_special_token_83|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128092": {
+       "content": "<|reserved_special_token_84|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128093": {
+       "content": "<|reserved_special_token_85|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128094": {
+       "content": "<|reserved_special_token_86|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128095": {
+       "content": "<|reserved_special_token_87|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128096": {
+       "content": "<|reserved_special_token_88|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128097": {
+       "content": "<|reserved_special_token_89|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128098": {
+       "content": "<|reserved_special_token_90|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128099": {
+       "content": "<|reserved_special_token_91|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128100": {
+       "content": "<|reserved_special_token_92|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128101": {
+       "content": "<|reserved_special_token_93|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128102": {
+       "content": "<|reserved_special_token_94|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128103": {
+       "content": "<|reserved_special_token_95|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128104": {
+       "content": "<|reserved_special_token_96|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128105": {
+       "content": "<|reserved_special_token_97|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128106": {
+       "content": "<|reserved_special_token_98|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128107": {
+       "content": "<|reserved_special_token_99|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128108": {
+       "content": "<|reserved_special_token_100|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128109": {
+       "content": "<|reserved_special_token_101|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128110": {
+       "content": "<|reserved_special_token_102|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128111": {
+       "content": "<|reserved_special_token_103|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128112": {
+       "content": "<|reserved_special_token_104|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128113": {
+       "content": "<|reserved_special_token_105|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128114": {
+       "content": "<|reserved_special_token_106|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128115": {
+       "content": "<|reserved_special_token_107|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128116": {
+       "content": "<|reserved_special_token_108|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128117": {
+       "content": "<|reserved_special_token_109|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128118": {
+       "content": "<|reserved_special_token_110|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128119": {
+       "content": "<|reserved_special_token_111|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128120": {
+       "content": "<|reserved_special_token_112|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128121": {
+       "content": "<|reserved_special_token_113|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128122": {
+       "content": "<|reserved_special_token_114|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128123": {
+       "content": "<|reserved_special_token_115|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128124": {
+       "content": "<|reserved_special_token_116|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128125": {
+       "content": "<|reserved_special_token_117|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128126": {
+       "content": "<|reserved_special_token_118|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128127": {
+       "content": "<|reserved_special_token_119|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128128": {
+       "content": "<|reserved_special_token_120|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128129": {
+       "content": "<|reserved_special_token_121|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128130": {
+       "content": "<|reserved_special_token_122|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128131": {
+       "content": "<|reserved_special_token_123|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128132": {
+       "content": "<|reserved_special_token_124|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128133": {
+       "content": "<|reserved_special_token_125|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128134": {
+       "content": "<|reserved_special_token_126|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128135": {
+       "content": "<|reserved_special_token_127|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128136": {
+       "content": "<|reserved_special_token_128|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128137": {
+       "content": "<|reserved_special_token_129|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128138": {
+       "content": "<|reserved_special_token_130|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128139": {
+       "content": "<|reserved_special_token_131|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128140": {
+       "content": "<|reserved_special_token_132|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128141": {
+       "content": "<|reserved_special_token_133|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128142": {
+       "content": "<|reserved_special_token_134|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128143": {
+       "content": "<|reserved_special_token_135|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128144": {
+       "content": "<|reserved_special_token_136|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128145": {
+       "content": "<|reserved_special_token_137|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128146": {
+       "content": "<|reserved_special_token_138|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128147": {
+       "content": "<|reserved_special_token_139|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128148": {
+       "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_141|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_142|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_143|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_144|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_145|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_146|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_147|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_148|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_149|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_150|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_151|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_152|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_153|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_154|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_155|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_156|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_157|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_158|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_159|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_160|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_161|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_162|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_163|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_164|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_165|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_166|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_167|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_168|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_169|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_170|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_171|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_172|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_173|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_174|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_175|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_176|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_177|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_178|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_179|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_180|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_181|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_182|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_183|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_184|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_185|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_186|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_187|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_188|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_189|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_190|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_191|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_192|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_193|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_194|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_195|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_196|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_197|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_198|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_199|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_200|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_201|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_202|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_203|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_204|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_205|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_206|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_207|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_208|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_209|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_210|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_211|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_212|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_213|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_214|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_215|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_216|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_217|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_218|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_219|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_220|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_221|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_222|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_223|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_224|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_225|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_226|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_227|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_228|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_229|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_230|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_231|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_232|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_233|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_234|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_235|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_236|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_237|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_238|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_239|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_240|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_241|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_242|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_243|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_244|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_245|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_246|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_247|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ }
2051
+ },
2052
+ "bos_token": "<|begin_of_text|>",
2053
+ "clean_up_tokenization_spaces": true,
2054
+ "eos_token": "<|eot_id|>",
2055
+ "extra_special_tokens": {},
2056
+ "fix_mistral_regex": true,
2057
+ "model_input_names": [
2058
+ "input_ids",
2059
+ "attention_mask"
2060
+ ],
2061
+ "model_max_length": 8192,
2062
+ "pad_token": "<|eot_id|>",
2063
+ "tokenizer_class": "PreTrainedTokenizerFast"
2064
+ }
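
A minimal sketch of how this tokenizer_config.json is consumed, assuming the model is published under a loadable Hugging Face repo (the repo id below is a placeholder, not the actual path): the config registers ids 128114-128255 as reserved special tokens and reuses <|eot_id|> as both eos_token and pad_token.

    from transformers import AutoTokenizer

    # Placeholder repo id -- substitute the actual QWERKY AI model path.
    tok = AutoTokenizer.from_pretrained("qwerky-ai/llama-3.1-distilled")

    print(tok.bos_token)         # <|begin_of_text|>
    print(tok.eos_token)         # <|eot_id|>
    print(tok.pad_token)         # <|eot_id|> (pad aliases eos in this config)
    print(tok.model_max_length)  # 8192

    # Reserved special tokens map to single ids even though they are unused.
    print(tok.convert_tokens_to_ids("<|reserved_special_token_106|>"))  # 128114

Reusing <|eot_id|> as the pad token is a common choice for Llama-family checkpoints that ship without a dedicated pad token; the attention mask keeps padded positions from influencing the model.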