ulmentflam committed on
Commit 13a30fe · verified · 1 Parent(s): a0ce55b

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,72 @@
+ QWERKY AI DISTILLED MODEL LICENSE AGREEMENT
+
+ This model is a distilled version created by QWERKY AI, Inc. and is subject to dual attribution requirements.
+
+ ================================================================================
+ ATTRIBUTION REQUIREMENTS
+ ================================================================================
+
+ This model is:
+ 1. Derived from Meta's Llama 3.1 model and subject to the Llama 3.1 Community License Agreement
+ 2. Distilled and optimized by QWERKY AI, Inc.
+
+ When using or redistributing this model, you must provide attribution to BOTH:
+ - Meta Platforms, Inc. for the original Llama 3.1 model
+ - QWERKY AI, Inc. for the distillation and optimization
+
+ Suggested attribution format:
+ "This model is based on Meta's Llama 3.1, distilled and optimized by QWERKY AI, Inc."
+
+ ================================================================================
+ ORIGINAL LLAMA 3.1 LICENSE TERMS
+ ================================================================================
+
+ This model inherits all terms and conditions of the Llama 3.1 Community License Agreement dated July 23, 2024, including but not limited to:
+
+ 1. USAGE RESTRICTIONS: If you have more than 700 million monthly active users, you must request a license from Meta.
+
+ 2. PROHIBITED USES: You may not use this model to:
+ - Violate laws or regulations
+ - Engage in harmful, abusive, or discriminatory activities
+ - Generate misinformation or harmful content
+
+ 3. DISTRIBUTION: Any redistribution must include:
+ - This complete license
+ - Attribution to both Meta and QWERKY AI
+ - The same use restrictions
+
+ The full Llama 3.1 Community License Agreement is incorporated by reference and included in the LLAMA_3.1_LICENSE.txt file in this repository. It is also available at: https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/LICENSE
+
+ ================================================================================
+ QWERKY AI ADDITIONAL TERMS
+ ================================================================================
+
+ In addition to the Llama 3.1 license terms, users must:
+
+ 1. ATTRIBUTION: Include clear attribution to QWERKY AI, Inc. in any:
+ - Academic papers or research
+ - Commercial products or services
+ - Public demonstrations or benchmarks
+ - Derivative works or fine-tuned versions
+
+ 2. QWERKY BRANDING: Do not imply endorsement by QWERKY AI without written permission.
+
+ 3. PERFORMANCE CLAIMS: When citing performance metrics, clearly indicate:
+ - That this is a distilled version
+ - That any benchmarks are specific to this distilled model
+ - That QWERKY AI's optimization techniques were applied
+
+ ================================================================================
+ WARRANTY DISCLAIMER
+ ================================================================================
+
+ THIS MODEL IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED.
+ NEITHER META PLATFORMS, INC. NOR QWERKY AI, INC. MAKES ANY WARRANTIES REGARDING
+ THE MODEL'S PERFORMANCE, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE.
+
+ ================================================================================
+
+ By using this model, you agree to all terms above.
+
+ Copyright (c) Meta Platforms, Inc. (Original Llama 3.1 Model)
+ Copyright (c) QWERKY AI, Inc. (Distillation and Optimization)
LLAMA_3.1_LICENSE.txt ADDED
@@ -0,0 +1,48 @@
+ LLAMA 3.1 COMMUNITY LICENSE AGREEMENT
+
+ Llama 3.1 Version Release Date: July 23, 2024
+
+ “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
+
+ “Documentation” means the specifications, manuals and documentation accompanying Llama 3.1 distributed by Meta at https://llama.meta.com/doc/overview.
+
+ “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
+
+ “Llama 3.1” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at https://llama.meta.com/llama-downloads.
+
+ “Llama Materials” means, collectively, Meta’s proprietary Llama 3.1 and Documentation (and any portion thereof) made available under this Agreement.
+
+ “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+
+ By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
+
+ 1. License Rights and Redistribution.
+
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
+
+ b. Redistribution and Use.
+
+ i. If you distribute or make available the Llama Materials (or any derivative works thereof), or a product or service (including another AI model) that contains any of them, you shall (A) provide a copy of this Agreement with any such Llama Materials; and (B) prominently display “Built with Llama” on a related website, user interface, blogpost, about page, or product documentation. If you use the Llama Materials or any outputs or results of the Llama Materials to create, train, fine tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “Llama” at the beginning of any such AI model name.
+
+ ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
+
+ iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Llama 3.1 is licensed under the Llama 3.1 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
+
+ iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://llama.meta.com/llama3_1/use-policy), which is hereby incorporated by reference into this Agreement.
+
+ 2. Additional Commercial Terms. If, on the Llama 3.1 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
+
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
+
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+
+ 5. Intellectual Property.
+
+ a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials or as set forth in this Section 5(a). Meta hereby grants you a license to use “Llama” (the “Mark”) solely as required to comply with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines (currently accessible at https://about.meta.com/brand/resources/meta/company-brand/). All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
+
+ b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
+
+ c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Llama 3.1 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
+
+ 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
+
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
README.md ADDED
@@ -0,0 +1,61 @@
+ ---
+ license: other
+ tags:
+ - qwerky
+ - mamba
+ - hybrid
+ - causal-lm
+ - text-generation
+ language:
+ - en
+ library_name: transformers
+ pipeline_tag: text-generation
+ ---
+
+ # QwerkyLlamaMambaHybrid
+
+ A hybrid Mamba-Transformer causal language model from Qwerky AI, distilled from Meta's Llama 3.1 (see the LICENSE file for attribution requirements).
+
+ ## Requirements
+
+ - CUDA-compatible GPU
+ - Python 3.10+ (the custom modeling code uses `int | None` annotations)
+ - PyTorch 2.0+
+ - transformers, safetensors, mamba-ssm, causal-conv1d, flash-attn
+
+ ## Installation
+
+ ```bash
+ pip install transformers torch safetensors
+ pip install flash-attn mamba-ssm causal-conv1d --no-build-isolation
+ ```
+
+ ## Usage
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ tokenizer = AutoTokenizer.from_pretrained("QwerkyAI/Qwick-8B-Instruct")
+ model = AutoModelForCausalLM.from_pretrained(
+     "QwerkyAI/Qwick-8B-Instruct",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True,  # required: the architecture is defined in this repo
+ )
+
+ inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
+ outputs = model.generate(**inputs, max_new_tokens=50)
+ print(tokenizer.decode(outputs[0]))
+ ```
+
+ ## Model Files
+
+ - `config.json` - Model configuration with `auto_map`
+ - `modeling_qwerky_llama_mamba_hybrid.py` - Custom modeling class
+ - `configuration_qwerky_llama_mamba_hybrid.py` - Custom configuration class
+ - `model-0000*-of-00004.safetensors` / `model.safetensors.index.json` - Sharded model weights
+
+ ## License
+
+ See the LICENSE file for details.
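
For chat-formatted prompts, the bundled `chat_template.jinja` (added below) can be applied through the tokenizer. A minimal sketch continuing from the README's usage snippet, assuming the template is picked up by `AutoTokenizer` as usual:

```python
messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant header so the model answers
    return_tensors="pt",
).to("cuda")
outputs = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```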
chat_template.jinja ADDED
@@ -0,0 +1,5 @@
+ {% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
+
+ '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
+
+ ' }}{% endif %}
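
Rendered, this template wraps each turn in Llama 3.1-style headers and prepends `bos_token` to the first message. A quick way to inspect the exact prompt string, sketched here by rendering the file with `jinja2` directly (the BOS string is an assumption taken from the Llama 3.1 tokenizer):

```python
from jinja2 import Template

template = Template(open("chat_template.jinja").read())
prompt = template.render(
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    bos_token="<|begin_of_text|>",  # assumption: Llama 3.1 BOS token
    add_generation_prompt=True,
)
print(prompt)
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Hello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```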
config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "architectures": [
+     "QwerkyLlamaMambaHybridForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "dtype": "bfloat16",
+   "eos_token_id": [
+     128001,
+     128008,
+     128009
+   ],
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "mamba_version": "Mamba1",
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "qwerky_llama_mamba_hybrid",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "factor": 8.0,
+     "high_freq_factor": 4.0,
+     "low_freq_factor": 1.0,
+     "original_max_position_embeddings": 8192,
+     "rope_type": "llama3"
+   },
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.57.6",
+   "use_cache": true,
+   "vocab_size": 128256,
+   "auto_map": {
+     "AutoConfig": "configuration_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridConfig",
+     "AutoModelForCausalLM": "modeling_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridForCausalLM"
+   },
+   "d_model": 4096,
+   "d_inner": 4096,
+   "d_xb": 1024,
+   "ssm_cfg": {
+     "expand": 1
+   },
+   "attn_layers": [
+     3,
+     8,
+     13,
+     18,
+     23,
+     27
+   ]
+ }
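
The `attn_layers` field is what makes this a hybrid: the six listed indices get full attention blocks, and every other layer gets a Mamba block, which is why the weight map further down alternates between `mha.*` and `mamba.*` parameter names. A minimal sketch of that mapping, using the values from this `config.json`:

```python
# Layer indices from config.json; every other layer is a Mamba block.
attn_layers = {3, 8, 13, 18, 23, 27}
num_hidden_layers = 32

for i in range(num_hidden_layers):
    kind = "mha" if i in attn_layers else "mamba"
    print(f"model.layers.{i}.{kind}")
```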
configuration_qwerky_llama_mamba_hybrid.py ADDED
@@ -0,0 +1,142 @@
+ # Copyright (c) 2025, Qwerky AI, Inc. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """QwerkyLlamaMambaHybrid model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class QwerkyLlamaMambaHybridConfig(PretrainedConfig):
+     r"""
+     Configuration class for the QwerkyLlamaMambaHybrid model. Consolidates the transformer and Mamba configs.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the model.
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the model.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer.
+         num_key_value_heads (`int`, *optional*):
+             Number of key-value heads for grouped-query attention. Defaults to `num_attention_heads`.
+         hidden_act (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the MLP layers.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation for weight initialization.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the RMS normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether to return the last key/value attentions.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie input and output word embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`dict`, *optional*):
+             RoPE scaling configuration.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             Dropout ratio for attention probabilities.
+         d_model (`int`, *optional*):
+             Model dimension for Mamba layers. Defaults to `hidden_size`.
+         d_inner (`int`, *optional*):
+             Inner dimension for Mamba layers. Defaults to `intermediate_size`.
+         d_xb (`int`, *optional*, defaults to 2560):
+             Dimension of the Mamba xB projection.
+         ssm_cfg (`dict`, *optional*, defaults to `{}`):
+             State space model configuration dictionary.
+         attn_layers (`List[int]`, *optional*, defaults to `[]`):
+             List of layer indices that use attention instead of Mamba.
+     """
+
+     model_type = "qwerky_llama_mamba_hybrid"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size: int = 32000,
+         hidden_size: int = 4096,
+         intermediate_size: int = 11008,
+         num_hidden_layers: int = 32,
+         num_attention_heads: int = 32,
+         num_key_value_heads: int | None = None,
+         hidden_act: str = "silu",
+         max_position_embeddings: int = 2048,
+         initializer_range: float = 0.02,
+         rms_norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         pad_token_id: int = 0,
+         bos_token_id: int = 1,
+         eos_token_id: int = 2,
+         tie_word_embeddings: bool = False,
+         rope_theta: float = 10000.0,
+         rope_scaling: dict | None = None,
+         attention_dropout: float = 0.0,
+         d_model: int | None = None,
+         d_inner: int | None = None,
+         d_xb: int = 2560,
+         ssm_cfg: dict | None = None,
+         attn_layers: list[int] | None = None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = (
+             num_key_value_heads
+             if num_key_value_heads is not None
+             else num_attention_heads
+         )
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_dropout = attention_dropout
+
+         # Mamba-specific parameters
+         self.d_model = d_model if d_model is not None else hidden_size
+         self.d_inner = d_inner if d_inner is not None else intermediate_size
+         self.d_xb = d_xb
+         self.ssm_cfg = ssm_cfg if ssm_cfg is not None else {}
+         self.attn_layers = attn_layers if attn_layers is not None else []
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+         # Set auto_map so AutoConfig/AutoModelForCausalLM resolve the custom classes
+         if "auto_map" not in kwargs:
+             self.auto_map = {
+                 "AutoConfig": "configuration_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridConfig",
+                 "AutoModelForCausalLM": "modeling_qwerky_llama_mamba_hybrid.QwerkyLlamaMambaHybridForCausalLM",
+             }
+
+         # Set architectures field
+         if "architectures" not in kwargs:
+             self.architectures = ["QwerkyLlamaMambaHybridForCausalLM"]
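
When the checkpoint is loaded from the Hub, `AutoConfig` resolves the `auto_map` entry above to this class. A small sketch, assuming the repo id from the README:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "QwerkyAI/Qwick-8B-Instruct",
    trust_remote_code=True,  # needed to import the custom config class
)
print(config.model_type)   # qwerky_llama_mamba_hybrid
print(config.attn_layers)  # [3, 8, 13, 18, 23, 27]
```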
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0289497a76fd7c60c1d67ac6ad5f15292a6bed639b3e40eaae580f578dfe45aa
+ size 4889922520
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27ea066e23b4f1a490acc597035c267088ecbea615a66f0cf8358f68b8785a8b
+ size 4900631584
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:365b2ecd3837b7a5dda64f03f879f1e7fce707f9e8c35878ea47cb3bf19ed125
+ size 4905031752
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01d67026c28c25d7303baa326f7fd19f8a881ed0380152bfdd51c5a2f93bfcdf
+ size 2351345248
model.safetensors.index.json ADDED
@@ -0,0 +1,390 @@
+ {
+   "metadata": {
+     "total_size": 17046888448
+   },
+   "weight_map": {
+     "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.0.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.1.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.2.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mha.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mha.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.4.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.5.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.6.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.A_log": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.D": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.conv1d.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.conv1d.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.dt_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.dt_proj.bias": "model-00001-of-00004.safetensors",
+     "model.layers.7.mamba.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mha.in_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mha.out_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.9.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.10.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.11.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.12.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mha.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mha.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.14.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.15.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.16.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.A_log": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.D": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.conv1d.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.conv1d.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.dt_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.dt_proj.bias": "model-00002-of-00004.safetensors",
+     "model.layers.17.mamba.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mha.in_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mha.out_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.19.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.20.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.21.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.22.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mha.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mha.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.24.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.25.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.26.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mha.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mha.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.28.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.A_log": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.D": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.conv1d.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.conv1d.bias": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.in_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.dt_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.dt_proj.bias": "model-00003-of-00004.safetensors",
+     "model.layers.29.mamba.out_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.A_log": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.D": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.conv1d.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.conv1d.bias": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.in_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.dt_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.dt_proj.bias": "model-00004-of-00004.safetensors",
+     "model.layers.30.mamba.out_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.A_log": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.D": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.conv1d.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.conv1d.bias": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.in_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.dt_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.dt_proj.bias": "model-00004-of-00004.safetensors",
+     "model.layers.31.mamba.out_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.norm.weight": "model-00004-of-00004.safetensors",
+     "lm_head.weight": "model-00004-of-00004.safetensors"
+   }
+ }
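
The index maps every tensor to one of the four shards above; `metadata.total_size` counts tensor bytes only, so it need not equal the sum of the shard file sizes (each shard also carries its own JSON header). A small sanity-check sketch for a local download:

```python
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Count how many tensors live in each shard file.
for shard, n in sorted(Counter(index["weight_map"].values()).items()):
    print(f"{shard}: {n} tensors")
print("declared tensor bytes:", index["metadata"]["total_size"])
```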
modeling_qwerky_llama_mamba_hybrid.py ADDED
@@ -0,0 +1,768 @@
+ # Copyright (c) 2026, Qwerky AI Inc. All rights reserved.
+ #
+ # Licensed under the Qwerky Distilled Model License Agreement (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # See the LICENSE file in this repository
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """PyTorch QwerkyLlamaMambaHybrid model for inference."""
+
+ import math
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange, repeat
+ from mamba_ssm.modules.mha import MHA
+ from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+ from mamba_ssm.ops.triton.layer_norm import RMSNorm
+ from mamba_ssm.utils.generation import GenerationMixin as MambaGenerationMixin
+ from torch.nn import CrossEntropyLoss
+ from transformers.activations import ACT2FN
+ from transformers.modeling_outputs import CausalLMOutput
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.utils import logging
+
+ try:
+     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+ except ImportError:
+     causal_conv1d_fn, causal_conv1d_update = None, None
+
+ try:
+     from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+ except ImportError:
+     selective_state_update = None
+
+ from .configuration_qwerky_llama_mamba_hybrid import QwerkyLlamaMambaHybridConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """Repeat KV heads to match number of attention heads."""
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+     hidden_states = hidden_states[:, :, None, :, :].expand(
+         batch, num_key_value_heads, n_rep, slen, head_dim
+     )
+     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+ class Mamba(nn.Module):
+     """Mamba SSM layer implementation."""
+
+     def __init__(
+         self,
+         d_model,
+         d_inner,
+         d_xb,
+         d_state=16,
+         d_conv=4,
+         expand=2,
+         dt_rank="auto",
+         dt_min=0.001,
+         dt_max=0.1,
+         dt_init="random",
+         dt_scale=1.0,
+         dt_init_floor=1e-4,
+         repeat_kv_before_conv=True,
+         conv_bias=True,
+         out_proj_bias=False,
+         use_fast_path=True,
+         layer_idx=None,
+         device=None,
+         dtype=None,
+         **kwargs,
+     ):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.d_model = d_model
+         self.d_xb = d_xb
+         self.d_state = d_state
+         self.d_conv = d_conv
+         self.expand = expand
+         self.d_inner = (
+             d_inner if d_inner is not None else int(self.expand * self.d_model)
+         )
+         self.dt_rank: int = (
+             math.ceil(self.d_model / 16) if dt_rank == "auto" else int(dt_rank)
+         )
+         self.use_fast_path = use_fast_path
+         self.layer_idx = layer_idx
+         self.repeat_kv_before_conv = repeat_kv_before_conv
+
+         conv_dim = self.d_inner if self.repeat_kv_before_conv else self.d_xb
+         self.conv1d = nn.Conv1d(
+             in_channels=conv_dim,
+             out_channels=conv_dim,
+             bias=conv_bias,
+             kernel_size=d_conv,
+             groups=conv_dim,
+             padding=d_conv - 1,
+             **factory_kwargs,
+         )
+
+         self.activation = "silu"
+         self.act = nn.SiLU()
+
+         self.num_xb_head = self.d_xb // self.d_state
+         self.num_C_head = self.d_inner // self.d_state
+         self.repeat_group = self.num_C_head // self.num_xb_head
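+         # GQA-style head sharing: x and B are produced in the narrower d_xb space
+         # (num_xb_head heads of size d_state) and repeated repeat_group times to
+         # match the num_C_head heads of the d_inner-wide z/C streams.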
+
+         self.in_proj = nn.Linear(
+             self.d_model,
+             2 * self.d_xb + 2 * self.d_inner + self.dt_rank,
+             bias=False,
+             **factory_kwargs,
+         )
+         self.dt_proj = nn.Linear(
+             self.dt_rank, self.d_inner, bias=True, **factory_kwargs
+         )
+
+         # Initialize dt projection
+         dt_init_std = self.dt_rank**-0.5 * dt_scale
+         if dt_init == "constant":
+             nn.init.constant_(self.dt_proj.weight, dt_init_std)
+         elif dt_init == "random":
+             nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
+
+         # Initialize dt bias
+         dt = torch.exp(
+             torch.rand(self.d_inner, **factory_kwargs)
+             * (math.log(dt_max) - math.log(dt_min))
+             + math.log(dt_min)
+         ).clamp(min=dt_init_floor)
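+         # Invert softplus so that softplus(dt_proj.bias) reproduces dt at init
+         # (the selective scan applies softplus to dt + delta_bias).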
+         inv_dt = dt + torch.log(-torch.expm1(-dt))
+         with torch.no_grad():
+             self.dt_proj.bias.copy_(inv_dt)
+         self.dt_proj.bias._no_reinit = True  # type: ignore[attr-defined]
+
+         # S4D real initialization
+         A = repeat(
+             torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
+             "n -> d n",
+             d=self.d_inner,
+         ).contiguous()
+         A_log = torch.log(A)
+         self.A_log = nn.Parameter(A_log)
+         self.A_log._no_weight_decay = True  # type: ignore[attr-defined]
+
+         self.D = nn.Parameter(torch.ones(self.d_inner, device=device))
+         self.D._no_weight_decay = True  # type: ignore[attr-defined]
+
+         self.out_proj = nn.Linear(
+             self.d_inner, self.d_model, bias=out_proj_bias, **factory_kwargs
+         )
+
+     def forward(self, hidden_states, inference_params=None):
+         batch, seqlen, dim = hidden_states.shape
+
+         conv_state, ssm_state = None, None
+         if inference_params is not None:
+             conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
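+             # seqlen_offset > 0 means we are decoding one token at a time,
+             # so take the cached-state step path instead of the full scan.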
+             if inference_params.seqlen_offset > 0:
+                 out, _, _ = self.step(hidden_states, conv_state, ssm_state)
+                 return out
+
+         A = -torch.exp(self.A_log.float())
+
+         if not hidden_states.is_contiguous():
+             hidden_states = hidden_states.contiguous()
+
+         zxbcdt = self.in_proj(hidden_states)
+         z, x, B, C, dt = torch.split(
+             zxbcdt,
+             [self.d_inner, self.d_xb, self.d_xb, self.d_inner, self.dt_rank],
+             dim=-1,
+         )
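+         # Split layout: z is the SiLU gate (d_inner), x and B are the narrow
+         # d_xb streams, C is the readout (d_inner), dt is the low-rank timestep.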
+
+         x = rearrange(x, "b l d -> b d l")
+         z = rearrange(z, "b l d -> b d l")
+
+         B = rearrange(
+             B, "b l (n_group dstate) -> b n_group l dstate", dstate=self.d_state
+         )
+         B = repeat_kv(B, self.repeat_group)
+         B = rearrange(B, "b n_group l dstate -> b n_group dstate l").contiguous()
+         C = rearrange(
+             C, "b l (n_group dstate) -> b n_group dstate l", dstate=self.d_state
+         ).contiguous()
+
+         dt = self.dt_proj(dt)
+         dt = rearrange(dt, "b l d -> b d l")
+
+         if self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) l -> b n_group l dstate", dstate=self.d_state
+             )
+             x = repeat_kv(x, self.repeat_group)
+             x = rearrange(x, "b n_group l dstate -> b (n_group dstate) l")
+
+         # Conv state update
+         if conv_state is not None:
+             if x.shape[-1] >= self.d_conv:
+                 conv_state.copy_(x[:, :, -self.d_conv :])
+             else:
+                 conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0)))
+
+         if causal_conv1d_fn is None:
+             x = self.act(self.conv1d(x)[..., :seqlen])
+         else:
+             x = causal_conv1d_fn(
+                 x=x,
+                 weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
+                 bias=self.conv1d.bias,
+                 activation=self.activation,
+             )
+
+         if not self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) l -> b n_group l dstate", dstate=self.d_state
+             )
+             x = repeat_kv(x, self.repeat_group)
+             x = rearrange(x, "b n_group l dstate -> b (n_group dstate) l")
+
+         return_last_state = ssm_state is not None
+         y = selective_scan_fn(
+             x,
+             dt,
+             A,
+             B,
+             C,
+             self.D.float(),
+             z=z,
+             delta_bias=self.dt_proj.bias.float(),
+             delta_softplus=True,
+             return_last_state=return_last_state,
+         )
+         if return_last_state:
+             y, last_state = y
+             ssm_state.copy_(
+                 rearrange(last_state, "b (h d) n -> b h d n", h=self.num_C_head)
+             )
+
+         y = rearrange(y, "b d l -> b l d")
+         return self.out_proj(y)
+
+     def step(self, hidden_states, conv_state, ssm_state):
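+         """Single-token decode step; updates conv_state and ssm_state in place."""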
+         dtype = hidden_states.dtype
+         hidden_states_input = hidden_states.squeeze(1)
+         A = -torch.exp(self.A_log.float())
+
+         zxbcdt = self.in_proj(hidden_states_input)
+         z, x, B, C, dt = torch.split(
+             zxbcdt,
+             [self.d_inner, self.d_xb, self.d_xb, self.d_inner, self.dt_rank],
+             dim=-1,
+         )
+
+         B = rearrange(B, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state)
+         B = torch.repeat_interleave(B, dim=1, repeats=self.repeat_group)
+         C = rearrange(
+             C, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state
+         ).contiguous()
+         dt = self.dt_proj(dt)
+
+         if self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state
+             )
+             x = torch.repeat_interleave(x, dim=1, repeats=self.repeat_group)
+             x = rearrange(x, "b n_group dstate -> b (n_group dstate)")
+
+         if causal_conv1d_update is None:
+             conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1))
+             conv_state[:, :, -1] = x
+             x = torch.sum(
+                 conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1
+             )
+             if self.conv1d.bias is not None:
+                 x = x + self.conv1d.bias
+             x = self.act(x).to(dtype=dtype)
+         else:
+             x = causal_conv1d_update(
+                 x,
+                 conv_state,
+                 rearrange(self.conv1d.weight, "d 1 w -> d w"),
+                 self.conv1d.bias,
+                 self.activation,
+             )
+
+         if not self.repeat_kv_before_conv:
+             x = rearrange(
+                 x, "b (n_group dstate) -> b n_group dstate", dstate=self.d_state
+             )
+             x = torch.repeat_interleave(x, dim=1, repeats=self.repeat_group)
+             x = rearrange(x, "b n_group dstate -> b (n_group dstate)")
+
+         x = rearrange(x, "b (h d) -> b h d", h=self.num_C_head)
+         dt = rearrange(dt, "b (h d) -> b h d", h=self.num_C_head)
+         A = rearrange(A, "(h d) n -> h d n", h=self.num_C_head)
+         D = rearrange(self.D, "(h d) -> h d", h=self.num_C_head)
+         z = rearrange(z, "b (h d) -> b h d", h=self.num_C_head)
+         dt_bias = rearrange(self.dt_proj.bias, "(h d) -> h d", h=self.num_C_head)
+
+         if selective_state_update is None:
+             raise RuntimeError(
+                 "selective_state_update is not available. "
+                 "Please install mamba-ssm with CUDA support: "
+                 "pip install mamba-ssm causal-conv1d --no-build-isolation"
+             )
+         y = selective_state_update(
+             ssm_state, x, dt, A, B, C, D, z=z, dt_bias=dt_bias, dt_softplus=True
+         )
+         y = rearrange(y, "b h d -> b (h d)")
+         return self.out_proj(y).unsqueeze(1), conv_state, ssm_state
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         device = self.out_proj.weight.device
+         conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
+         conv_dim = self.d_inner if self.repeat_kv_before_conv else self.d_xb
+         conv_state = torch.zeros(
+             batch_size, conv_dim, self.d_conv, device=device, dtype=conv_dtype
+         )
+         ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
+         ssm_state = torch.zeros(
+             batch_size,
+             self.num_C_head,
+             self.d_inner // self.num_C_head,
+             self.d_state,
+             device=device,
+             dtype=ssm_dtype,
+         )
+         return conv_state, ssm_state
+
+     def _get_states_from_cache(
+         self, inference_params, batch_size, initialize_states=False
+     ):
+         if self.layer_idx not in inference_params.key_value_memory_dict:
+             conv_state, ssm_state = self.allocate_inference_cache(batch_size, 0)
+             inference_params.key_value_memory_dict[self.layer_idx] = (
+                 conv_state,
+                 ssm_state,
+             )
+         else:
+             conv_state, ssm_state = inference_params.key_value_memory_dict[
+                 self.layer_idx
+             ]
+             if initialize_states:
+                 conv_state.zero_()
+                 ssm_state.zero_()
+         return conv_state, ssm_state
+
+
+ class MLP(nn.Module):
+     def __init__(self, d_model, intermediate_size, hidden_act, device=None, dtype=None):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.gate_proj = nn.Linear(
+             d_model, intermediate_size, bias=False, **factory_kwargs
+         )
+         self.up_proj = nn.Linear(
+             d_model, intermediate_size, bias=False, **factory_kwargs
+         )
+         self.down_proj = nn.Linear(
+             intermediate_size, d_model, bias=False, **factory_kwargs
+         )
+         self.act_fn = ACT2FN[hidden_act]
+
+     def forward(self, x):
+         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+ class MHADecoderLayer(nn.Module):
+     def __init__(
+         self, config: QwerkyLlamaMambaHybridConfig, layer_idx: int, device=None, dtype=None
+     ):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.layer_idx = layer_idx
+         self.mha = MHA(
+             embed_dim=config.hidden_size,
+             num_heads=config.num_attention_heads,
+             num_heads_kv=config.num_key_value_heads,
+             layer_idx=layer_idx,
+             mlp_dim=0,
+             qkv_proj_bias=False,
+             out_proj_bias=False,
+             rotary_emb_dim=config.hidden_size // config.num_attention_heads,
+             rotary_emb_base=config.rope_theta,
+             causal=True,
+             device=device,
+             dtype=dtype,
+         )
+         self.mlp = MLP(
+             config.hidden_size,
+             config.intermediate_size,
+             config.hidden_act,
+             **factory_kwargs,
+         )
+         self.input_layernorm = RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps, **factory_kwargs
+         )
+         self.post_attention_layernorm = RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps, **factory_kwargs
+         )
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.mha.allocate_inference_cache(
+             batch_size, max_seqlen, dtype=dtype, **kwargs
+         )
+
+     def forward(self, hidden_states: torch.Tensor, inference_params=None, **kwargs):
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.mha(hidden_states, inference_params)
+         hidden_states = residual + hidden_states
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         return residual + hidden_states
+
+
+ class MambaDecoderLayer(nn.Module):
+     def __init__(
+         self, config: QwerkyLlamaMambaHybridConfig, layer_idx: int, device=None, dtype=None
+     ):
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super().__init__()
+         self.layer_idx = layer_idx
+         self.mamba = Mamba(
+             d_model=config.d_model,
+             d_inner=config.d_inner,
+             d_xb=config.d_xb,
+             layer_idx=layer_idx,
+             **config.ssm_cfg,
+             **factory_kwargs,
+         )
+         self.mlp = MLP(
+             config.d_model,
+             config.intermediate_size,
+             config.hidden_act,
+             **factory_kwargs,
+         )
+         self.input_layernorm = RMSNorm(
+             config.d_model, eps=config.rms_norm_eps, **factory_kwargs
+         )
+         self.post_attention_layernorm = RMSNorm(
+             config.d_model, eps=config.rms_norm_eps, **factory_kwargs
+         )
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.mamba.allocate_inference_cache(
+             batch_size, max_seqlen, dtype=dtype, **kwargs
+         )
+
+     def forward(self, hidden_states: torch.Tensor, inference_params=None, **kwargs):
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.mamba(hidden_states, inference_params=inference_params)
+         hidden_states = residual + hidden_states
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         return residual + hidden_states
+
+
+ def merge_projections_for_layers(checkpoint, layer_indices):
+     """Merge q_proj, k_proj, v_proj into in_proj for attention layers."""
+     for layer_idx in layer_indices:
+         q_key = f"model.layers.{layer_idx}.self_attn.q_proj.weight"
+         k_key = f"model.layers.{layer_idx}.self_attn.k_proj.weight"
+         v_key = f"model.layers.{layer_idx}.self_attn.v_proj.weight"
+         o_key = f"model.layers.{layer_idx}.self_attn.o_proj.weight"
+
+         if all(k in checkpoint for k in [q_key, k_key, v_key]):
+             in_proj_weight = torch.cat(
+                 [checkpoint[q_key], checkpoint[k_key], checkpoint[v_key]], dim=0
+             )
+             checkpoint[f"model.layers.{layer_idx}.mha.in_proj.weight"] = in_proj_weight
+             del checkpoint[q_key], checkpoint[k_key], checkpoint[v_key]
+
+         if o_key in checkpoint:
+             checkpoint[f"model.layers.{layer_idx}.mha.out_proj.weight"] = checkpoint[
+                 o_key
+             ]
+             del checkpoint[o_key]
+     return checkpoint
+
+
+ class QwerkyLlamaMambaHybridPreTrainedModel(PreTrainedModel):
+     config_class = QwerkyLlamaMambaHybridConfig
+     base_model_prefix = "model"
+     supports_gradient_checkpointing = False
+     _no_split_modules = ["MambaDecoderLayer", "MHADecoderLayer"]
+     _supports_flash_attn_2 = True
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+ class QwerkyLlamaMambaHybridModel(QwerkyLlamaMambaHybridPreTrainedModel):
+     def __init__(self, config: QwerkyLlamaMambaHybridConfig, **kwargs):
+         super().__init__(config, **kwargs)
+         self.config = config
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.layers = nn.ModuleList(
+             [
+                 MHADecoderLayer(config, i)
+                 if i in config.attn_layers
+                 else MambaDecoderLayer(config, i)
+                 for i in range(config.num_hidden_layers)
+             ]
+         )
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self._register_load_state_dict_pre_hook(self.load_hook)
+         self.post_init()
+
+     def load_hook(self, state_dict, prefix, *args):
+         if self.config.attn_layers:
+             merge_projections_for_layers(state_dict, self.config.attn_layers)
+
+     def get_input_embeddings(self):
+         return self.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.embed_tokens = value
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         inference_params=None,
+         num_last_tokens: int = 0,
+         **kwargs,
+     ):
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("Cannot specify both input_ids and inputs_embeds")
+         if input_ids is None and inputs_embeds is None:
+             raise ValueError("Must specify either input_ids or inputs_embeds")
+
+         hidden_states = (
+             inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
+         )
+         if not hidden_states.is_contiguous():
+             hidden_states = hidden_states.contiguous()
+
+         for layer in self.layers:
+             hidden_states = layer(
+                 hidden_states, inference_params=inference_params, **kwargs
+             )
+
+         hidden_states = self.norm(hidden_states)
+         if num_last_tokens > 0:
+             hidden_states = hidden_states[:, -num_last_tokens:]
+         return hidden_states
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return {
+             i: layer.allocate_inference_cache(
+                 batch_size, max_seqlen, dtype=dtype, **kwargs
+             )
+             for i, layer in enumerate(self.layers)
+         }
+
+
+ class QwerkyLlamaMambaHybridForCausalLM(
+     MambaGenerationMixin, QwerkyLlamaMambaHybridPreTrainedModel
+ ):
+     _tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config: QwerkyLlamaMambaHybridConfig, **kwargs):
+         super().__init__(config, **kwargs)
+         self.model = QwerkyLlamaMambaHybridModel(config, **kwargs)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         if config.tie_word_embeddings:
+             self.lm_head.weight = self.model.embed_tokens.weight
+         self._cached_device = None
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.model.get_input_embeddings()
+
+     def set_input_embeddings(self, value):
+         self.model.set_input_embeddings(value)
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor | None = None,
+         inputs_embeds: torch.FloatTensor | None = None,
+         labels: torch.LongTensor | None = None,
+         inference_params=None,
+         num_last_tokens: int = 0,
+         **kwargs,
+     ) -> tuple | CausalLMOutput:
+         # Optimize TTFT: Only compute last token logits during prefill
+         is_prefill = (
+             labels is None
+             and inference_params is not None
+             and getattr(inference_params, "seqlen_offset", 0) == 0
+             and num_last_tokens == 0
+         )
+         if is_prefill:
+             num_last_tokens = 1
+
+         hidden_states = self.model(
+             input_ids=input_ids,
+             inputs_embeds=inputs_embeds,
+             inference_params=inference_params,
+             num_last_tokens=num_last_tokens,
+             **kwargs,
+         )
+         logits = self.lm_head(hidden_states)
+
+         loss = None
+         if labels is not None:
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             loss = CrossEntropyLoss()(
+                 shift_logits.view(-1, self.config.vocab_size),
+                 shift_labels.view(-1).to(shift_logits.device),
+             )
+
+         return CausalLMOutput(loss=loss, logits=logits)
+
+     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
+         return self.model.allocate_inference_cache(
+             batch_size, max_seqlen, dtype=dtype, **kwargs
+         )
+
+     def generate(
+         self,
+         input_ids,
+         max_length=1024,
+         top_k=50,
+         top_p=1.0,
+         min_p=0.0,
+         temperature=1.0,
+         repetition_penalty=1.1,
+         return_dict_in_generate=False,
+         output_scores=False,
+         **kwargs,
+     ):
+         """
+         Generate sequences using the model.
+
+         Accepts the common Transformers generation arguments, including:
+         - do_sample, temperature, top_k, top_p, min_p, repetition_penalty
+         - attention_mask, pad_token_id, eos_token_id
+         - max_new_tokens
+
+         Options this model does not support (e.g. num_beams, use_cache,
+         stopping_criteria) are silently dropped before delegating to
+         MambaGenerationMixin.
+         """
+         # Ensure input_ids is properly shaped (2D: batch_size, seq_len)
+         if input_ids.dim() == 1:
+             input_ids = input_ids.unsqueeze(0)
+
+         # Ensure input_ids is on the correct device and dtype for generation
+         device = next(self.parameters()).device
+         if input_ids.device != device:
+             input_ids = input_ids.to(device)
+         if input_ids.dtype != torch.long:
+             input_ids = input_ids.long()
+
+         max_new_tokens = kwargs.pop("max_new_tokens", None)
+         if max_new_tokens is not None:
+             max_length = max_new_tokens + input_ids.shape[1]
+
+         do_sample = kwargs.pop("do_sample", True)
+         if not do_sample:
+             top_k, top_p, min_p = 1, 0.0, 0.0
+
+         cg = kwargs.pop("cg", True)
+
+         eos_token_id = kwargs.pop("eos_token_id", self.config.eos_token_id)
+         if eos_token_id is not None:
+             if isinstance(eos_token_id, (list, tuple)):
+                 eos_token_id = torch.tensor(
+                     eos_token_id, dtype=torch.long, device=device
+                 )
+             else:
+                 eos_token_id = torch.tensor(
+                     [eos_token_id], dtype=torch.long, device=device
+                 )
+
+         attention_mask = kwargs.pop("attention_mask", None)
+         pad_token_id = kwargs.pop(
+             "pad_token_id", getattr(self.config, "pad_token_id", None)
+         )
+
+         # Handle attention_mask by filtering input_ids if provided.
+         # MambaGenerationMixin doesn't support attention_mask, so we filter instead.
+         if attention_mask is not None:
+             seq_lengths = attention_mask.sum(dim=1)
+             max_seq_len = seq_lengths.max().item()
+             min_seq_len = seq_lengths.min().item()
+             original_seq_len = input_ids.shape[1]
+
+             if min_seq_len == max_seq_len and max_seq_len <= original_seq_len:
+                 input_ids = input_ids[:, :max_seq_len].contiguous()
+             else:
+                 batch_size = input_ids.shape[0]
+                 dtype = input_ids.dtype
+                 pad_value = pad_token_id if pad_token_id is not None else 0
+
+                 input_ids_filtered = torch.full(
+                     (batch_size, max_seq_len), pad_value, dtype=dtype, device=device
+                 )
+
+                 copy_len = min(max_seq_len, original_seq_len)
+                 if copy_len > 0:
+                     valid_mask = torch.arange(copy_len, device=device).unsqueeze(
+                         0
+                     ) < seq_lengths.unsqueeze(1)
+                     input_ids_slice = input_ids[:, :copy_len].contiguous()
+                     input_ids_filtered_slice = input_ids_filtered[:, :copy_len]
+                     input_ids_filtered[:, :copy_len] = torch.where(
+                         valid_mask, input_ids_slice, input_ids_filtered_slice
+                     )
+
+                 input_ids = input_ids_filtered.contiguous()
+
+         repetition_penalty = kwargs.pop("repetition_penalty", repetition_penalty)
+
+         # Remove unsupported kwargs
+         for key in [
+             "use_cache",
+             "no_repeat_ngram_size",
+             "length_penalty",
+             "num_return_sequences",
+             "num_beams",
+             "low_memory",
+             "stopping_criteria",
+         ]:
+             kwargs.pop(key, None)
+
+         return super().generate(
+             input_ids=input_ids,
+             max_length=max_length,
+             cg=cg,
+             top_k=top_k,
+             top_p=top_p,
+             min_p=min_p,
+             temperature=temperature,
+             repetition_penalty=repetition_penalty,
+             return_dict_in_generate=return_dict_in_generate,
+             output_scores=output_scores,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
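
A minimal usage sketch, assuming this repository is checked out to a hypothetical local directory "./qwerky-hybrid" and that mamba-ssm and causal-conv1d are installed on a CUDA machine:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "./qwerky-hybrid"  # hypothetical local checkout of this repository
    tokenizer = AutoTokenizer.from_pretrained(path)
    # trust_remote_code=True loads modeling_qwerky_llama_mamba_hybrid.py above
    model = AutoModelForCausalLM.from_pretrained(
        path, trust_remote_code=True, torch_dtype=torch.bfloat16
    ).cuda()

    inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
    out = model.generate(inputs.input_ids, max_new_tokens=32, temperature=0.7)
    print(tokenizer.decode(out[0], skip_special_tokens=True))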
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<|begin_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|eot_id|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|eot_id|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+ size 17209920
tokenizer_config.json ADDED
@@ -0,0 +1,2064 @@
+ {
+   "added_tokens_decoder": {
+     "128000": {
+       "content": "<|begin_of_text|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128001": {
+       "content": "<|end_of_text|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128002": {
+       "content": "<|reserved_special_token_0|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128003": {
+       "content": "<|reserved_special_token_1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128004": {
+       "content": "<|finetune_right_pad_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128005": {
+       "content": "<|reserved_special_token_2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128006": {
+       "content": "<|start_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128007": {
+       "content": "<|end_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128008": {
+       "content": "<|eom_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128009": {
+       "content": "<|eot_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128010": {
+       "content": "<|python_tag|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128011": {
+       "content": "<|reserved_special_token_3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128012": {
+       "content": "<|reserved_special_token_4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128013": {
+       "content": "<|reserved_special_token_5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128014": {
+       "content": "<|reserved_special_token_6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128015": {
+       "content": "<|reserved_special_token_7|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128016": {
+       "content": "<|reserved_special_token_8|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128017": {
+       "content": "<|reserved_special_token_9|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128018": {
+       "content": "<|reserved_special_token_10|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128019": {
+       "content": "<|reserved_special_token_11|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128020": {
+       "content": "<|reserved_special_token_12|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128021": {
+       "content": "<|reserved_special_token_13|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128022": {
+       "content": "<|reserved_special_token_14|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128023": {
+       "content": "<|reserved_special_token_15|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128024": {
+       "content": "<|reserved_special_token_16|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128025": {
+       "content": "<|reserved_special_token_17|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128026": {
+       "content": "<|reserved_special_token_18|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128027": {
+       "content": "<|reserved_special_token_19|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128028": {
+       "content": "<|reserved_special_token_20|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128029": {
+       "content": "<|reserved_special_token_21|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128030": {
+       "content": "<|reserved_special_token_22|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128031": {
+       "content": "<|reserved_special_token_23|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128032": {
+       "content": "<|reserved_special_token_24|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128033": {
+       "content": "<|reserved_special_token_25|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128034": {
+       "content": "<|reserved_special_token_26|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128035": {
+       "content": "<|reserved_special_token_27|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128036": {
+       "content": "<|reserved_special_token_28|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128037": {
+       "content": "<|reserved_special_token_29|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128038": {
+       "content": "<|reserved_special_token_30|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128039": {
+       "content": "<|reserved_special_token_31|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128040": {
+       "content": "<|reserved_special_token_32|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128041": {
+       "content": "<|reserved_special_token_33|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128042": {
+       "content": "<|reserved_special_token_34|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128043": {
+       "content": "<|reserved_special_token_35|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128044": {
+       "content": "<|reserved_special_token_36|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128045": {
+       "content": "<|reserved_special_token_37|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128046": {
+       "content": "<|reserved_special_token_38|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128047": {
+       "content": "<|reserved_special_token_39|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128048": {
+       "content": "<|reserved_special_token_40|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128049": {
+       "content": "<|reserved_special_token_41|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128050": {
+       "content": "<|reserved_special_token_42|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128051": {
+       "content": "<|reserved_special_token_43|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128052": {
+       "content": "<|reserved_special_token_44|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128053": {
+       "content": "<|reserved_special_token_45|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128054": {
+       "content": "<|reserved_special_token_46|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128055": {
+       "content": "<|reserved_special_token_47|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128056": {
+       "content": "<|reserved_special_token_48|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128057": {
+       "content": "<|reserved_special_token_49|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128058": {
+       "content": "<|reserved_special_token_50|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128059": {
+       "content": "<|reserved_special_token_51|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128060": {
+       "content": "<|reserved_special_token_52|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128061": {
+       "content": "<|reserved_special_token_53|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128062": {
+       "content": "<|reserved_special_token_54|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128063": {
+       "content": "<|reserved_special_token_55|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128064": {
+       "content": "<|reserved_special_token_56|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128065": {
+       "content": "<|reserved_special_token_57|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128066": {
+       "content": "<|reserved_special_token_58|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128067": {
+       "content": "<|reserved_special_token_59|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128068": {
+       "content": "<|reserved_special_token_60|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128069": {
+       "content": "<|reserved_special_token_61|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128070": {
+       "content": "<|reserved_special_token_62|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128071": {
+       "content": "<|reserved_special_token_63|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128072": {
+       "content": "<|reserved_special_token_64|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128073": {
+       "content": "<|reserved_special_token_65|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128074": {
+       "content": "<|reserved_special_token_66|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128075": {
+       "content": "<|reserved_special_token_67|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128076": {
+       "content": "<|reserved_special_token_68|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128077": {
+       "content": "<|reserved_special_token_69|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128078": {
+       "content": "<|reserved_special_token_70|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128079": {
+       "content": "<|reserved_special_token_71|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128080": {
+       "content": "<|reserved_special_token_72|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128081": {
+       "content": "<|reserved_special_token_73|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128082": {
+       "content": "<|reserved_special_token_74|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128083": {
+       "content": "<|reserved_special_token_75|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128084": {
+       "content": "<|reserved_special_token_76|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128085": {
+       "content": "<|reserved_special_token_77|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128086": {
+       "content": "<|reserved_special_token_78|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128087": {
+       "content": "<|reserved_special_token_79|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128088": {
+       "content": "<|reserved_special_token_80|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128089": {
+       "content": "<|reserved_special_token_81|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128090": {
+       "content": "<|reserved_special_token_82|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128091": {
+       "content": "<|reserved_special_token_83|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128092": {
+       "content": "<|reserved_special_token_84|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128093": {
+       "content": "<|reserved_special_token_85|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128094": {
+       "content": "<|reserved_special_token_86|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128095": {
+       "content": "<|reserved_special_token_87|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128096": {
+       "content": "<|reserved_special_token_88|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128097": {
+       "content": "<|reserved_special_token_89|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128098": {
+       "content": "<|reserved_special_token_90|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128099": {
+       "content": "<|reserved_special_token_91|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128100": {
+       "content": "<|reserved_special_token_92|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128101": {
+       "content": "<|reserved_special_token_93|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128102": {
+       "content": "<|reserved_special_token_94|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128103": {
+       "content": "<|reserved_special_token_95|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128104": {
+       "content": "<|reserved_special_token_96|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128105": {
+       "content": "<|reserved_special_token_97|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128106": {
+       "content": "<|reserved_special_token_98|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128107": {
+       "content": "<|reserved_special_token_99|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128108": {
+       "content": "<|reserved_special_token_100|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128109": {
+       "content": "<|reserved_special_token_101|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128110": {
+       "content": "<|reserved_special_token_102|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128111": {
+       "content": "<|reserved_special_token_103|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128112": {
+       "content": "<|reserved_special_token_104|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128113": {
+       "content": "<|reserved_special_token_105|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128114": {
+       "content": "<|reserved_special_token_106|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128115": {
+       "content": "<|reserved_special_token_107|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128116": {
+       "content": "<|reserved_special_token_108|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128117": {
+       "content": "<|reserved_special_token_109|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128118": {
+       "content": "<|reserved_special_token_110|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128119": {
+       "content": "<|reserved_special_token_111|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128120": {
+       "content": "<|reserved_special_token_112|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128121": {
+       "content": "<|reserved_special_token_113|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128122": {
+       "content": "<|reserved_special_token_114|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128123": {
+       "content": "<|reserved_special_token_115|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128124": {
+       "content": "<|reserved_special_token_116|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128125": {
+       "content": "<|reserved_special_token_117|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128126": {
+       "content": "<|reserved_special_token_118|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128127": {
+       "content": "<|reserved_special_token_119|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128128": {
+       "content": "<|reserved_special_token_120|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128129": {
+       "content": "<|reserved_special_token_121|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128130": {
+       "content": "<|reserved_special_token_122|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128131": {
+       "content": "<|reserved_special_token_123|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128132": {
+       "content": "<|reserved_special_token_124|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128133": {
+       "content": "<|reserved_special_token_125|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128134": {
+       "content": "<|reserved_special_token_126|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128135": {
+       "content": "<|reserved_special_token_127|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128136": {
+       "content": "<|reserved_special_token_128|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128137": {
+       "content": "<|reserved_special_token_129|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128138": {
+       "content": "<|reserved_special_token_130|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128139": {
+       "content": "<|reserved_special_token_131|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128140": {
+       "content": "<|reserved_special_token_132|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128141": {
+       "content": "<|reserved_special_token_133|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128142": {
+       "content": "<|reserved_special_token_134|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128143": {
+       "content": "<|reserved_special_token_135|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128144": {
+       "content": "<|reserved_special_token_136|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128145": {
+       "content": "<|reserved_special_token_137|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128146": {
+       "content": "<|reserved_special_token_138|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128147": {
+       "content": "<|reserved_special_token_139|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "128148": {
+       "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
1190
+ "normalized": false,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": true
1194
+ },
1195
+ "128149": {
1196
+ "content": "<|reserved_special_token_141|>",
1197
+ "lstrip": false,
1198
+ "normalized": false,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": true
1202
+ },
1203
+ "128150": {
1204
+ "content": "<|reserved_special_token_142|>",
1205
+ "lstrip": false,
1206
+ "normalized": false,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": true
1210
+ },
1211
+ "128151": {
1212
+ "content": "<|reserved_special_token_143|>",
1213
+ "lstrip": false,
1214
+ "normalized": false,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": true
1218
+ },
1219
+ "128152": {
1220
+ "content": "<|reserved_special_token_144|>",
1221
+ "lstrip": false,
1222
+ "normalized": false,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": true
1226
+ },
1227
+ "128153": {
1228
+ "content": "<|reserved_special_token_145|>",
1229
+ "lstrip": false,
1230
+ "normalized": false,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": true
1234
+ },
1235
+ "128154": {
1236
+ "content": "<|reserved_special_token_146|>",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": true
1242
+ },
1243
+ "128155": {
1244
+ "content": "<|reserved_special_token_147|>",
1245
+ "lstrip": false,
1246
+ "normalized": false,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": true
1250
+ },
1251
+ "128156": {
1252
+ "content": "<|reserved_special_token_148|>",
1253
+ "lstrip": false,
1254
+ "normalized": false,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": true
1258
+ },
1259
+ "128157": {
1260
+ "content": "<|reserved_special_token_149|>",
1261
+ "lstrip": false,
1262
+ "normalized": false,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": true
1266
+ },
1267
+ "128158": {
1268
+ "content": "<|reserved_special_token_150|>",
1269
+ "lstrip": false,
1270
+ "normalized": false,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": true
1274
+ },
1275
+ "128159": {
1276
+ "content": "<|reserved_special_token_151|>",
1277
+ "lstrip": false,
1278
+ "normalized": false,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": true
1282
+ },
1283
+ "128160": {
1284
+ "content": "<|reserved_special_token_152|>",
1285
+ "lstrip": false,
1286
+ "normalized": false,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": true
1290
+ },
1291
+ "128161": {
1292
+ "content": "<|reserved_special_token_153|>",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": true
1298
+ },
1299
+ "128162": {
1300
+ "content": "<|reserved_special_token_154|>",
1301
+ "lstrip": false,
1302
+ "normalized": false,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": true
1306
+ },
1307
+ "128163": {
1308
+ "content": "<|reserved_special_token_155|>",
1309
+ "lstrip": false,
1310
+ "normalized": false,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": true
1314
+ },
1315
+ "128164": {
1316
+ "content": "<|reserved_special_token_156|>",
1317
+ "lstrip": false,
1318
+ "normalized": false,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": true
1322
+ },
1323
+ "128165": {
1324
+ "content": "<|reserved_special_token_157|>",
1325
+ "lstrip": false,
1326
+ "normalized": false,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": true
1330
+ },
1331
+ "128166": {
1332
+ "content": "<|reserved_special_token_158|>",
1333
+ "lstrip": false,
1334
+ "normalized": false,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": true
1338
+ },
1339
+ "128167": {
1340
+ "content": "<|reserved_special_token_159|>",
1341
+ "lstrip": false,
1342
+ "normalized": false,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": true
1346
+ },
1347
+ "128168": {
1348
+ "content": "<|reserved_special_token_160|>",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": true
1354
+ },
1355
+ "128169": {
1356
+ "content": "<|reserved_special_token_161|>",
1357
+ "lstrip": false,
1358
+ "normalized": false,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": true
1362
+ },
1363
+ "128170": {
1364
+ "content": "<|reserved_special_token_162|>",
1365
+ "lstrip": false,
1366
+ "normalized": false,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": true
1370
+ },
1371
+ "128171": {
1372
+ "content": "<|reserved_special_token_163|>",
1373
+ "lstrip": false,
1374
+ "normalized": false,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": true
1378
+ },
1379
+ "128172": {
1380
+ "content": "<|reserved_special_token_164|>",
1381
+ "lstrip": false,
1382
+ "normalized": false,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": true
1386
+ },
1387
+ "128173": {
1388
+ "content": "<|reserved_special_token_165|>",
1389
+ "lstrip": false,
1390
+ "normalized": false,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": true
1394
+ },
1395
+ "128174": {
1396
+ "content": "<|reserved_special_token_166|>",
1397
+ "lstrip": false,
1398
+ "normalized": false,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": true
1402
+ },
1403
+ "128175": {
1404
+ "content": "<|reserved_special_token_167|>",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": true
1410
+ },
1411
+ "128176": {
1412
+ "content": "<|reserved_special_token_168|>",
1413
+ "lstrip": false,
1414
+ "normalized": false,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": true
1418
+ },
1419
+ "128177": {
1420
+ "content": "<|reserved_special_token_169|>",
1421
+ "lstrip": false,
1422
+ "normalized": false,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": true
1426
+ },
1427
+ "128178": {
1428
+ "content": "<|reserved_special_token_170|>",
1429
+ "lstrip": false,
1430
+ "normalized": false,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": true
1434
+ },
1435
+ "128179": {
1436
+ "content": "<|reserved_special_token_171|>",
1437
+ "lstrip": false,
1438
+ "normalized": false,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": true
1442
+ },
1443
+ "128180": {
1444
+ "content": "<|reserved_special_token_172|>",
1445
+ "lstrip": false,
1446
+ "normalized": false,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": true
1450
+ },
1451
+ "128181": {
1452
+ "content": "<|reserved_special_token_173|>",
1453
+ "lstrip": false,
1454
+ "normalized": false,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": true
1458
+ },
1459
+ "128182": {
1460
+ "content": "<|reserved_special_token_174|>",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": true
1466
+ },
1467
+ "128183": {
1468
+ "content": "<|reserved_special_token_175|>",
1469
+ "lstrip": false,
1470
+ "normalized": false,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": true
1474
+ },
1475
+ "128184": {
1476
+ "content": "<|reserved_special_token_176|>",
1477
+ "lstrip": false,
1478
+ "normalized": false,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": true
1482
+ },
1483
+ "128185": {
1484
+ "content": "<|reserved_special_token_177|>",
1485
+ "lstrip": false,
1486
+ "normalized": false,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": true
1490
+ },
1491
+ "128186": {
1492
+ "content": "<|reserved_special_token_178|>",
1493
+ "lstrip": false,
1494
+ "normalized": false,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": true
1498
+ },
1499
+ "128187": {
1500
+ "content": "<|reserved_special_token_179|>",
1501
+ "lstrip": false,
1502
+ "normalized": false,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": true
1506
+ },
1507
+ "128188": {
1508
+ "content": "<|reserved_special_token_180|>",
1509
+ "lstrip": false,
1510
+ "normalized": false,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": true
1514
+ },
1515
+ "128189": {
1516
+ "content": "<|reserved_special_token_181|>",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": true
1522
+ },
1523
+ "128190": {
1524
+ "content": "<|reserved_special_token_182|>",
1525
+ "lstrip": false,
1526
+ "normalized": false,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": true
1530
+ },
1531
+ "128191": {
1532
+ "content": "<|reserved_special_token_183|>",
1533
+ "lstrip": false,
1534
+ "normalized": false,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": true
1538
+ },
1539
+ "128192": {
1540
+ "content": "<|reserved_special_token_184|>",
1541
+ "lstrip": false,
1542
+ "normalized": false,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": true
1546
+ },
1547
+ "128193": {
1548
+ "content": "<|reserved_special_token_185|>",
1549
+ "lstrip": false,
1550
+ "normalized": false,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": true
1554
+ },
1555
+ "128194": {
1556
+ "content": "<|reserved_special_token_186|>",
1557
+ "lstrip": false,
1558
+ "normalized": false,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": true
1562
+ },
1563
+ "128195": {
1564
+ "content": "<|reserved_special_token_187|>",
1565
+ "lstrip": false,
1566
+ "normalized": false,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": true
1570
+ },
1571
+ "128196": {
1572
+ "content": "<|reserved_special_token_188|>",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": true
1578
+ },
1579
+ "128197": {
1580
+ "content": "<|reserved_special_token_189|>",
1581
+ "lstrip": false,
1582
+ "normalized": false,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": true
1586
+ },
1587
+ "128198": {
1588
+ "content": "<|reserved_special_token_190|>",
1589
+ "lstrip": false,
1590
+ "normalized": false,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": true
1594
+ },
1595
+ "128199": {
1596
+ "content": "<|reserved_special_token_191|>",
1597
+ "lstrip": false,
1598
+ "normalized": false,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": true
1602
+ },
1603
+ "128200": {
1604
+ "content": "<|reserved_special_token_192|>",
1605
+ "lstrip": false,
1606
+ "normalized": false,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": true
1610
+ },
1611
+ "128201": {
1612
+ "content": "<|reserved_special_token_193|>",
1613
+ "lstrip": false,
1614
+ "normalized": false,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": true
1618
+ },
1619
+ "128202": {
1620
+ "content": "<|reserved_special_token_194|>",
1621
+ "lstrip": false,
1622
+ "normalized": false,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": true
1626
+ },
1627
+ "128203": {
1628
+ "content": "<|reserved_special_token_195|>",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": true
1634
+ },
1635
+ "128204": {
1636
+ "content": "<|reserved_special_token_196|>",
1637
+ "lstrip": false,
1638
+ "normalized": false,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": true
1642
+ },
1643
+ "128205": {
1644
+ "content": "<|reserved_special_token_197|>",
1645
+ "lstrip": false,
1646
+ "normalized": false,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": true
1650
+ },
1651
+ "128206": {
1652
+ "content": "<|reserved_special_token_198|>",
1653
+ "lstrip": false,
1654
+ "normalized": false,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": true
1658
+ },
1659
+ "128207": {
1660
+ "content": "<|reserved_special_token_199|>",
1661
+ "lstrip": false,
1662
+ "normalized": false,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": true
1666
+ },
1667
+ "128208": {
1668
+ "content": "<|reserved_special_token_200|>",
1669
+ "lstrip": false,
1670
+ "normalized": false,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": true
1674
+ },
1675
+ "128209": {
1676
+ "content": "<|reserved_special_token_201|>",
1677
+ "lstrip": false,
1678
+ "normalized": false,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": true
1682
+ },
1683
+ "128210": {
1684
+ "content": "<|reserved_special_token_202|>",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": true
1690
+ },
1691
+ "128211": {
1692
+ "content": "<|reserved_special_token_203|>",
1693
+ "lstrip": false,
1694
+ "normalized": false,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": true
1698
+ },
1699
+ "128212": {
1700
+ "content": "<|reserved_special_token_204|>",
1701
+ "lstrip": false,
1702
+ "normalized": false,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": true
1706
+ },
1707
+ "128213": {
1708
+ "content": "<|reserved_special_token_205|>",
1709
+ "lstrip": false,
1710
+ "normalized": false,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": true
1714
+ },
1715
+ "128214": {
1716
+ "content": "<|reserved_special_token_206|>",
1717
+ "lstrip": false,
1718
+ "normalized": false,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": true
1722
+ },
1723
+ "128215": {
1724
+ "content": "<|reserved_special_token_207|>",
1725
+ "lstrip": false,
1726
+ "normalized": false,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": true
1730
+ },
1731
+ "128216": {
1732
+ "content": "<|reserved_special_token_208|>",
1733
+ "lstrip": false,
1734
+ "normalized": false,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": true
1738
+ },
1739
+ "128217": {
1740
+ "content": "<|reserved_special_token_209|>",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": true
1746
+ },
1747
+ "128218": {
1748
+ "content": "<|reserved_special_token_210|>",
1749
+ "lstrip": false,
1750
+ "normalized": false,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": true
1754
+ },
1755
+ "128219": {
1756
+ "content": "<|reserved_special_token_211|>",
1757
+ "lstrip": false,
1758
+ "normalized": false,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": true
1762
+ },
1763
+ "128220": {
1764
+ "content": "<|reserved_special_token_212|>",
1765
+ "lstrip": false,
1766
+ "normalized": false,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": true
1770
+ },
1771
+ "128221": {
1772
+ "content": "<|reserved_special_token_213|>",
1773
+ "lstrip": false,
1774
+ "normalized": false,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": true
1778
+ },
1779
+ "128222": {
1780
+ "content": "<|reserved_special_token_214|>",
1781
+ "lstrip": false,
1782
+ "normalized": false,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": true
1786
+ },
1787
+ "128223": {
1788
+ "content": "<|reserved_special_token_215|>",
1789
+ "lstrip": false,
1790
+ "normalized": false,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": true
1794
+ },
1795
+ "128224": {
1796
+ "content": "<|reserved_special_token_216|>",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": true
1802
+ },
1803
+ "128225": {
1804
+ "content": "<|reserved_special_token_217|>",
1805
+ "lstrip": false,
1806
+ "normalized": false,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": true
1810
+ },
1811
+ "128226": {
1812
+ "content": "<|reserved_special_token_218|>",
1813
+ "lstrip": false,
1814
+ "normalized": false,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": true
1818
+ },
1819
+ "128227": {
1820
+ "content": "<|reserved_special_token_219|>",
1821
+ "lstrip": false,
1822
+ "normalized": false,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": true
1826
+ },
1827
+ "128228": {
1828
+ "content": "<|reserved_special_token_220|>",
1829
+ "lstrip": false,
1830
+ "normalized": false,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": true
1834
+ },
1835
+ "128229": {
1836
+ "content": "<|reserved_special_token_221|>",
1837
+ "lstrip": false,
1838
+ "normalized": false,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": true
1842
+ },
1843
+ "128230": {
1844
+ "content": "<|reserved_special_token_222|>",
1845
+ "lstrip": false,
1846
+ "normalized": false,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": true
1850
+ },
1851
+ "128231": {
1852
+ "content": "<|reserved_special_token_223|>",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": true
1858
+ },
1859
+ "128232": {
1860
+ "content": "<|reserved_special_token_224|>",
1861
+ "lstrip": false,
1862
+ "normalized": false,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": true
1866
+ },
1867
+ "128233": {
1868
+ "content": "<|reserved_special_token_225|>",
1869
+ "lstrip": false,
1870
+ "normalized": false,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": true
1874
+ },
1875
+ "128234": {
1876
+ "content": "<|reserved_special_token_226|>",
1877
+ "lstrip": false,
1878
+ "normalized": false,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": true
1882
+ },
1883
+ "128235": {
1884
+ "content": "<|reserved_special_token_227|>",
1885
+ "lstrip": false,
1886
+ "normalized": false,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": true
1890
+ },
1891
+ "128236": {
1892
+ "content": "<|reserved_special_token_228|>",
1893
+ "lstrip": false,
1894
+ "normalized": false,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": true
1898
+ },
1899
+ "128237": {
1900
+ "content": "<|reserved_special_token_229|>",
1901
+ "lstrip": false,
1902
+ "normalized": false,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": true
1906
+ },
1907
+ "128238": {
1908
+ "content": "<|reserved_special_token_230|>",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": true
1914
+ },
1915
+ "128239": {
1916
+ "content": "<|reserved_special_token_231|>",
1917
+ "lstrip": false,
1918
+ "normalized": false,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": true
1922
+ },
1923
+ "128240": {
1924
+ "content": "<|reserved_special_token_232|>",
1925
+ "lstrip": false,
1926
+ "normalized": false,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": true
1930
+ },
1931
+ "128241": {
1932
+ "content": "<|reserved_special_token_233|>",
1933
+ "lstrip": false,
1934
+ "normalized": false,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": true
1938
+ },
1939
+ "128242": {
1940
+ "content": "<|reserved_special_token_234|>",
1941
+ "lstrip": false,
1942
+ "normalized": false,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": true
1946
+ },
1947
+ "128243": {
1948
+ "content": "<|reserved_special_token_235|>",
1949
+ "lstrip": false,
1950
+ "normalized": false,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": true
1954
+ },
1955
+ "128244": {
1956
+ "content": "<|reserved_special_token_236|>",
1957
+ "lstrip": false,
1958
+ "normalized": false,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": true
1962
+ },
1963
+ "128245": {
1964
+ "content": "<|reserved_special_token_237|>",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": true
1970
+ },
1971
+ "128246": {
1972
+ "content": "<|reserved_special_token_238|>",
1973
+ "lstrip": false,
1974
+ "normalized": false,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": true
1978
+ },
1979
+ "128247": {
1980
+ "content": "<|reserved_special_token_239|>",
1981
+ "lstrip": false,
1982
+ "normalized": false,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": true
1986
+ },
1987
+ "128248": {
1988
+ "content": "<|reserved_special_token_240|>",
1989
+ "lstrip": false,
1990
+ "normalized": false,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": true
1994
+ },
1995
+ "128249": {
1996
+ "content": "<|reserved_special_token_241|>",
1997
+ "lstrip": false,
1998
+ "normalized": false,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": true
2002
+ },
2003
+ "128250": {
2004
+ "content": "<|reserved_special_token_242|>",
2005
+ "lstrip": false,
2006
+ "normalized": false,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": true
2010
+ },
2011
+ "128251": {
2012
+ "content": "<|reserved_special_token_243|>",
2013
+ "lstrip": false,
2014
+ "normalized": false,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": true
2018
+ },
2019
+ "128252": {
2020
+ "content": "<|reserved_special_token_244|>",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": true
2026
+ },
2027
+ "128253": {
2028
+ "content": "<|reserved_special_token_245|>",
2029
+ "lstrip": false,
2030
+ "normalized": false,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": true
2034
+ },
2035
+ "128254": {
2036
+ "content": "<|reserved_special_token_246|>",
2037
+ "lstrip": false,
2038
+ "normalized": false,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": true
2042
+ },
2043
+ "128255": {
2044
+ "content": "<|reserved_special_token_247|>",
2045
+ "lstrip": false,
2046
+ "normalized": false,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": true
2050
+ }
2051
+ },
2052
+ "bos_token": "<|begin_of_text|>",
2053
+ "clean_up_tokenization_spaces": true,
2054
+ "eos_token": "<|eot_id|>",
2055
+ "extra_special_tokens": {},
2056
+ "fix_mistral_regex": true,
2057
+ "model_input_names": [
2058
+ "input_ids",
2059
+ "attention_mask"
2060
+ ],
2061
+ "model_max_length": 8192,
2062
+ "pad_token": "<|eot_id|>",
2063
+ "tokenizer_class": "PreTrainedTokenizerFast"
2064
+ }
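
A minimal sketch of how this tokenizer_config.json is consumed, assuming the model is published under a loadable Hugging Face repo (the repo id below is a placeholder, not the actual path): the config registers ids 128114-128255 as reserved special tokens and reuses <|eot_id|> as both eos_token and pad_token.

    from transformers import AutoTokenizer

    # Placeholder repo id -- substitute the actual QWERKY AI model path.
    tok = AutoTokenizer.from_pretrained("qwerky-ai/llama-3.1-distilled")

    print(tok.bos_token)         # <|begin_of_text|>
    print(tok.eos_token)         # <|eot_id|>
    print(tok.pad_token)         # <|eot_id|> (pad aliases eos in this config)
    print(tok.model_max_length)  # 8192

    # Reserved special tokens map to single ids even though they are unused.
    print(tok.convert_tokens_to_ids("<|reserved_special_token_106|>"))  # 128114

Reusing <|eot_id|> as the pad token is a common choice for Llama-family checkpoints that ship without a dedicated pad token; the attention mask keeps padded positions from influencing the model.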