.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [gMASK]<sop>
2
+ {#- Only when `tools` is truthy: emit a system turn that lists every tool as JSON inside the tools tags and describes the tool_call output format. -#}{%- if tools -%}
3
+ <|system|>
4
+ # Tools
5
+
6
+ You may call one or more functions to assist with the user query.
7
+
8
+ You are provided with function signatures within <tools></tools> XML tags:
9
+ <tools>
10
+ {% for tool in tools %}
11
+ {{ tool | tojson(ensure_ascii=False) }}
12
+ {% endfor %}
13
+ </tools>
14
+
15
+ For each function call, output the function name and arguments within the following XML format:
16
+ <tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}
17
+ {%- macro visible_text(content) -%}{#- Render message content as plain text. A string is emitted unchanged; a non-mapping iterable contributes only its items that are mappings with type == 'text' (their text field) or bare strings; a null content renders as an empty string instead of the literal text None; any other value is emitted via its default string form. -#}
18
+ {%- if content is string -%}
19
+ {{- content }}
20
+ {%- elif content is iterable and content is not mapping -%}
21
+ {%- for item in content -%}
22
+ {%- if item is mapping and item.type == 'text' -%}
23
+ {{- item.text }}
24
+ {%- elif item is string -%}
25
+ {{- item }}
26
+ {%- endif -%}
27
+ {%- endfor -%}
28
+ {%- else -%}
29
+ {{- content if content is not none else '' }}
30
+ {%- endif -%}
31
+ {%- endmacro -%}
32
+ {#- Record the zero-based index of the last message whose role is 'user'; it stays -1 when there is none. A namespace object is used so the assignment made inside the loop is still visible after the loop. -#}{%- set ns = namespace(last_user_index=-1) %}
33
+ {%- for m in messages %}
34
+ {%- if m.role == 'user' %}
35
+ {%- set ns.last_user_index = loop.index0 -%}
36
+ {%- endif %}
37
+ {%- endfor %}
38
+ {%- for m in messages -%}{#- Emit every message in order, dispatching on m.role (user, assistant, tool, system); other roles produce no output. -#}
39
+ {%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}
40
+ {%- elif m.role == 'assistant' -%}{#- Assistant turn: reasoning is taken from m.reasoning_content when that is a string; otherwise, if the visible content contains a closing think tag, the text before it becomes the reasoning and the text after it becomes the content. -#}
41
+ <|assistant|>
42
+ {%- set reasoning_content = '' %}
43
+ {%- set content = visible_text(m.content) %}
44
+ {%- if m.reasoning_content is string %}
45
+ {%- set reasoning_content = m.reasoning_content %}
46
+ {%- else %}
47
+ {%- if '</think>' in content %}
48
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
49
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
50
+ {%- endif %}
51
+ {%- endif %}
52
+ {#- The reasoning is echoed only when it is non-empty and either clear_thinking is defined and falsy or this turn comes after the last user message; in every other case a bare closing think tag is emitted. -#}{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}
53
+ {{ '<think>' + reasoning_content.strip() + '</think>'}}
54
+ {%- else -%}
55
+ {{ '</think>' }}
56
+ {%- endif -%}
57
+ {%- if content.strip() -%}
58
+ {{ content.strip() }}
59
+ {%- endif -%}
60
+ {% if m.tool_calls %}
61
+ {% for tc in m.tool_calls %}
62
+ {#- A tool call may be wrapped in a function field or be flat; either way name and arguments are read from it. arguments must be a mapping because .items() is called on it; string values are emitted raw and all other values are JSON-encoded. -#}{%- if tc.function %}
63
+ {%- set tc = tc.function %}
64
+ {%- endif %}
65
+ {{- '<tool_call>' + tc.name -}}
66
+ {% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}
67
+ {% endif %}
68
+ {%- elif m.role == 'tool' -%}{#- Tool result. String content: the observation header is emitted only when the previous message is not also a tool message, then the content is wrapped in tool_response tags. Non-string content: the header is always emitted and each item is wrapped separately, using its output field when defined. -#}
69
+ {%- if m.content is string -%}
70
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
71
+ {{- '<|observation|>' }}
72
+ {%- endif %}
73
+ {{- '<tool_response>' }}
74
+ {{- m.content }}
75
+ {{- '</tool_response>' }}
76
+ {%- else -%}
77
+ <|observation|>{% for tr in m.content %}
78
+ <tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}
79
+ {% endif -%}
80
+ {%- elif m.role == 'system' -%}
81
+ <|system|>{{ visible_text(m.content) }}
82
+ {%- endif -%}
83
+ {%- endfor -%}
84
+ {#- When add_generation_prompt is truthy, open an assistant turn: it starts with a closing think tag if enable_thinking is defined and falsy, otherwise with an opening think tag. -#}{%- if add_generation_prompt -%}
85
+ <|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}}
86
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,875 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GlmMoeDsaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": [
10
+ 154820,
11
+ 154827,
12
+ 154829
13
+ ],
14
+ "ep_size": 1,
15
+ "first_k_dense_replace": 3,
16
+ "head_dim": 64,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 6144,
19
+ "index_head_dim": 128,
20
+ "index_n_heads": 32,
21
+ "index_topk": 2048,
22
+ "indexer_rope_interleave": true,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "kv_lora_rank": 512,
26
+ "max_position_embeddings": 202752,
27
+ "mlp_layer_types": [
28
+ "dense",
29
+ "dense",
30
+ "dense",
31
+ "sparse",
32
+ "sparse",
33
+ "sparse",
34
+ "sparse",
35
+ "sparse",
36
+ "sparse",
37
+ "sparse",
38
+ "sparse",
39
+ "sparse",
40
+ "sparse",
41
+ "sparse",
42
+ "sparse",
43
+ "sparse",
44
+ "sparse",
45
+ "sparse",
46
+ "sparse",
47
+ "sparse",
48
+ "sparse",
49
+ "sparse",
50
+ "sparse",
51
+ "sparse",
52
+ "sparse",
53
+ "sparse",
54
+ "sparse",
55
+ "sparse",
56
+ "sparse",
57
+ "sparse",
58
+ "sparse",
59
+ "sparse",
60
+ "sparse",
61
+ "sparse",
62
+ "sparse",
63
+ "sparse",
64
+ "sparse",
65
+ "sparse",
66
+ "sparse",
67
+ "sparse",
68
+ "sparse",
69
+ "sparse",
70
+ "sparse",
71
+ "sparse",
72
+ "sparse",
73
+ "sparse",
74
+ "sparse",
75
+ "sparse",
76
+ "sparse",
77
+ "sparse",
78
+ "sparse",
79
+ "sparse",
80
+ "sparse",
81
+ "sparse",
82
+ "sparse",
83
+ "sparse",
84
+ "sparse",
85
+ "sparse",
86
+ "sparse",
87
+ "sparse",
88
+ "sparse",
89
+ "sparse",
90
+ "sparse",
91
+ "sparse",
92
+ "sparse",
93
+ "sparse",
94
+ "sparse",
95
+ "sparse",
96
+ "sparse",
97
+ "sparse",
98
+ "sparse",
99
+ "sparse",
100
+ "sparse",
101
+ "sparse",
102
+ "sparse",
103
+ "sparse",
104
+ "sparse",
105
+ "sparse"
106
+ ],
107
+ "model_type": "glm_moe_dsa",
108
+ "moe_intermediate_size": 2048,
109
+ "moe_layer_freq": 1,
110
+ "n_group": 1,
111
+ "n_routed_experts": 256,
112
+ "n_shared_experts": 1,
113
+ "norm_topk_prob": true,
114
+ "num_attention_heads": 64,
115
+ "num_experts_per_tok": 8,
116
+ "num_hidden_layers": 78,
117
+ "num_key_value_heads": 64,
118
+ "num_nextn_predict_layers": 1,
119
+ "pad_token_id": 154820,
120
+ "pretraining_tp": 1,
121
+ "q_lora_rank": 2048,
122
+ "qk_head_dim": 256,
123
+ "qk_nope_head_dim": 192,
124
+ "qk_rope_head_dim": 64,
125
+ "quantization_config": {
126
+ "algo_config": null,
127
+ "exclude": [
128
+ "model.layers.0.self_attn.q_a_proj",
129
+ "model.layers.0.self_attn.q_b_proj",
130
+ "model.layers.0.self_attn.kv_a_proj_with_mqa",
131
+ "model.layers.0.self_attn.kv_b_proj",
132
+ "model.layers.0.self_attn.o_proj",
133
+ "model.layers.0.self_attn.indexer.wq_b",
134
+ "model.layers.0.self_attn.indexer.wk",
135
+ "model.layers.0.self_attn.indexer.weights_proj",
136
+ "model.layers.0.mlp.gate_proj",
137
+ "model.layers.0.mlp.up_proj",
138
+ "model.layers.0.mlp.down_proj",
139
+ "model.layers.1.self_attn.q_a_proj",
140
+ "model.layers.1.self_attn.q_b_proj",
141
+ "model.layers.1.self_attn.kv_a_proj_with_mqa",
142
+ "model.layers.1.self_attn.kv_b_proj",
143
+ "model.layers.1.self_attn.o_proj",
144
+ "model.layers.1.self_attn.indexer.wq_b",
145
+ "model.layers.1.self_attn.indexer.wk",
146
+ "model.layers.1.self_attn.indexer.weights_proj",
147
+ "model.layers.1.mlp.gate_proj",
148
+ "model.layers.1.mlp.up_proj",
149
+ "model.layers.1.mlp.down_proj",
150
+ "model.layers.2.self_attn.q_a_proj",
151
+ "model.layers.2.self_attn.q_b_proj",
152
+ "model.layers.2.self_attn.kv_a_proj_with_mqa",
153
+ "model.layers.2.self_attn.kv_b_proj",
154
+ "model.layers.2.self_attn.o_proj",
155
+ "model.layers.2.self_attn.indexer.wq_b",
156
+ "model.layers.2.self_attn.indexer.wk",
157
+ "model.layers.2.self_attn.indexer.weights_proj",
158
+ "model.layers.2.mlp.gate_proj",
159
+ "model.layers.2.mlp.up_proj",
160
+ "model.layers.2.mlp.down_proj",
161
+ "model.layers.3.self_attn.q_a_proj",
162
+ "model.layers.3.self_attn.q_b_proj",
163
+ "model.layers.3.self_attn.kv_a_proj_with_mqa",
164
+ "model.layers.3.self_attn.kv_b_proj",
165
+ "model.layers.3.self_attn.o_proj",
166
+ "model.layers.3.self_attn.indexer.wq_b",
167
+ "model.layers.3.self_attn.indexer.wk",
168
+ "model.layers.3.self_attn.indexer.weights_proj",
169
+ "model.layers.4.self_attn.q_a_proj",
170
+ "model.layers.4.self_attn.q_b_proj",
171
+ "model.layers.4.self_attn.kv_a_proj_with_mqa",
172
+ "model.layers.4.self_attn.kv_b_proj",
173
+ "model.layers.4.self_attn.o_proj",
174
+ "model.layers.4.self_attn.indexer.wq_b",
175
+ "model.layers.4.self_attn.indexer.wk",
176
+ "model.layers.4.self_attn.indexer.weights_proj",
177
+ "model.layers.5.self_attn.q_a_proj",
178
+ "model.layers.5.self_attn.q_b_proj",
179
+ "model.layers.5.self_attn.kv_a_proj_with_mqa",
180
+ "model.layers.5.self_attn.kv_b_proj",
181
+ "model.layers.5.self_attn.o_proj",
182
+ "model.layers.5.self_attn.indexer.wq_b",
183
+ "model.layers.5.self_attn.indexer.wk",
184
+ "model.layers.5.self_attn.indexer.weights_proj",
185
+ "model.layers.6.self_attn.q_a_proj",
186
+ "model.layers.6.self_attn.q_b_proj",
187
+ "model.layers.6.self_attn.kv_a_proj_with_mqa",
188
+ "model.layers.6.self_attn.kv_b_proj",
189
+ "model.layers.6.self_attn.o_proj",
190
+ "model.layers.6.self_attn.indexer.wq_b",
191
+ "model.layers.6.self_attn.indexer.wk",
192
+ "model.layers.6.self_attn.indexer.weights_proj",
193
+ "model.layers.7.self_attn.q_a_proj",
194
+ "model.layers.7.self_attn.q_b_proj",
195
+ "model.layers.7.self_attn.kv_a_proj_with_mqa",
196
+ "model.layers.7.self_attn.kv_b_proj",
197
+ "model.layers.7.self_attn.o_proj",
198
+ "model.layers.7.self_attn.indexer.wq_b",
199
+ "model.layers.7.self_attn.indexer.wk",
200
+ "model.layers.7.self_attn.indexer.weights_proj",
201
+ "model.layers.8.self_attn.q_a_proj",
202
+ "model.layers.8.self_attn.q_b_proj",
203
+ "model.layers.8.self_attn.kv_a_proj_with_mqa",
204
+ "model.layers.8.self_attn.kv_b_proj",
205
+ "model.layers.8.self_attn.o_proj",
206
+ "model.layers.8.self_attn.indexer.wq_b",
207
+ "model.layers.8.self_attn.indexer.wk",
208
+ "model.layers.8.self_attn.indexer.weights_proj",
209
+ "model.layers.9.self_attn.q_a_proj",
210
+ "model.layers.9.self_attn.q_b_proj",
211
+ "model.layers.9.self_attn.kv_a_proj_with_mqa",
212
+ "model.layers.9.self_attn.kv_b_proj",
213
+ "model.layers.9.self_attn.o_proj",
214
+ "model.layers.9.self_attn.indexer.wq_b",
215
+ "model.layers.9.self_attn.indexer.wk",
216
+ "model.layers.9.self_attn.indexer.weights_proj",
217
+ "model.layers.10.self_attn.q_a_proj",
218
+ "model.layers.10.self_attn.q_b_proj",
219
+ "model.layers.10.self_attn.kv_a_proj_with_mqa",
220
+ "model.layers.10.self_attn.kv_b_proj",
221
+ "model.layers.10.self_attn.o_proj",
222
+ "model.layers.10.self_attn.indexer.wq_b",
223
+ "model.layers.10.self_attn.indexer.wk",
224
+ "model.layers.10.self_attn.indexer.weights_proj",
225
+ "model.layers.11.self_attn.q_a_proj",
226
+ "model.layers.11.self_attn.q_b_proj",
227
+ "model.layers.11.self_attn.kv_a_proj_with_mqa",
228
+ "model.layers.11.self_attn.kv_b_proj",
229
+ "model.layers.11.self_attn.o_proj",
230
+ "model.layers.11.self_attn.indexer.wq_b",
231
+ "model.layers.11.self_attn.indexer.wk",
232
+ "model.layers.11.self_attn.indexer.weights_proj",
233
+ "model.layers.12.self_attn.q_a_proj",
234
+ "model.layers.12.self_attn.q_b_proj",
235
+ "model.layers.12.self_attn.kv_a_proj_with_mqa",
236
+ "model.layers.12.self_attn.kv_b_proj",
237
+ "model.layers.12.self_attn.o_proj",
238
+ "model.layers.12.self_attn.indexer.wq_b",
239
+ "model.layers.12.self_attn.indexer.wk",
240
+ "model.layers.12.self_attn.indexer.weights_proj",
241
+ "model.layers.13.self_attn.q_a_proj",
242
+ "model.layers.13.self_attn.q_b_proj",
243
+ "model.layers.13.self_attn.kv_a_proj_with_mqa",
244
+ "model.layers.13.self_attn.kv_b_proj",
245
+ "model.layers.13.self_attn.o_proj",
246
+ "model.layers.13.self_attn.indexer.wq_b",
247
+ "model.layers.13.self_attn.indexer.wk",
248
+ "model.layers.13.self_attn.indexer.weights_proj",
249
+ "model.layers.14.self_attn.q_a_proj",
250
+ "model.layers.14.self_attn.q_b_proj",
251
+ "model.layers.14.self_attn.kv_a_proj_with_mqa",
252
+ "model.layers.14.self_attn.kv_b_proj",
253
+ "model.layers.14.self_attn.o_proj",
254
+ "model.layers.14.self_attn.indexer.wq_b",
255
+ "model.layers.14.self_attn.indexer.wk",
256
+ "model.layers.14.self_attn.indexer.weights_proj",
257
+ "model.layers.15.self_attn.q_a_proj",
258
+ "model.layers.15.self_attn.q_b_proj",
259
+ "model.layers.15.self_attn.kv_a_proj_with_mqa",
260
+ "model.layers.15.self_attn.kv_b_proj",
261
+ "model.layers.15.self_attn.o_proj",
262
+ "model.layers.15.self_attn.indexer.wq_b",
263
+ "model.layers.15.self_attn.indexer.wk",
264
+ "model.layers.15.self_attn.indexer.weights_proj",
265
+ "model.layers.16.self_attn.q_a_proj",
266
+ "model.layers.16.self_attn.q_b_proj",
267
+ "model.layers.16.self_attn.kv_a_proj_with_mqa",
268
+ "model.layers.16.self_attn.kv_b_proj",
269
+ "model.layers.16.self_attn.o_proj",
270
+ "model.layers.16.self_attn.indexer.wq_b",
271
+ "model.layers.16.self_attn.indexer.wk",
272
+ "model.layers.16.self_attn.indexer.weights_proj",
273
+ "model.layers.17.self_attn.q_a_proj",
274
+ "model.layers.17.self_attn.q_b_proj",
275
+ "model.layers.17.self_attn.kv_a_proj_with_mqa",
276
+ "model.layers.17.self_attn.kv_b_proj",
277
+ "model.layers.17.self_attn.o_proj",
278
+ "model.layers.17.self_attn.indexer.wq_b",
279
+ "model.layers.17.self_attn.indexer.wk",
280
+ "model.layers.17.self_attn.indexer.weights_proj",
281
+ "model.layers.18.self_attn.q_a_proj",
282
+ "model.layers.18.self_attn.q_b_proj",
283
+ "model.layers.18.self_attn.kv_a_proj_with_mqa",
284
+ "model.layers.18.self_attn.kv_b_proj",
285
+ "model.layers.18.self_attn.o_proj",
286
+ "model.layers.18.self_attn.indexer.wq_b",
287
+ "model.layers.18.self_attn.indexer.wk",
288
+ "model.layers.18.self_attn.indexer.weights_proj",
289
+ "model.layers.19.self_attn.q_a_proj",
290
+ "model.layers.19.self_attn.q_b_proj",
291
+ "model.layers.19.self_attn.kv_a_proj_with_mqa",
292
+ "model.layers.19.self_attn.kv_b_proj",
293
+ "model.layers.19.self_attn.o_proj",
294
+ "model.layers.19.self_attn.indexer.wq_b",
295
+ "model.layers.19.self_attn.indexer.wk",
296
+ "model.layers.19.self_attn.indexer.weights_proj",
297
+ "model.layers.20.self_attn.q_a_proj",
298
+ "model.layers.20.self_attn.q_b_proj",
299
+ "model.layers.20.self_attn.kv_a_proj_with_mqa",
300
+ "model.layers.20.self_attn.kv_b_proj",
301
+ "model.layers.20.self_attn.o_proj",
302
+ "model.layers.20.self_attn.indexer.wq_b",
303
+ "model.layers.20.self_attn.indexer.wk",
304
+ "model.layers.20.self_attn.indexer.weights_proj",
305
+ "model.layers.21.self_attn.q_a_proj",
306
+ "model.layers.21.self_attn.q_b_proj",
307
+ "model.layers.21.self_attn.kv_a_proj_with_mqa",
308
+ "model.layers.21.self_attn.kv_b_proj",
309
+ "model.layers.21.self_attn.o_proj",
310
+ "model.layers.21.self_attn.indexer.wq_b",
311
+ "model.layers.21.self_attn.indexer.wk",
312
+ "model.layers.21.self_attn.indexer.weights_proj",
313
+ "model.layers.22.self_attn.q_a_proj",
314
+ "model.layers.22.self_attn.q_b_proj",
315
+ "model.layers.22.self_attn.kv_a_proj_with_mqa",
316
+ "model.layers.22.self_attn.kv_b_proj",
317
+ "model.layers.22.self_attn.o_proj",
318
+ "model.layers.22.self_attn.indexer.wq_b",
319
+ "model.layers.22.self_attn.indexer.wk",
320
+ "model.layers.22.self_attn.indexer.weights_proj",
321
+ "model.layers.23.self_attn.q_a_proj",
322
+ "model.layers.23.self_attn.q_b_proj",
323
+ "model.layers.23.self_attn.kv_a_proj_with_mqa",
324
+ "model.layers.23.self_attn.kv_b_proj",
325
+ "model.layers.23.self_attn.o_proj",
326
+ "model.layers.23.self_attn.indexer.wq_b",
327
+ "model.layers.23.self_attn.indexer.wk",
328
+ "model.layers.23.self_attn.indexer.weights_proj",
329
+ "model.layers.24.self_attn.q_a_proj",
330
+ "model.layers.24.self_attn.q_b_proj",
331
+ "model.layers.24.self_attn.kv_a_proj_with_mqa",
332
+ "model.layers.24.self_attn.kv_b_proj",
333
+ "model.layers.24.self_attn.o_proj",
334
+ "model.layers.24.self_attn.indexer.wq_b",
335
+ "model.layers.24.self_attn.indexer.wk",
336
+ "model.layers.24.self_attn.indexer.weights_proj",
337
+ "model.layers.25.self_attn.q_a_proj",
338
+ "model.layers.25.self_attn.q_b_proj",
339
+ "model.layers.25.self_attn.kv_a_proj_with_mqa",
340
+ "model.layers.25.self_attn.kv_b_proj",
341
+ "model.layers.25.self_attn.o_proj",
342
+ "model.layers.25.self_attn.indexer.wq_b",
343
+ "model.layers.25.self_attn.indexer.wk",
344
+ "model.layers.25.self_attn.indexer.weights_proj",
345
+ "model.layers.26.self_attn.q_a_proj",
346
+ "model.layers.26.self_attn.q_b_proj",
347
+ "model.layers.26.self_attn.kv_a_proj_with_mqa",
348
+ "model.layers.26.self_attn.kv_b_proj",
349
+ "model.layers.26.self_attn.o_proj",
350
+ "model.layers.26.self_attn.indexer.wq_b",
351
+ "model.layers.26.self_attn.indexer.wk",
352
+ "model.layers.26.self_attn.indexer.weights_proj",
353
+ "model.layers.27.self_attn.q_a_proj",
354
+ "model.layers.27.self_attn.q_b_proj",
355
+ "model.layers.27.self_attn.kv_a_proj_with_mqa",
356
+ "model.layers.27.self_attn.kv_b_proj",
357
+ "model.layers.27.self_attn.o_proj",
358
+ "model.layers.27.self_attn.indexer.wq_b",
359
+ "model.layers.27.self_attn.indexer.wk",
360
+ "model.layers.27.self_attn.indexer.weights_proj",
361
+ "model.layers.28.self_attn.q_a_proj",
362
+ "model.layers.28.self_attn.q_b_proj",
363
+ "model.layers.28.self_attn.kv_a_proj_with_mqa",
364
+ "model.layers.28.self_attn.kv_b_proj",
365
+ "model.layers.28.self_attn.o_proj",
366
+ "model.layers.28.self_attn.indexer.wq_b",
367
+ "model.layers.28.self_attn.indexer.wk",
368
+ "model.layers.28.self_attn.indexer.weights_proj",
369
+ "model.layers.29.self_attn.q_a_proj",
370
+ "model.layers.29.self_attn.q_b_proj",
371
+ "model.layers.29.self_attn.kv_a_proj_with_mqa",
372
+ "model.layers.29.self_attn.kv_b_proj",
373
+ "model.layers.29.self_attn.o_proj",
374
+ "model.layers.29.self_attn.indexer.wq_b",
375
+ "model.layers.29.self_attn.indexer.wk",
376
+ "model.layers.29.self_attn.indexer.weights_proj",
377
+ "model.layers.30.self_attn.q_a_proj",
378
+ "model.layers.30.self_attn.q_b_proj",
379
+ "model.layers.30.self_attn.kv_a_proj_with_mqa",
380
+ "model.layers.30.self_attn.kv_b_proj",
381
+ "model.layers.30.self_attn.o_proj",
382
+ "model.layers.30.self_attn.indexer.wq_b",
383
+ "model.layers.30.self_attn.indexer.wk",
384
+ "model.layers.30.self_attn.indexer.weights_proj",
385
+ "model.layers.31.self_attn.q_a_proj",
386
+ "model.layers.31.self_attn.q_b_proj",
387
+ "model.layers.31.self_attn.kv_a_proj_with_mqa",
388
+ "model.layers.31.self_attn.kv_b_proj",
389
+ "model.layers.31.self_attn.o_proj",
390
+ "model.layers.31.self_attn.indexer.wq_b",
391
+ "model.layers.31.self_attn.indexer.wk",
392
+ "model.layers.31.self_attn.indexer.weights_proj",
393
+ "model.layers.32.self_attn.q_a_proj",
394
+ "model.layers.32.self_attn.q_b_proj",
395
+ "model.layers.32.self_attn.kv_a_proj_with_mqa",
396
+ "model.layers.32.self_attn.kv_b_proj",
397
+ "model.layers.32.self_attn.o_proj",
398
+ "model.layers.32.self_attn.indexer.wq_b",
399
+ "model.layers.32.self_attn.indexer.wk",
400
+ "model.layers.32.self_attn.indexer.weights_proj",
401
+ "model.layers.33.self_attn.q_a_proj",
402
+ "model.layers.33.self_attn.q_b_proj",
403
+ "model.layers.33.self_attn.kv_a_proj_with_mqa",
404
+ "model.layers.33.self_attn.kv_b_proj",
405
+ "model.layers.33.self_attn.o_proj",
406
+ "model.layers.33.self_attn.indexer.wq_b",
407
+ "model.layers.33.self_attn.indexer.wk",
408
+ "model.layers.33.self_attn.indexer.weights_proj",
409
+ "model.layers.34.self_attn.q_a_proj",
410
+ "model.layers.34.self_attn.q_b_proj",
411
+ "model.layers.34.self_attn.kv_a_proj_with_mqa",
412
+ "model.layers.34.self_attn.kv_b_proj",
413
+ "model.layers.34.self_attn.o_proj",
414
+ "model.layers.34.self_attn.indexer.wq_b",
415
+ "model.layers.34.self_attn.indexer.wk",
416
+ "model.layers.34.self_attn.indexer.weights_proj",
417
+ "model.layers.35.self_attn.q_a_proj",
418
+ "model.layers.35.self_attn.q_b_proj",
419
+ "model.layers.35.self_attn.kv_a_proj_with_mqa",
420
+ "model.layers.35.self_attn.kv_b_proj",
421
+ "model.layers.35.self_attn.o_proj",
422
+ "model.layers.35.self_attn.indexer.wq_b",
423
+ "model.layers.35.self_attn.indexer.wk",
424
+ "model.layers.35.self_attn.indexer.weights_proj",
425
+ "model.layers.36.self_attn.q_a_proj",
426
+ "model.layers.36.self_attn.q_b_proj",
427
+ "model.layers.36.self_attn.kv_a_proj_with_mqa",
428
+ "model.layers.36.self_attn.kv_b_proj",
429
+ "model.layers.36.self_attn.o_proj",
430
+ "model.layers.36.self_attn.indexer.wq_b",
431
+ "model.layers.36.self_attn.indexer.wk",
432
+ "model.layers.36.self_attn.indexer.weights_proj",
433
+ "model.layers.37.self_attn.q_a_proj",
434
+ "model.layers.37.self_attn.q_b_proj",
435
+ "model.layers.37.self_attn.kv_a_proj_with_mqa",
436
+ "model.layers.37.self_attn.kv_b_proj",
437
+ "model.layers.37.self_attn.o_proj",
438
+ "model.layers.37.self_attn.indexer.wq_b",
439
+ "model.layers.37.self_attn.indexer.wk",
440
+ "model.layers.37.self_attn.indexer.weights_proj",
441
+ "model.layers.38.self_attn.q_a_proj",
442
+ "model.layers.38.self_attn.q_b_proj",
443
+ "model.layers.38.self_attn.kv_a_proj_with_mqa",
444
+ "model.layers.38.self_attn.kv_b_proj",
445
+ "model.layers.38.self_attn.o_proj",
446
+ "model.layers.38.self_attn.indexer.wq_b",
447
+ "model.layers.38.self_attn.indexer.wk",
448
+ "model.layers.38.self_attn.indexer.weights_proj",
449
+ "model.layers.39.self_attn.q_a_proj",
450
+ "model.layers.39.self_attn.q_b_proj",
451
+ "model.layers.39.self_attn.kv_a_proj_with_mqa",
452
+ "model.layers.39.self_attn.kv_b_proj",
453
+ "model.layers.39.self_attn.o_proj",
454
+ "model.layers.39.self_attn.indexer.wq_b",
455
+ "model.layers.39.self_attn.indexer.wk",
456
+ "model.layers.39.self_attn.indexer.weights_proj",
457
+ "model.layers.40.self_attn.q_a_proj",
458
+ "model.layers.40.self_attn.q_b_proj",
459
+ "model.layers.40.self_attn.kv_a_proj_with_mqa",
460
+ "model.layers.40.self_attn.kv_b_proj",
461
+ "model.layers.40.self_attn.o_proj",
462
+ "model.layers.40.self_attn.indexer.wq_b",
463
+ "model.layers.40.self_attn.indexer.wk",
464
+ "model.layers.40.self_attn.indexer.weights_proj",
465
+ "model.layers.41.self_attn.q_a_proj",
466
+ "model.layers.41.self_attn.q_b_proj",
467
+ "model.layers.41.self_attn.kv_a_proj_with_mqa",
468
+ "model.layers.41.self_attn.kv_b_proj",
469
+ "model.layers.41.self_attn.o_proj",
470
+ "model.layers.41.self_attn.indexer.wq_b",
471
+ "model.layers.41.self_attn.indexer.wk",
472
+ "model.layers.41.self_attn.indexer.weights_proj",
473
+ "model.layers.42.self_attn.q_a_proj",
474
+ "model.layers.42.self_attn.q_b_proj",
475
+ "model.layers.42.self_attn.kv_a_proj_with_mqa",
476
+ "model.layers.42.self_attn.kv_b_proj",
477
+ "model.layers.42.self_attn.o_proj",
478
+ "model.layers.42.self_attn.indexer.wq_b",
479
+ "model.layers.42.self_attn.indexer.wk",
480
+ "model.layers.42.self_attn.indexer.weights_proj",
481
+ "model.layers.43.self_attn.q_a_proj",
482
+ "model.layers.43.self_attn.q_b_proj",
483
+ "model.layers.43.self_attn.kv_a_proj_with_mqa",
484
+ "model.layers.43.self_attn.kv_b_proj",
485
+ "model.layers.43.self_attn.o_proj",
486
+ "model.layers.43.self_attn.indexer.wq_b",
487
+ "model.layers.43.self_attn.indexer.wk",
488
+ "model.layers.43.self_attn.indexer.weights_proj",
489
+ "model.layers.44.self_attn.q_a_proj",
490
+ "model.layers.44.self_attn.q_b_proj",
491
+ "model.layers.44.self_attn.kv_a_proj_with_mqa",
492
+ "model.layers.44.self_attn.kv_b_proj",
493
+ "model.layers.44.self_attn.o_proj",
494
+ "model.layers.44.self_attn.indexer.wq_b",
495
+ "model.layers.44.self_attn.indexer.wk",
496
+ "model.layers.44.self_attn.indexer.weights_proj",
497
+ "model.layers.45.self_attn.q_a_proj",
498
+ "model.layers.45.self_attn.q_b_proj",
499
+ "model.layers.45.self_attn.kv_a_proj_with_mqa",
500
+ "model.layers.45.self_attn.kv_b_proj",
501
+ "model.layers.45.self_attn.o_proj",
502
+ "model.layers.45.self_attn.indexer.wq_b",
503
+ "model.layers.45.self_attn.indexer.wk",
504
+ "model.layers.45.self_attn.indexer.weights_proj",
505
+ "model.layers.46.self_attn.q_a_proj",
506
+ "model.layers.46.self_attn.q_b_proj",
507
+ "model.layers.46.self_attn.kv_a_proj_with_mqa",
508
+ "model.layers.46.self_attn.kv_b_proj",
509
+ "model.layers.46.self_attn.o_proj",
510
+ "model.layers.46.self_attn.indexer.wq_b",
511
+ "model.layers.46.self_attn.indexer.wk",
512
+ "model.layers.46.self_attn.indexer.weights_proj",
513
+ "model.layers.47.self_attn.q_a_proj",
514
+ "model.layers.47.self_attn.q_b_proj",
515
+ "model.layers.47.self_attn.kv_a_proj_with_mqa",
516
+ "model.layers.47.self_attn.kv_b_proj",
517
+ "model.layers.47.self_attn.o_proj",
518
+ "model.layers.47.self_attn.indexer.wq_b",
519
+ "model.layers.47.self_attn.indexer.wk",
520
+ "model.layers.47.self_attn.indexer.weights_proj",
521
+ "model.layers.48.self_attn.q_a_proj",
522
+ "model.layers.48.self_attn.q_b_proj",
523
+ "model.layers.48.self_attn.kv_a_proj_with_mqa",
524
+ "model.layers.48.self_attn.kv_b_proj",
525
+ "model.layers.48.self_attn.o_proj",
526
+ "model.layers.48.self_attn.indexer.wq_b",
527
+ "model.layers.48.self_attn.indexer.wk",
528
+ "model.layers.48.self_attn.indexer.weights_proj",
529
+ "model.layers.49.self_attn.q_a_proj",
530
+ "model.layers.49.self_attn.q_b_proj",
531
+ "model.layers.49.self_attn.kv_a_proj_with_mqa",
532
+ "model.layers.49.self_attn.kv_b_proj",
533
+ "model.layers.49.self_attn.o_proj",
534
+ "model.layers.49.self_attn.indexer.wq_b",
535
+ "model.layers.49.self_attn.indexer.wk",
536
+ "model.layers.49.self_attn.indexer.weights_proj",
537
+ "model.layers.50.self_attn.q_a_proj",
538
+ "model.layers.50.self_attn.q_b_proj",
539
+ "model.layers.50.self_attn.kv_a_proj_with_mqa",
540
+ "model.layers.50.self_attn.kv_b_proj",
541
+ "model.layers.50.self_attn.o_proj",
542
+ "model.layers.50.self_attn.indexer.wq_b",
543
+ "model.layers.50.self_attn.indexer.wk",
544
+ "model.layers.50.self_attn.indexer.weights_proj",
545
+ "model.layers.51.self_attn.q_a_proj",
546
+ "model.layers.51.self_attn.q_b_proj",
547
+ "model.layers.51.self_attn.kv_a_proj_with_mqa",
548
+ "model.layers.51.self_attn.kv_b_proj",
549
+ "model.layers.51.self_attn.o_proj",
550
+ "model.layers.51.self_attn.indexer.wq_b",
551
+ "model.layers.51.self_attn.indexer.wk",
552
+ "model.layers.51.self_attn.indexer.weights_proj",
553
+ "model.layers.52.self_attn.q_a_proj",
554
+ "model.layers.52.self_attn.q_b_proj",
555
+ "model.layers.52.self_attn.kv_a_proj_with_mqa",
556
+ "model.layers.52.self_attn.kv_b_proj",
557
+ "model.layers.52.self_attn.o_proj",
558
+ "model.layers.52.self_attn.indexer.wq_b",
559
+ "model.layers.52.self_attn.indexer.wk",
560
+ "model.layers.52.self_attn.indexer.weights_proj",
561
+ "model.layers.53.self_attn.q_a_proj",
562
+ "model.layers.53.self_attn.q_b_proj",
563
+ "model.layers.53.self_attn.kv_a_proj_with_mqa",
564
+ "model.layers.53.self_attn.kv_b_proj",
565
+ "model.layers.53.self_attn.o_proj",
566
+ "model.layers.53.self_attn.indexer.wq_b",
567
+ "model.layers.53.self_attn.indexer.wk",
568
+ "model.layers.53.self_attn.indexer.weights_proj",
569
+ "model.layers.54.self_attn.q_a_proj",
570
+ "model.layers.54.self_attn.q_b_proj",
571
+ "model.layers.54.self_attn.kv_a_proj_with_mqa",
572
+ "model.layers.54.self_attn.kv_b_proj",
573
+ "model.layers.54.self_attn.o_proj",
574
+ "model.layers.54.self_attn.indexer.wq_b",
575
+ "model.layers.54.self_attn.indexer.wk",
576
+ "model.layers.54.self_attn.indexer.weights_proj",
577
+ "model.layers.55.self_attn.q_a_proj",
578
+ "model.layers.55.self_attn.q_b_proj",
579
+ "model.layers.55.self_attn.kv_a_proj_with_mqa",
580
+ "model.layers.55.self_attn.kv_b_proj",
581
+ "model.layers.55.self_attn.o_proj",
582
+ "model.layers.55.self_attn.indexer.wq_b",
583
+ "model.layers.55.self_attn.indexer.wk",
584
+ "model.layers.55.self_attn.indexer.weights_proj",
585
+ "model.layers.56.self_attn.q_a_proj",
586
+ "model.layers.56.self_attn.q_b_proj",
587
+ "model.layers.56.self_attn.kv_a_proj_with_mqa",
588
+ "model.layers.56.self_attn.kv_b_proj",
589
+ "model.layers.56.self_attn.o_proj",
590
+ "model.layers.56.self_attn.indexer.wq_b",
591
+ "model.layers.56.self_attn.indexer.wk",
592
+ "model.layers.56.self_attn.indexer.weights_proj",
593
+ "model.layers.57.self_attn.q_a_proj",
594
+ "model.layers.57.self_attn.q_b_proj",
595
+ "model.layers.57.self_attn.kv_a_proj_with_mqa",
596
+ "model.layers.57.self_attn.kv_b_proj",
597
+ "model.layers.57.self_attn.o_proj",
598
+ "model.layers.57.self_attn.indexer.wq_b",
599
+ "model.layers.57.self_attn.indexer.wk",
600
+ "model.layers.57.self_attn.indexer.weights_proj",
601
+ "model.layers.58.self_attn.q_a_proj",
602
+ "model.layers.58.self_attn.q_b_proj",
603
+ "model.layers.58.self_attn.kv_a_proj_with_mqa",
604
+ "model.layers.58.self_attn.kv_b_proj",
605
+ "model.layers.58.self_attn.o_proj",
606
+ "model.layers.58.self_attn.indexer.wq_b",
607
+ "model.layers.58.self_attn.indexer.wk",
608
+ "model.layers.58.self_attn.indexer.weights_proj",
609
+ "model.layers.59.self_attn.q_a_proj",
610
+ "model.layers.59.self_attn.q_b_proj",
611
+ "model.layers.59.self_attn.kv_a_proj_with_mqa",
612
+ "model.layers.59.self_attn.kv_b_proj",
613
+ "model.layers.59.self_attn.o_proj",
614
+ "model.layers.59.self_attn.indexer.wq_b",
615
+ "model.layers.59.self_attn.indexer.wk",
616
+ "model.layers.59.self_attn.indexer.weights_proj",
617
+ "model.layers.60.self_attn.q_a_proj",
618
+ "model.layers.60.self_attn.q_b_proj",
619
+ "model.layers.60.self_attn.kv_a_proj_with_mqa",
620
+ "model.layers.60.self_attn.kv_b_proj",
621
+ "model.layers.60.self_attn.o_proj",
622
+ "model.layers.60.self_attn.indexer.wq_b",
623
+ "model.layers.60.self_attn.indexer.wk",
624
+ "model.layers.60.self_attn.indexer.weights_proj",
625
+ "model.layers.61.self_attn.q_a_proj",
626
+ "model.layers.61.self_attn.q_b_proj",
627
+ "model.layers.61.self_attn.kv_a_proj_with_mqa",
628
+ "model.layers.61.self_attn.kv_b_proj",
629
+ "model.layers.61.self_attn.o_proj",
630
+ "model.layers.61.self_attn.indexer.wq_b",
631
+ "model.layers.61.self_attn.indexer.wk",
632
+ "model.layers.61.self_attn.indexer.weights_proj",
633
+ "model.layers.62.self_attn.q_a_proj",
634
+ "model.layers.62.self_attn.q_b_proj",
635
+ "model.layers.62.self_attn.kv_a_proj_with_mqa",
636
+ "model.layers.62.self_attn.kv_b_proj",
637
+ "model.layers.62.self_attn.o_proj",
638
+ "model.layers.62.self_attn.indexer.wq_b",
639
+ "model.layers.62.self_attn.indexer.wk",
640
+ "model.layers.62.self_attn.indexer.weights_proj",
641
+ "model.layers.63.self_attn.q_a_proj",
642
+ "model.layers.63.self_attn.q_b_proj",
643
+ "model.layers.63.self_attn.kv_a_proj_with_mqa",
644
+ "model.layers.63.self_attn.kv_b_proj",
645
+ "model.layers.63.self_attn.o_proj",
646
+ "model.layers.63.self_attn.indexer.wq_b",
647
+ "model.layers.63.self_attn.indexer.wk",
648
+ "model.layers.63.self_attn.indexer.weights_proj",
649
+ "model.layers.64.self_attn.q_a_proj",
650
+ "model.layers.64.self_attn.q_b_proj",
651
+ "model.layers.64.self_attn.kv_a_proj_with_mqa",
652
+ "model.layers.64.self_attn.kv_b_proj",
653
+ "model.layers.64.self_attn.o_proj",
654
+ "model.layers.64.self_attn.indexer.wq_b",
655
+ "model.layers.64.self_attn.indexer.wk",
656
+ "model.layers.64.self_attn.indexer.weights_proj",
657
+ "model.layers.65.self_attn.q_a_proj",
658
+ "model.layers.65.self_attn.q_b_proj",
659
+ "model.layers.65.self_attn.kv_a_proj_with_mqa",
660
+ "model.layers.65.self_attn.kv_b_proj",
661
+ "model.layers.65.self_attn.o_proj",
662
+ "model.layers.65.self_attn.indexer.wq_b",
663
+ "model.layers.65.self_attn.indexer.wk",
664
+ "model.layers.65.self_attn.indexer.weights_proj",
665
+ "model.layers.66.self_attn.q_a_proj",
666
+ "model.layers.66.self_attn.q_b_proj",
667
+ "model.layers.66.self_attn.kv_a_proj_with_mqa",
668
+ "model.layers.66.self_attn.kv_b_proj",
669
+ "model.layers.66.self_attn.o_proj",
670
+ "model.layers.66.self_attn.indexer.wq_b",
671
+ "model.layers.66.self_attn.indexer.wk",
672
+ "model.layers.66.self_attn.indexer.weights_proj",
673
+ "model.layers.67.self_attn.q_a_proj",
674
+ "model.layers.67.self_attn.q_b_proj",
675
+ "model.layers.67.self_attn.kv_a_proj_with_mqa",
676
+ "model.layers.67.self_attn.kv_b_proj",
677
+ "model.layers.67.self_attn.o_proj",
678
+ "model.layers.67.self_attn.indexer.wq_b",
679
+ "model.layers.67.self_attn.indexer.wk",
680
+ "model.layers.67.self_attn.indexer.weights_proj",
681
+ "model.layers.68.self_attn.q_a_proj",
682
+ "model.layers.68.self_attn.q_b_proj",
683
+ "model.layers.68.self_attn.kv_a_proj_with_mqa",
684
+ "model.layers.68.self_attn.kv_b_proj",
685
+ "model.layers.68.self_attn.o_proj",
686
+ "model.layers.68.self_attn.indexer.wq_b",
687
+ "model.layers.68.self_attn.indexer.wk",
688
+ "model.layers.68.self_attn.indexer.weights_proj",
689
+ "model.layers.69.self_attn.q_a_proj",
690
+ "model.layers.69.self_attn.q_b_proj",
691
+ "model.layers.69.self_attn.kv_a_proj_with_mqa",
692
+ "model.layers.69.self_attn.kv_b_proj",
693
+ "model.layers.69.self_attn.o_proj",
694
+ "model.layers.69.self_attn.indexer.wq_b",
695
+ "model.layers.69.self_attn.indexer.wk",
696
+ "model.layers.69.self_attn.indexer.weights_proj",
697
+ "model.layers.70.self_attn.q_a_proj",
698
+ "model.layers.70.self_attn.q_b_proj",
699
+ "model.layers.70.self_attn.kv_a_proj_with_mqa",
700
+ "model.layers.70.self_attn.kv_b_proj",
701
+ "model.layers.70.self_attn.o_proj",
702
+ "model.layers.70.self_attn.indexer.wq_b",
703
+ "model.layers.70.self_attn.indexer.wk",
704
+ "model.layers.70.self_attn.indexer.weights_proj",
705
+ "model.layers.71.self_attn.q_a_proj",
706
+ "model.layers.71.self_attn.q_b_proj",
707
+ "model.layers.71.self_attn.kv_a_proj_with_mqa",
708
+ "model.layers.71.self_attn.kv_b_proj",
709
+ "model.layers.71.self_attn.o_proj",
710
+ "model.layers.71.self_attn.indexer.wq_b",
711
+ "model.layers.71.self_attn.indexer.wk",
712
+ "model.layers.71.self_attn.indexer.weights_proj",
713
+ "model.layers.72.self_attn.q_a_proj",
714
+ "model.layers.72.self_attn.q_b_proj",
715
+ "model.layers.72.self_attn.kv_a_proj_with_mqa",
716
+ "model.layers.72.self_attn.kv_b_proj",
717
+ "model.layers.72.self_attn.o_proj",
718
+ "model.layers.72.self_attn.indexer.wq_b",
719
+ "model.layers.72.self_attn.indexer.wk",
720
+ "model.layers.72.self_attn.indexer.weights_proj",
721
+ "model.layers.73.self_attn.q_a_proj",
722
+ "model.layers.73.self_attn.q_b_proj",
723
+ "model.layers.73.self_attn.kv_a_proj_with_mqa",
724
+ "model.layers.73.self_attn.kv_b_proj",
725
+ "model.layers.73.self_attn.o_proj",
726
+ "model.layers.73.self_attn.indexer.wq_b",
727
+ "model.layers.73.self_attn.indexer.wk",
728
+ "model.layers.73.self_attn.indexer.weights_proj",
729
+ "model.layers.74.self_attn.q_a_proj",
730
+ "model.layers.74.self_attn.q_b_proj",
731
+ "model.layers.74.self_attn.kv_a_proj_with_mqa",
732
+ "model.layers.74.self_attn.kv_b_proj",
733
+ "model.layers.74.self_attn.o_proj",
734
+ "model.layers.74.self_attn.indexer.wq_b",
735
+ "model.layers.74.self_attn.indexer.wk",
736
+ "model.layers.74.self_attn.indexer.weights_proj",
737
+ "model.layers.75.self_attn.q_a_proj",
738
+ "model.layers.75.self_attn.q_b_proj",
739
+ "model.layers.75.self_attn.kv_a_proj_with_mqa",
740
+ "model.layers.75.self_attn.kv_b_proj",
741
+ "model.layers.75.self_attn.o_proj",
742
+ "model.layers.75.self_attn.indexer.wq_b",
743
+ "model.layers.75.self_attn.indexer.wk",
744
+ "model.layers.75.self_attn.indexer.weights_proj",
745
+ "model.layers.76.self_attn.q_a_proj",
746
+ "model.layers.76.self_attn.q_b_proj",
747
+ "model.layers.76.self_attn.kv_a_proj_with_mqa",
748
+ "model.layers.76.self_attn.kv_b_proj",
749
+ "model.layers.76.self_attn.o_proj",
750
+ "model.layers.76.self_attn.indexer.wq_b",
751
+ "model.layers.76.self_attn.indexer.wk",
752
+ "model.layers.76.self_attn.indexer.weights_proj",
753
+ "model.layers.77.self_attn.q_a_proj",
754
+ "model.layers.77.self_attn.q_b_proj",
755
+ "model.layers.77.self_attn.kv_a_proj_with_mqa",
756
+ "model.layers.77.self_attn.kv_b_proj",
757
+ "model.layers.77.self_attn.o_proj",
758
+ "model.layers.77.self_attn.indexer.wq_b",
759
+ "model.layers.77.self_attn.indexer.wk",
760
+ "model.layers.77.self_attn.indexer.weights_proj",
761
+ "lm_head"
762
+ ],
763
+ "export": {
764
+ "kv_cache_group": [],
765
+ "min_kv_scale": 0.0,
766
+ "pack_method": "reorder",
767
+ "weight_format": "real_quantized",
768
+ "weight_merge_groups": null
769
+ },
770
+ "global_quant_config": {
771
+ "bias": null,
772
+ "input_tensors": [
773
+ {
774
+ "block_size": null,
775
+ "ch_axis": -1,
776
+ "dtype": "fp4",
777
+ "enable_buffer_reuse": false,
778
+ "group_size": 16,
779
+ "is_dynamic": true,
780
+ "is_scale_quant": false,
781
+ "max_input_numel": 4194304,
782
+ "mx_element_dtype": null,
783
+ "observer_cls": "PerBlockMXObserver",
784
+ "qscheme": "per_group",
785
+ "round_method": "half_even",
786
+ "scale_calculation_mode": null,
787
+ "scale_format": "float32",
788
+ "scale_type": "float32",
789
+ "symmetric": null
790
+ },
791
+ {
792
+ "block_size": null,
793
+ "ch_axis": null,
794
+ "dtype": "fp8_e4m3",
795
+ "enable_buffer_reuse": false,
796
+ "group_size": null,
797
+ "is_dynamic": false,
798
+ "is_scale_quant": true,
799
+ "max_input_numel": 4194304,
800
+ "mx_element_dtype": null,
801
+ "observer_cls": "PerTensorMinMaxObserver",
802
+ "qscheme": "per_tensor",
803
+ "round_method": "half_even",
804
+ "scale_calculation_mode": null,
805
+ "scale_format": null,
806
+ "scale_type": "float32",
807
+ "symmetric": true
808
+ }
809
+ ],
810
+ "output_tensors": null,
811
+ "target_device": null,
812
+ "weight": [
813
+ {
814
+ "block_size": null,
815
+ "ch_axis": -1,
816
+ "dtype": "fp4",
817
+ "enable_buffer_reuse": false,
818
+ "group_size": 16,
819
+ "is_dynamic": false,
820
+ "is_scale_quant": false,
821
+ "max_input_numel": 4194304,
822
+ "mx_element_dtype": null,
823
+ "observer_cls": "PerBlockMXObserver",
824
+ "qscheme": "per_group",
825
+ "round_method": "half_even",
826
+ "scale_calculation_mode": null,
827
+ "scale_format": "float32",
828
+ "scale_type": "float32",
829
+ "symmetric": null
830
+ },
831
+ {
832
+ "block_size": null,
833
+ "ch_axis": null,
834
+ "dtype": "fp8_e4m3",
835
+ "enable_buffer_reuse": false,
836
+ "group_size": null,
837
+ "is_dynamic": false,
838
+ "is_scale_quant": true,
839
+ "max_input_numel": 4194304,
840
+ "mx_element_dtype": null,
841
+ "observer_cls": "PerTensorMinMaxObserver",
842
+ "qscheme": "per_tensor",
843
+ "round_method": "half_even",
844
+ "scale_calculation_mode": null,
845
+ "scale_format": null,
846
+ "scale_type": "float32",
847
+ "symmetric": true
848
+ }
849
+ ]
850
+ },
851
+ "kv_cache_post_rope": false,
852
+ "kv_cache_quant_config": {},
853
+ "layer_quant_config": {},
854
+ "layer_type_quant_config": {},
855
+ "quant_method": "quark",
856
+ "quant_mode": "eager_mode",
857
+ "softmax_quant_spec": null,
858
+ "version": "0.12+13caa7b4951"
859
+ },
860
+ "rms_norm_eps": 1e-05,
861
+ "rope_interleave": true,
862
+ "rope_parameters": {
863
+ "rope_theta": 1000000,
864
+ "rope_type": "default"
865
+ },
866
+ "routed_scaling_factor": 2.5,
867
+ "scoring_func": "sigmoid",
868
+ "tie_word_embeddings": false,
869
+ "topk_group": 1,
870
+ "topk_method": "noaux_tc",
871
+ "transformers_version": "5.2.0",
872
+ "use_cache": true,
873
+ "v_head_dim": 256,
874
+ "vocab_size": 154880
875
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 154820,
6
+ 154827,
7
+ 154829
8
+ ],
9
+ "pad_token_id": 154820,
10
+ "temperature": 1.0,
11
+ "top_p": 0.95,
12
+ "transformers_version": "5.2.0"
13
+ }
model-00001-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc0632234f22648aebccf63336cf487a7bf329270a06eeba58a605b7cf7640db
3
+ size 49999051524
model-00002-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3492b34a49cad5b38c6115f36abc9c12d8e6a595f60ce1a43b487bf84ea9bead
3
+ size 49998227584
model-00003-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5551c6e497c23770f1318f73837091d04997d1c4791b7d31d50c356e864ed6
3
+ size 50000184968
model-00004-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3f2398894d465a1267ec941bd7ac4929ed02f0b39aacaa2f7383cabd6abcda
3
+ size 49998227296
model-00005-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee9c7b9bc6e5cf4204b51c7b19d7b4dd1b4468d0a68c52dfe9cb575ea4b493dd
3
+ size 49916961116
model-00006-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33d0032091aac75d27508fafcec37273275874fe19bfe17fe8da76dfe195e610
3
+ size 50002802316
model-00007-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6d94e29dcd242429dc411039d5897c5476f0b4e173577099f3f4d2e54b1f43f
3
+ size 49999014384
model-00008-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e294127d25d1555087173366c1d6b90af266bcfcba09a525e47213d62698c58
3
+ size 50000185032
model-00009-of-00009.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ac13bac9c079d2160f8d451c3c7d4efb7a750612f76ae0578c6f6321d4e4848
3
+ size 42001612804
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7553414afe173015b77b566daae8680f2f1ed8f6840dc79f09e28d436d82770
3
+ size 22332014
quark_profile.yaml ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quark Profiling Results
2
+
3
+ memory_usage:
4
+ - step: "Start"
5
+ timestamp: 1780153972.2896764
6
+ relative_time_secs: 0.0
7
+ cpu_memory_mb: 4173.1
8
+ gpu_memory_mb: 6216.02
9
+ disk_read_mb: 0.0
10
+ disk_write_mb: 0.0
11
+ - step: "Model Loading Start"
12
+ timestamp: 1780153973.5324614
13
+ relative_time_secs: 1.2427849769592285
14
+ cpu_memory_mb: 4173.1
15
+ gpu_memory_mb: 6216.02
16
+ disk_read_mb: 0.0
17
+ disk_write_mb: 0.0
18
+ - step: "Model Loading End"
19
+ timestamp: 1780154914.4454265
20
+ relative_time_secs: 942.1557500362396
21
+ cpu_memory_mb: 5919.9
22
+ gpu_memory_mb: 1742540.06
23
+ disk_read_mb: 537436.0
24
+ disk_write_mb: 6.61
25
+ - step: "Dataset Loading Start"
26
+ timestamp: 1780154975.0346658
27
+ relative_time_secs: 1002.7449893951416
28
+ cpu_memory_mb: 31534.6
29
+ gpu_memory_mb: 1754618.52
30
+ disk_read_mb: 537436.77
31
+ disk_write_mb: 11253.46
32
+ - step: "Dataset Loading End"
33
+ timestamp: 1780154980.1630871
34
+ relative_time_secs: 1007.8734107017517
35
+ cpu_memory_mb: 31537.59
36
+ gpu_memory_mb: 1754618.52
37
+ disk_read_mb: 537436.77
38
+ disk_write_mb: 11253.46
39
+ - step: "Model Quantization Start"
40
+ timestamp: 1780154985.432578
41
+ relative_time_secs: 1013.1429016590118
42
+ cpu_memory_mb: 31613.76
43
+ gpu_memory_mb: 1754618.52
44
+ disk_read_mb: 537436.77
45
+ disk_write_mb: 11253.46
46
+ - step: "Model Preparation Start"
47
+ timestamp: 1780154986.6060688
48
+ relative_time_secs: 1014.3163924217224
49
+ cpu_memory_mb: 31613.76
50
+ gpu_memory_mb: 1754618.52
51
+ disk_read_mb: 537436.77
52
+ disk_write_mb: 11253.47
53
+ - step: "Model Preparation End"
54
+ timestamp: 1780155035.8091686
55
+ relative_time_secs: 1063.519492149353
56
+ cpu_memory_mb: 34365.56
57
+ gpu_memory_mb: 1754955.91
58
+ disk_read_mb: 537436.77
59
+ disk_write_mb: 11253.5
60
+ - step: "Advanced Algorithms Start"
61
+ timestamp: 1780155037.102942
62
+ relative_time_secs: 1064.8132655620575
63
+ cpu_memory_mb: 34365.56
64
+ gpu_memory_mb: 1754955.91
65
+ disk_read_mb: 537436.77
66
+ disk_write_mb: 11253.5
67
+ - step: "Advanced Algorithms End"
68
+ timestamp: 1780155038.2418482
69
+ relative_time_secs: 1065.9521718025208
70
+ cpu_memory_mb: 34365.56
71
+ gpu_memory_mb: 1754955.91
72
+ disk_read_mb: 537436.77
73
+ disk_write_mb: 11253.5
74
+ - step: "Calibration Start"
75
+ timestamp: 1780155039.549469
76
+ relative_time_secs: 1067.2597925662994
77
+ cpu_memory_mb: 34365.56
78
+ gpu_memory_mb: 1754955.91
79
+ disk_read_mb: 537436.77
80
+ disk_write_mb: 11253.5
81
+ - step: "Calibration End"
82
+ timestamp: 1780157901.354939
83
+ relative_time_secs: 3929.065262556076
84
+ cpu_memory_mb: 39218.91
85
+ gpu_memory_mb: 1907319.57
86
+ disk_read_mb: 537436.83
87
+ disk_write_mb: 12613.3
88
+ - step: "Model Quantization End"
89
+ timestamp: 1780158601.0518937
90
+ relative_time_secs: 4628.762217283249
91
+ cpu_memory_mb: 122106.05
92
+ gpu_memory_mb: 1903770.54
93
+ disk_read_mb: 537437.0
94
+ disk_write_mb: 12613.94
95
+ - step: "Freeze Model Start"
96
+ timestamp: 1780158602.1963396
97
+ relative_time_secs: 4629.906663179398
98
+ cpu_memory_mb: 122106.05
99
+ gpu_memory_mb: 1903770.54
100
+ disk_read_mb: 537437.0
101
+ disk_write_mb: 12613.94
102
+ - step: "Freeze Model End"
103
+ timestamp: 1780158620.131704
104
+ relative_time_secs: 4647.842027664185
105
+ cpu_memory_mb: 122363.78
106
+ gpu_memory_mb: 1903770.54
107
+ disk_read_mb: 537437.0
108
+ disk_write_mb: 12613.95
109
+ - step: "Export HF Safetensors Start"
110
+ timestamp: 1780158621.4801466
111
+ relative_time_secs: 4649.190470218658
112
+ cpu_memory_mb: 122363.78
113
+ gpu_memory_mb: 1903770.54
114
+ disk_read_mb: 537437.0
115
+ disk_write_mb: 12613.95
116
+ - step: "Export HF Safetensors End"
117
+ timestamp: 1780159229.9230697
118
+ relative_time_secs: 5257.633393287659
119
+ cpu_memory_mb: 153257.38
120
+ gpu_memory_mb: 1917818.49
121
+ disk_read_mb: 537437.02
122
+ disk_write_mb: 434109.68
123
+ - step: "Model Evaluation Start"
124
+ timestamp: 1780159231.228492
125
+ relative_time_secs: 5258.9388155937195
126
+ cpu_memory_mb: 153257.38
127
+ gpu_memory_mb: 1917818.49
128
+ disk_read_mb: 537437.02
129
+ disk_write_mb: 434109.68
130
+ - step: "Model Evaluation End"
131
+ timestamp: 1780159256.9552286
132
+ relative_time_secs: 5284.665552139282
133
+ cpu_memory_mb: 153643.08
134
+ gpu_memory_mb: 1934880.1
135
+ disk_read_mb: 537438.77
136
+ disk_write_mb: 434109.68
137
+ - step: "End"
138
+ timestamp: 1780159258.2606156
139
+ relative_time_secs: 5285.970939159393
140
+ cpu_memory_mb: 153643.12
141
+ gpu_memory_mb: 1934880.1
142
+ disk_read_mb: 537438.83
143
+ disk_write_mb: 434109.69
144
+
145
+ # Summary Metrics
146
+ total_quantization_time_seconds: 5285.971
147
+ peak_memory_mb: 1424839.86
148
+ peak_gpu_memory_mb: 1934880.1
149
+ total_disk_read_mb: 537438.83
150
+ total_disk_write_mb: 434109.69
151
+
152
+ # Metric Definitions:
153
+ #
154
+ # Checkpoint Metrics (per record):
155
+ # - step: Name of the profiling checkpoint. Common steps include:
156
+ # - "Start": Initial state when profiling begins
157
+ # - "Model Loaded": After loading the ONNX model into memory
158
+ # - "Pre-process Start/End": Before and after model preprocessing
159
+ # - "Calibration Start/End": Before and after calibration data collection
160
+ # - "Quantization (MatMulNBits) Start/End": MatMulNBits quantization phase
161
+ # - "Quantization (Static) Start/End": Static quantization phase
162
+ # - "Post-process Start/End": Before and after post-processing
163
+ # - "Fast Finetune Start/End": Before and after fast finetuning (if enabled)
164
+ # - timestamp: Unix timestamp (seconds since epoch) when this measurement was taken. Useful for correlating with external logs or events.
165
+ # - relative_time_secs: Time elapsed (in seconds) since the "Start" step. Useful for understanding the duration of each phase relative to the beginning of profiling.
166
+ # - cpu_memory_mb: Current Resident Set Size (RSS) in megabytes at this step. This includes memory from the main process and all child processes. RSS represents the portion of memory held in RAM (not swapped out).
167
+ # - gpu_memory_mb: Current GPU memory usage in megabytes. This represents actual GPU memory used by the process, including allocations from PyTorch, ONNX Runtime, TensorRT, and other frameworks. Only available when PyTorch with CUDA/ROCm is installed and GPU is available.
168
+ # - disk_read_mb: Cumulative disk bytes read (in megabytes) since the start of profiling. Measured relative to the baseline captured at the 'Start' checkpoint, including I/O from the main process and all child processes. Only available when psutil is installed and the OS exposes per-process I/O counters (Linux /proc/<pid>/io, Windows; not available on macOS without root).
169
+ # - disk_write_mb: Cumulative disk bytes written (in megabytes) since the start of profiling. Measured relative to the baseline captured at the 'Start' checkpoint, including I/O from the main process and all child processes. Only available when psutil is installed and the OS exposes per-process I/O counters (Linux /proc/<pid>/io, Windows; not available on macOS without root).
170
+ #
171
+ # Summary Metrics (overall):
172
+ # - total_quantization_time_seconds: Total elapsed time (in seconds) from the start of profiling to the end of the quantization process.
173
+ # - peak_memory_mb: Peak resident set size (RSS) in megabytes for the main process during the entire profiling session. On Linux, this is read from VmHWM (high water mark) in /proc/<pid>/status. On Windows, this is the peak working set size. This metric may not be available on all platforms.
174
+ # - peak_gpu_memory_mb: Peak GPU memory usage in megabytes during the entire profiling session. This is the maximum GPU memory used, including allocations from PyTorch, ONNX Runtime, TensorRT, and other frameworks. Only available when PyTorch with CUDA/ROCm is installed and GPU is available.
175
+ # - total_disk_read_mb: Total disk bytes read (in megabytes) during the entire profiling session. Computed as the difference between the final and baseline cumulative read counters, including I/O from the main process and all child processes. Only available when psutil is installed and the OS exposes per-process I/O counters (Linux /proc/<pid>/io, Windows; not available on macOS without root).
176
+ # - total_disk_write_mb: Total disk bytes written (in megabytes) during the entire profiling session. Computed as the difference between the final and baseline cumulative write counters, including I/O from the main process and all child processes. Only available when psutil is installed and the OS exposes per-process I/O counters (Linux /proc/<pid>/io, Windows; not available on macOS without root).
177
+ # - peak_cache_dir_disk_usage_mb: Highest peak increase in disk usage (in megabytes) among all cache directories created during the profiling session, relative to each cache directory's size when monitoring started. Sampled every 1 second by recursively summing file sizes with os.scandir().
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47757b9678da19e468edb3ae37a853996599945b5006914e5b088aff30002386
3
+ size 20217707
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": false,
4
+ "do_lower_case": false,
5
+ "eos_token": "<|endoftext|>",
6
+ "extra_special_tokens": [
7
+ "<|endoftext|>",
8
+ "[MASK]",
9
+ "[gMASK]",
10
+ "[sMASK]",
11
+ "<sop>",
12
+ "<eop>",
13
+ "<|system|>",
14
+ "<|user|>",
15
+ "<|assistant|>",
16
+ "<|observation|>",
17
+ "<|begin_of_image|>",
18
+ "<|end_of_image|>",
19
+ "<|begin_of_video|>",
20
+ "<|end_of_video|>",
21
+ "<|begin_of_audio|>",
22
+ "<|end_of_audio|>",
23
+ "<|begin_of_transcription|>",
24
+ "<|end_of_transcription|>"
25
+ ],
26
+ "is_local": true,
27
+ "model_max_length": 202752,
28
+ "model_specific_special_tokens": {},
29
+ "pad_token": "<|endoftext|>",
30
+ "padding_side": "left",
31
+ "remove_space": false,
32
+ "tokenizer_class": "TokenizersBackend"
33
+ }