shawon committed on
Commit
d895867
·
verified ·
1 Parent(s): 889da3e

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: text-generation
4
+ library_name: transformers
5
+ tags:
6
+ - vllm
7
+ - mlx
8
+ - mlx-my-repo
9
+ base_model: openai/gpt-oss-120b
10
+ ---
11
+
12
+ # shawon/gpt-oss-120b-mlx-8Bit
13
+
14
+ The model [shawon/gpt-oss-120b-mlx-8Bit](https://huggingface.co/shawon/gpt-oss-120b-mlx-8Bit) was converted to MLX format from [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b) using mlx-lm version **0.31.2**.
15
+
16
+ ## Use with mlx
17
+
18
+ ```bash
19
+ pip install mlx-lm
20
+ ```
21
+
22
+ ```python
23
+ from mlx_lm import load, generate
24
+
25
+ model, tokenizer = load("shawon/gpt-oss-120b-mlx-8Bit")
26
+
27
+ prompt="hello"
28
+
29
+ if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
30
+     messages = [{"role": "user", "content": prompt}]
31
+     prompt = tokenizer.apply_chat_template(
32
+         messages, tokenize=False, add_generation_prompt=True
33
+     )
34
+
35
+ response = generate(model, tokenizer, prompt=prompt, verbose=True)
36
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {#-
2
+ In addition to the normal inputs of `messages` and `tools`, this template also accepts the
3
+ following kwargs:
4
+ - "builtin_tools": A list, can contain "browser" and/or "python".
5
+ - "model_identity": A string that optionally describes the model identity.
6
+ - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
7
+ #}
8
+
9
+ {#- Tool Definition Rendering ============================================== #}
10
+ {%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
+ {#- Render a JSON-schema-style `param_spec` as a TypeScript-like type string. `required_params` is only forwarded to recursive calls and `is_nullable` is not read inside this macro. #}
11
+ {%- if param_spec.type == "array" -%}
12
+ {%- if param_spec['items'] -%}
13
+ {%- if param_spec['items']['type'] == "string" -%}
14
+ {{- "string[]" }}
15
+ {%- elif param_spec['items']['type'] == "number" -%}
16
+ {{- "number[]" }}
17
+ {%- elif param_spec['items']['type'] == "integer" -%}
18
+ {{- "number[]" }}
19
+ {%- elif param_spec['items']['type'] == "boolean" -%}
20
+ {{- "boolean[]" }}
21
+ {%- else -%}
22
+ {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
23
+ {%- if inner_type == "object | object" or inner_type|length > 50 -%}
24
+ {{- "any[]" }}
25
+ {%- else -%}
26
+ {{- inner_type + "[]" }}
27
+ {%- endif -%}
28
+ {%- endif -%}
29
+ {%- if param_spec.nullable -%}
30
+ {{- " | null" }}
31
+ {%- endif -%}
32
+ {%- else -%}
33
+ {{- "any[]" }}
34
+ {%- if param_spec.nullable -%}
35
+ {{- " | null" }}
36
+ {%- endif -%}
37
+ {%- endif -%}
38
+ {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
39
+ {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
40
+ {%- if param_spec.type | length > 1 -%}
41
+ {{- param_spec.type | join(" | ") }}
42
+ {%- else -%}
43
+ {{- param_spec.type[0] }}
44
+ {%- endif -%}
45
+ {%- elif param_spec.oneOf -%}
46
+ {#- Handle oneOf schemas: a union of more than one variant that includes an object falls back to "any". #}
+ {#- The flag lives in a namespace() because a plain `set` inside a for loop does not persist after the loop ends. #}
47
+ {%- set oneof_state = namespace(has_object_variants=false) -%}
48
+ {%- for variant in param_spec.oneOf -%}
49
+ {%- if variant.type == "object" -%}
50
+ {%- set oneof_state.has_object_variants = true -%}
51
+ {%- endif -%}
52
+ {%- endfor -%}
53
+ {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
54
+ {{- "any" }}
55
+ {%- else -%}
56
+ {%- for variant in param_spec.oneOf -%}
57
+ {{- render_typescript_type(variant, required_params) -}}
58
+ {%- if variant.description %}
59
+ {{- "// " + variant.description }}
60
+ {%- endif -%}
61
+ {%- if variant.default is defined %}
62
+ {{ "// default: " + variant.default|tojson }}
63
+ {%- endif -%}
64
+ {%- if not loop.last %}
65
+ {{- " | " }}
66
+ {% endif -%}
67
+ {%- endfor -%}
68
+ {%- endif -%}
69
+ {%- elif param_spec.type == "string" -%}
70
+ {%- if param_spec.enum -%}
71
+ {{- '"' + param_spec.enum|join('" | "') + '"' -}}
72
+ {%- else -%}
73
+ {{- "string" }}
74
+ {%- if param_spec.nullable %}
75
+ {{- " | null" }}
76
+ {%- endif -%}
77
+ {%- endif -%}
78
+ {%- elif param_spec.type == "number" -%}
79
+ {{- "number" }}
80
+ {%- elif param_spec.type == "integer" -%}
81
+ {{- "number" }}
82
+ {%- elif param_spec.type == "boolean" -%}
83
+ {{- "boolean" }}
84
+
85
+ {%- elif param_spec.type == "object" -%}
86
+ {%- if param_spec.properties -%}
87
+ {{- "{\n" }}
88
+ {%- for prop_name, prop_spec in param_spec.properties.items() -%}
89
+ {{- prop_name -}}
90
+ {%- if prop_name not in (param_spec.required or []) -%}
91
+ {{- "?" }}
92
+ {%- endif -%}
93
+ {{- ": " }}
94
+ {{ render_typescript_type(prop_spec, param_spec.required or []) }}
95
+ {%- if not loop.last -%}
96
+ {{-", " }}
97
+ {%- endif -%}
98
+ {%- endfor -%}
99
+ {{- "}" }}
100
+ {%- else -%}
101
+ {{- "object" }}
102
+ {%- endif -%}
103
+ {%- else -%}
104
+ {{- "any" }}
105
+ {%- endif -%}
106
+ {%- endmacro -%}
107
+
108
+ {%- macro render_tool_namespace(namespace_name, tools) -%}
+ {#- Emit a "## <namespace_name>" heading followed by a TypeScript-style namespace with one `type` alias per entry in `tools`. #}
109
+ {{- "## " + namespace_name + "\n\n" }}
110
+ {{- "namespace " + namespace_name + " {\n\n" }}
111
+ {%- for tool in tools %}
112
+ {%- set tool = tool.function %}
113
+ {{- "// " + tool.description + "\n" }}
114
+ {{- "type "+ tool.name + " = " }}
115
+ {%- if tool.parameters and tool.parameters.properties %}
116
+ {{- "(_: {\n" }}
117
+ {%- for param_name, param_spec in tool.parameters.properties.items() %}
118
+ {%- if param_spec.description %}
119
+ {{- "// " + param_spec.description + "\n" }}
120
+ {%- endif %}
121
+ {{- param_name }}
122
+ {%- if param_name not in (tool.parameters.required or []) -%}
123
+ {{- "?" }}
124
+ {%- endif -%}
125
+ {{- ": " }}
126
+ {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
127
+ {%- if param_spec.default is defined -%}
+ {#- Coerce the default with |string so a non-string default (e.g. a numeric enum value) does not break the "+" concatenation. #}
128
+ {%- if param_spec.enum %}
129
+ {{- ", // default: " + param_spec.default|string }}
130
+ {%- elif param_spec.oneOf %}
131
+ {{- "// default: " + param_spec.default|string }}
132
+ {%- else %}
133
+ {{- ", // default: " + param_spec.default|tojson }}
134
+ {%- endif -%}
135
+ {%- endif -%}
136
+ {%- if not loop.last %}
137
+ {{- ",\n" }}
138
+ {%- else %}
139
+ {{- ",\n" }}
140
+ {%- endif -%}
141
+ {%- endfor %}
142
+ {{- "}) => any;\n\n" }}
143
+ {%- else -%}
144
+ {{- "() => any;\n\n" }}
145
+ {%- endif -%}
146
+ {%- endfor %}
147
+ {{- "} // namespace " + namespace_name }}
148
+ {%- endmacro -%}
149
+
150
+ {%- macro render_builtin_tools(browser_tool, python_tool) -%}
+ {#- Emit the fixed prompt text for the built-in tools: the `browser` namespace when browser_tool is truthy, and the `python` description when python_tool is truthy. #}
151
+ {%- if browser_tool %}
152
+ {{- "## browser\n\n" }}
153
+ {{- "// Tool for browsing.\n" }}
154
+ {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
155
+ {{- "// Cite information from the tool using the following format:\n" }}
156
+ {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
157
+ {{- "// Do not quote more than 10 words directly from the tool output.\n" }}
158
+ {{- "// sources=web (default: web)\n" }}
159
+ {{- "namespace browser {\n\n" }}
160
+ {{- "// Searches for information related to `query` and displays `topn` results.\n" }}
161
+ {{- "type search = (_: {\n" }}
162
+ {{- "query: string,\n" }}
163
+ {{- "topn?: number, // default: 10\n" }}
164
+ {{- "source?: string,\n" }}
165
+ {{- "}) => any;\n\n" }}
166
+ {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
167
+ {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
168
+ {{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
169
+ {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
170
+ {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
171
+ {{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
172
+ {{- "type open = (_: {\n" }}
173
+ {{- "id?: number | string, // default: -1\n" }}
174
+ {{- "cursor?: number, // default: -1\n" }}
175
+ {{- "loc?: number, // default: -1\n" }}
176
+ {{- "num_lines?: number, // default: -1\n" }}
177
+ {{- "view_source?: boolean, // default: false\n" }}
178
+ {{- "source?: string,\n" }}
179
+ {{- "}) => any;\n\n" }}
180
+ {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
181
+ {{- "type find = (_: {\n" }}
182
+ {{- "pattern: string,\n" }}
183
+ {{- "cursor?: number, // default: -1\n" }}
184
+ {{- "}) => any;\n\n" }}
185
+ {{- "} // namespace browser\n\n" }}
186
+ {%- endif -%}
187
+
188
+ {#- The python tool is described in prose only; no namespace or type signature is emitted for it. #}
+ {%- if python_tool %}
189
+ {{- "## python\n\n" }}
190
+ {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
191
+ {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
192
+ {%- endif -%}
193
+ {%- endmacro -%}
194
+
195
+ {#- System Message Construction ============================================ #}
196
+ {%- macro build_system_message() -%}
+ {#- Build the system message body: identity line, knowledge cutoff, current date, reasoning effort, optional built-in tool descriptions, and the valid-channels notice. #}
+ {#- The macro takes no arguments; model_identity, reasoning_effort, builtin_tools and tools are read from the surrounding render context. #}
197
+ {%- if model_identity is not defined %}
198
+ {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %}
199
+ {%- endif %}
200
+ {{- model_identity + "\n" }}
201
+ {{- "Knowledge cutoff: 2024-06\n" }}
+ {#- NOTE(review): strftime_now is not defined in this template; presumably supplied by the rendering environment - confirm. #}
202
+ {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
203
+ {%- if reasoning_effort is not defined %}
204
+ {%- set reasoning_effort = "medium" %}
205
+ {%- endif %}
206
+ {{- "Reasoning: " + reasoning_effort + "\n\n" }}
207
+ {%- if builtin_tools %}
208
+ {{- "# Tools\n\n" }}
+ {#- A namespace() is used so the flags assigned inside the for loop are still visible after it. Entries other than "browser" and "python" are ignored. #}
209
+ {%- set available_builtin_tools = namespace(browser=false, python=false) %}
210
+ {%- for tool in builtin_tools %}
211
+ {%- if tool == "browser" %}
212
+ {%- set available_builtin_tools.browser = true %}
213
+ {%- elif tool == "python" %}
214
+ {%- set available_builtin_tools.python = true %}
215
+ {%- endif %}
216
+ {%- endfor %}
217
+ {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
218
+ {%- endif -%}
219
+ {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
+ {#- The extra routing sentence is appended only when `tools` is truthy. #}
220
+ {%- if tools -%}
221
+ {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
222
+ {%- endif -%}
223
+ {%- endmacro -%}
224
+
225
+ {#- Main Template Logic ================================================= #}
226
+ {#- Set defaults #}
227
+
228
+ {#- Render system message #}
229
+ {{- "<|start|>system<|message|>" }}
230
+ {{- build_system_message() }}
231
+ {{- "<|end|>" }}
232
+
233
+ {#- Extract developer message #}
234
+ {%- if messages[0].role == "developer" or messages[0].role == "system" %}
235
+ {%- set developer_message = messages[0].content %}
236
+ {%- set loop_messages = messages[1:] %}
237
+ {%- else %}
238
+ {%- set developer_message = "" %}
239
+ {%- set loop_messages = messages %}
240
+ {%- endif %}
241
+
242
+ {#- Render developer message #}
243
+ {%- if developer_message or tools %}
244
+ {{- "<|start|>developer<|message|>" }}
245
+ {%- if developer_message %}
246
+ {{- "# Instructions\n\n" }}
247
+ {{- developer_message }}
248
+ {{- "\n\n" }}
249
+ {%- endif %}
250
+ {%- if tools -%}
251
+ {{- "# Tools\n\n" }}
252
+ {{- render_tool_namespace("functions", tools) }}
253
+ {%- endif -%}
254
+ {{- "<|end|>" }}
255
+ {%- endif %}
256
+
257
+ {#- Render messages #}
258
+ {%- set last_tool_call = namespace(name=none) %}
259
+ {%- for message in loop_messages -%}
260
+ {#- At this point only assistant/user/tool messages should remain #}
261
+ {%- if message.role == 'assistant' -%}
262
+ {#- Checks to ensure the messages are being passed in the format we expect #}
+ {#- Only string fields are scanned: an explicit null content/thinking (common on tool-call messages) would make the "in" test fail. #}
263
+ {%- if message.content is string %}
264
+ {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %}
265
+ {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }}
266
+ {%- endif %}
267
+ {%- endif %}
268
+ {%- if message.thinking is string %}
269
+ {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %}
270
+ {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }}
271
+ {%- endif %}
272
+ {%- endif %}
273
+ {%- if "tool_calls" in message %}
274
+ {#- We need very careful handling here - we want to drop the tool call analysis message if the model #}
275
+ {#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #}
276
+ {#- when we render CoT/analysis messages in inference. #}
277
+ {%- set future_final_message = namespace(found=false) %}
278
+ {%- for future_message in loop_messages[loop.index:] %}
279
+ {%- if future_message.role == 'assistant' and "tool_calls" not in future_message %}
280
+ {%- set future_final_message.found = true %}
281
+ {%- endif %}
282
+ {%- endfor %}
283
+ {#- We assume max 1 tool call per message, and so we infer the tool call name #}
284
+ {#- in "tool" messages from the most recent assistant tool call name #}
285
+ {%- set tool_call = message.tool_calls[0] %}
286
+ {%- if tool_call.function %}
287
+ {%- set tool_call = tool_call.function %}
288
+ {%- endif %}
289
+ {%- if message.content and message.thinking %}
290
+ {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }}
291
+ {%- elif message.content and not future_final_message.found %}
292
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
293
+ {%- elif message.thinking and not future_final_message.found %}
294
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
295
+ {%- endif %}
296
+ {{- "<|start|>assistant to=" }}
297
+ {{- "functions." + tool_call.name + "<|channel|>commentary " }}
298
+ {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }}
299
+ {{- tool_call.arguments|tojson }}
300
+ {{- "<|call|>" }}
301
+ {%- set last_tool_call.name = tool_call.name %}
302
+ {%- elif loop.last and not add_generation_prompt %}
303
+ {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
304
+ {#- This is a situation that should only occur in training, never in inference. #}
305
+ {%- if message.thinking is string %}
306
+ {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
307
+ {%- endif %}
308
+ {#- <|return|> indicates the end of generation, but <|end|> does not #}
309
+ {#- <|return|> should never be an input to the model, but we include it as the final token #}
310
+ {#- when training, so the model learns to emit it. #}
311
+ {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
312
+ {%- else %}
313
+ {#- CoT is dropped during all previous turns, so we never render it for inference #}
314
+ {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
315
+ {%- set last_tool_call.name = none %}
316
+ {%- endif %}
317
+ {%- elif message.role == 'tool' -%}
318
+ {%- if last_tool_call.name is none %}
319
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
320
+ {%- endif %}
321
+ {{- "<|start|>functions." + last_tool_call.name }}
322
+ {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
323
+ {%- elif message.role == 'user' -%}
324
+ {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
325
+ {%- endif -%}
326
+ {%- endfor -%}
327
+
328
+ {#- Generation prompt #}
329
+ {%- if add_generation_prompt -%}
330
+ <|start|>assistant
331
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,1841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GptOssForCausalLM"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "eos_token_id": [
8
+ 200002,
9
+ 199999,
10
+ 200012
11
+ ],
12
+ "experts_per_token": 4,
13
+ "head_dim": 64,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2880,
16
+ "initial_context_length": 4096,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 2880,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "full_attention",
22
+ "sliding_attention",
23
+ "full_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "full_attention",
28
+ "sliding_attention",
29
+ "full_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "full_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "full_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "full_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "full_attention",
54
+ "sliding_attention",
55
+ "full_attention"
56
+ ],
57
+ "max_position_embeddings": 131072,
58
+ "model_type": "gpt_oss",
59
+ "num_attention_heads": 64,
60
+ "num_experts_per_tok": 4,
61
+ "num_hidden_layers": 36,
62
+ "num_key_value_heads": 8,
63
+ "num_local_experts": 128,
64
+ "output_router_logits": false,
65
+ "pad_token_id": 199999,
66
+ "quantization": {
67
+ "group_size": 32,
68
+ "bits": 4,
69
+ "mode": "mxfp4",
70
+ "model.embed_tokens": {
71
+ "group_size": 64,
72
+ "bits": 8,
73
+ "mode": "affine"
74
+ },
75
+ "model.layers.0.self_attn.q_proj": {
76
+ "group_size": 64,
77
+ "bits": 8,
78
+ "mode": "affine"
79
+ },
80
+ "model.layers.0.self_attn.k_proj": {
81
+ "group_size": 64,
82
+ "bits": 8,
83
+ "mode": "affine"
84
+ },
85
+ "model.layers.0.self_attn.v_proj": {
86
+ "group_size": 64,
87
+ "bits": 8,
88
+ "mode": "affine"
89
+ },
90
+ "model.layers.0.self_attn.o_proj": {
91
+ "group_size": 64,
92
+ "bits": 8,
93
+ "mode": "affine"
94
+ },
95
+ "model.layers.0.mlp.router": {
96
+ "group_size": 64,
97
+ "bits": 8
98
+ },
99
+ "model.layers.1.self_attn.q_proj": {
100
+ "group_size": 64,
101
+ "bits": 8,
102
+ "mode": "affine"
103
+ },
104
+ "model.layers.1.self_attn.k_proj": {
105
+ "group_size": 64,
106
+ "bits": 8,
107
+ "mode": "affine"
108
+ },
109
+ "model.layers.1.self_attn.v_proj": {
110
+ "group_size": 64,
111
+ "bits": 8,
112
+ "mode": "affine"
113
+ },
114
+ "model.layers.1.self_attn.o_proj": {
115
+ "group_size": 64,
116
+ "bits": 8,
117
+ "mode": "affine"
118
+ },
119
+ "model.layers.1.mlp.router": {
120
+ "group_size": 64,
121
+ "bits": 8
122
+ },
123
+ "model.layers.2.self_attn.q_proj": {
124
+ "group_size": 64,
125
+ "bits": 8,
126
+ "mode": "affine"
127
+ },
128
+ "model.layers.2.self_attn.k_proj": {
129
+ "group_size": 64,
130
+ "bits": 8,
131
+ "mode": "affine"
132
+ },
133
+ "model.layers.2.self_attn.v_proj": {
134
+ "group_size": 64,
135
+ "bits": 8,
136
+ "mode": "affine"
137
+ },
138
+ "model.layers.2.self_attn.o_proj": {
139
+ "group_size": 64,
140
+ "bits": 8,
141
+ "mode": "affine"
142
+ },
143
+ "model.layers.2.mlp.router": {
144
+ "group_size": 64,
145
+ "bits": 8
146
+ },
147
+ "model.layers.3.self_attn.q_proj": {
148
+ "group_size": 64,
149
+ "bits": 8,
150
+ "mode": "affine"
151
+ },
152
+ "model.layers.3.self_attn.k_proj": {
153
+ "group_size": 64,
154
+ "bits": 8,
155
+ "mode": "affine"
156
+ },
157
+ "model.layers.3.self_attn.v_proj": {
158
+ "group_size": 64,
159
+ "bits": 8,
160
+ "mode": "affine"
161
+ },
162
+ "model.layers.3.self_attn.o_proj": {
163
+ "group_size": 64,
164
+ "bits": 8,
165
+ "mode": "affine"
166
+ },
167
+ "model.layers.3.mlp.router": {
168
+ "group_size": 64,
169
+ "bits": 8
170
+ },
171
+ "model.layers.4.self_attn.q_proj": {
172
+ "group_size": 64,
173
+ "bits": 8,
174
+ "mode": "affine"
175
+ },
176
+ "model.layers.4.self_attn.k_proj": {
177
+ "group_size": 64,
178
+ "bits": 8,
179
+ "mode": "affine"
180
+ },
181
+ "model.layers.4.self_attn.v_proj": {
182
+ "group_size": 64,
183
+ "bits": 8,
184
+ "mode": "affine"
185
+ },
186
+ "model.layers.4.self_attn.o_proj": {
187
+ "group_size": 64,
188
+ "bits": 8,
189
+ "mode": "affine"
190
+ },
191
+ "model.layers.4.mlp.router": {
192
+ "group_size": 64,
193
+ "bits": 8
194
+ },
195
+ "model.layers.5.self_attn.q_proj": {
196
+ "group_size": 64,
197
+ "bits": 8,
198
+ "mode": "affine"
199
+ },
200
+ "model.layers.5.self_attn.k_proj": {
201
+ "group_size": 64,
202
+ "bits": 8,
203
+ "mode": "affine"
204
+ },
205
+ "model.layers.5.self_attn.v_proj": {
206
+ "group_size": 64,
207
+ "bits": 8,
208
+ "mode": "affine"
209
+ },
210
+ "model.layers.5.self_attn.o_proj": {
211
+ "group_size": 64,
212
+ "bits": 8,
213
+ "mode": "affine"
214
+ },
215
+ "model.layers.5.mlp.router": {
216
+ "group_size": 64,
217
+ "bits": 8
218
+ },
219
+ "model.layers.6.self_attn.q_proj": {
220
+ "group_size": 64,
221
+ "bits": 8,
222
+ "mode": "affine"
223
+ },
224
+ "model.layers.6.self_attn.k_proj": {
225
+ "group_size": 64,
226
+ "bits": 8,
227
+ "mode": "affine"
228
+ },
229
+ "model.layers.6.self_attn.v_proj": {
230
+ "group_size": 64,
231
+ "bits": 8,
232
+ "mode": "affine"
233
+ },
234
+ "model.layers.6.self_attn.o_proj": {
235
+ "group_size": 64,
236
+ "bits": 8,
237
+ "mode": "affine"
238
+ },
239
+ "model.layers.6.mlp.router": {
240
+ "group_size": 64,
241
+ "bits": 8
242
+ },
243
+ "model.layers.7.self_attn.q_proj": {
244
+ "group_size": 64,
245
+ "bits": 8,
246
+ "mode": "affine"
247
+ },
248
+ "model.layers.7.self_attn.k_proj": {
249
+ "group_size": 64,
250
+ "bits": 8,
251
+ "mode": "affine"
252
+ },
253
+ "model.layers.7.self_attn.v_proj": {
254
+ "group_size": 64,
255
+ "bits": 8,
256
+ "mode": "affine"
257
+ },
258
+ "model.layers.7.self_attn.o_proj": {
259
+ "group_size": 64,
260
+ "bits": 8,
261
+ "mode": "affine"
262
+ },
263
+ "model.layers.7.mlp.router": {
264
+ "group_size": 64,
265
+ "bits": 8
266
+ },
267
+ "model.layers.8.self_attn.q_proj": {
268
+ "group_size": 64,
269
+ "bits": 8,
270
+ "mode": "affine"
271
+ },
272
+ "model.layers.8.self_attn.k_proj": {
273
+ "group_size": 64,
274
+ "bits": 8,
275
+ "mode": "affine"
276
+ },
277
+ "model.layers.8.self_attn.v_proj": {
278
+ "group_size": 64,
279
+ "bits": 8,
280
+ "mode": "affine"
281
+ },
282
+ "model.layers.8.self_attn.o_proj": {
283
+ "group_size": 64,
284
+ "bits": 8,
285
+ "mode": "affine"
286
+ },
287
+ "model.layers.8.mlp.router": {
288
+ "group_size": 64,
289
+ "bits": 8
290
+ },
291
+ "model.layers.9.self_attn.q_proj": {
292
+ "group_size": 64,
293
+ "bits": 8,
294
+ "mode": "affine"
295
+ },
296
+ "model.layers.9.self_attn.k_proj": {
297
+ "group_size": 64,
298
+ "bits": 8,
299
+ "mode": "affine"
300
+ },
301
+ "model.layers.9.self_attn.v_proj": {
302
+ "group_size": 64,
303
+ "bits": 8,
304
+ "mode": "affine"
305
+ },
306
+ "model.layers.9.self_attn.o_proj": {
307
+ "group_size": 64,
308
+ "bits": 8,
309
+ "mode": "affine"
310
+ },
311
+ "model.layers.9.mlp.router": {
312
+ "group_size": 64,
313
+ "bits": 8
314
+ },
315
+ "model.layers.10.self_attn.q_proj": {
316
+ "group_size": 64,
317
+ "bits": 8,
318
+ "mode": "affine"
319
+ },
320
+ "model.layers.10.self_attn.k_proj": {
321
+ "group_size": 64,
322
+ "bits": 8,
323
+ "mode": "affine"
324
+ },
325
+ "model.layers.10.self_attn.v_proj": {
326
+ "group_size": 64,
327
+ "bits": 8,
328
+ "mode": "affine"
329
+ },
330
+ "model.layers.10.self_attn.o_proj": {
331
+ "group_size": 64,
332
+ "bits": 8,
333
+ "mode": "affine"
334
+ },
335
+ "model.layers.10.mlp.router": {
336
+ "group_size": 64,
337
+ "bits": 8
338
+ },
339
+ "model.layers.11.self_attn.q_proj": {
340
+ "group_size": 64,
341
+ "bits": 8,
342
+ "mode": "affine"
343
+ },
344
+ "model.layers.11.self_attn.k_proj": {
345
+ "group_size": 64,
346
+ "bits": 8,
347
+ "mode": "affine"
348
+ },
349
+ "model.layers.11.self_attn.v_proj": {
350
+ "group_size": 64,
351
+ "bits": 8,
352
+ "mode": "affine"
353
+ },
354
+ "model.layers.11.self_attn.o_proj": {
355
+ "group_size": 64,
356
+ "bits": 8,
357
+ "mode": "affine"
358
+ },
359
+ "model.layers.11.mlp.router": {
360
+ "group_size": 64,
361
+ "bits": 8
362
+ },
363
+ "model.layers.12.self_attn.q_proj": {
364
+ "group_size": 64,
365
+ "bits": 8,
366
+ "mode": "affine"
367
+ },
368
+ "model.layers.12.self_attn.k_proj": {
369
+ "group_size": 64,
370
+ "bits": 8,
371
+ "mode": "affine"
372
+ },
373
+ "model.layers.12.self_attn.v_proj": {
374
+ "group_size": 64,
375
+ "bits": 8,
376
+ "mode": "affine"
377
+ },
378
+ "model.layers.12.self_attn.o_proj": {
379
+ "group_size": 64,
380
+ "bits": 8,
381
+ "mode": "affine"
382
+ },
383
+ "model.layers.12.mlp.router": {
384
+ "group_size": 64,
385
+ "bits": 8
386
+ },
387
+ "model.layers.13.self_attn.q_proj": {
388
+ "group_size": 64,
389
+ "bits": 8,
390
+ "mode": "affine"
391
+ },
392
+ "model.layers.13.self_attn.k_proj": {
393
+ "group_size": 64,
394
+ "bits": 8,
395
+ "mode": "affine"
396
+ },
397
+ "model.layers.13.self_attn.v_proj": {
398
+ "group_size": 64,
399
+ "bits": 8,
400
+ "mode": "affine"
401
+ },
402
+ "model.layers.13.self_attn.o_proj": {
403
+ "group_size": 64,
404
+ "bits": 8,
405
+ "mode": "affine"
406
+ },
407
+ "model.layers.13.mlp.router": {
408
+ "group_size": 64,
409
+ "bits": 8
410
+ },
411
+ "model.layers.14.self_attn.q_proj": {
412
+ "group_size": 64,
413
+ "bits": 8,
414
+ "mode": "affine"
415
+ },
416
+ "model.layers.14.self_attn.k_proj": {
417
+ "group_size": 64,
418
+ "bits": 8,
419
+ "mode": "affine"
420
+ },
421
+ "model.layers.14.self_attn.v_proj": {
422
+ "group_size": 64,
423
+ "bits": 8,
424
+ "mode": "affine"
425
+ },
426
+ "model.layers.14.self_attn.o_proj": {
427
+ "group_size": 64,
428
+ "bits": 8,
429
+ "mode": "affine"
430
+ },
431
+ "model.layers.14.mlp.router": {
432
+ "group_size": 64,
433
+ "bits": 8
434
+ },
435
+ "model.layers.15.self_attn.q_proj": {
436
+ "group_size": 64,
437
+ "bits": 8,
438
+ "mode": "affine"
439
+ },
440
+ "model.layers.15.self_attn.k_proj": {
441
+ "group_size": 64,
442
+ "bits": 8,
443
+ "mode": "affine"
444
+ },
445
+ "model.layers.15.self_attn.v_proj": {
446
+ "group_size": 64,
447
+ "bits": 8,
448
+ "mode": "affine"
449
+ },
450
+ "model.layers.15.self_attn.o_proj": {
451
+ "group_size": 64,
452
+ "bits": 8,
453
+ "mode": "affine"
454
+ },
455
+ "model.layers.15.mlp.router": {
456
+ "group_size": 64,
457
+ "bits": 8
458
+ },
459
+ "model.layers.16.self_attn.q_proj": {
460
+ "group_size": 64,
461
+ "bits": 8,
462
+ "mode": "affine"
463
+ },
464
+ "model.layers.16.self_attn.k_proj": {
465
+ "group_size": 64,
466
+ "bits": 8,
467
+ "mode": "affine"
468
+ },
469
+ "model.layers.16.self_attn.v_proj": {
470
+ "group_size": 64,
471
+ "bits": 8,
472
+ "mode": "affine"
473
+ },
474
+ "model.layers.16.self_attn.o_proj": {
475
+ "group_size": 64,
476
+ "bits": 8,
477
+ "mode": "affine"
478
+ },
479
+ "model.layers.16.mlp.router": {
480
+ "group_size": 64,
481
+ "bits": 8
482
+ },
483
+ "model.layers.17.self_attn.q_proj": {
484
+ "group_size": 64,
485
+ "bits": 8,
486
+ "mode": "affine"
487
+ },
488
+ "model.layers.17.self_attn.k_proj": {
489
+ "group_size": 64,
490
+ "bits": 8,
491
+ "mode": "affine"
492
+ },
493
+ "model.layers.17.self_attn.v_proj": {
494
+ "group_size": 64,
495
+ "bits": 8,
496
+ "mode": "affine"
497
+ },
498
+ "model.layers.17.self_attn.o_proj": {
499
+ "group_size": 64,
500
+ "bits": 8,
501
+ "mode": "affine"
502
+ },
503
+ "model.layers.17.mlp.router": {
504
+ "group_size": 64,
505
+ "bits": 8
506
+ },
507
+ "model.layers.18.self_attn.q_proj": {
508
+ "group_size": 64,
509
+ "bits": 8,
510
+ "mode": "affine"
511
+ },
512
+ "model.layers.18.self_attn.k_proj": {
513
+ "group_size": 64,
514
+ "bits": 8,
515
+ "mode": "affine"
516
+ },
517
+ "model.layers.18.self_attn.v_proj": {
518
+ "group_size": 64,
519
+ "bits": 8,
520
+ "mode": "affine"
521
+ },
522
+ "model.layers.18.self_attn.o_proj": {
523
+ "group_size": 64,
524
+ "bits": 8,
525
+ "mode": "affine"
526
+ },
527
+ "model.layers.18.mlp.router": {
528
+ "group_size": 64,
529
+ "bits": 8
530
+ },
531
+ "model.layers.19.self_attn.q_proj": {
532
+ "group_size": 64,
533
+ "bits": 8,
534
+ "mode": "affine"
535
+ },
536
+ "model.layers.19.self_attn.k_proj": {
537
+ "group_size": 64,
538
+ "bits": 8,
539
+ "mode": "affine"
540
+ },
541
+ "model.layers.19.self_attn.v_proj": {
542
+ "group_size": 64,
543
+ "bits": 8,
544
+ "mode": "affine"
545
+ },
546
+ "model.layers.19.self_attn.o_proj": {
547
+ "group_size": 64,
548
+ "bits": 8,
549
+ "mode": "affine"
550
+ },
551
+ "model.layers.19.mlp.router": {
552
+ "group_size": 64,
553
+ "bits": 8
554
+ },
555
+ "model.layers.20.self_attn.q_proj": {
556
+ "group_size": 64,
557
+ "bits": 8,
558
+ "mode": "affine"
559
+ },
560
+ "model.layers.20.self_attn.k_proj": {
561
+ "group_size": 64,
562
+ "bits": 8,
563
+ "mode": "affine"
564
+ },
565
+ "model.layers.20.self_attn.v_proj": {
566
+ "group_size": 64,
567
+ "bits": 8,
568
+ "mode": "affine"
569
+ },
570
+ "model.layers.20.self_attn.o_proj": {
571
+ "group_size": 64,
572
+ "bits": 8,
573
+ "mode": "affine"
574
+ },
575
+ "model.layers.20.mlp.router": {
576
+ "group_size": 64,
577
+ "bits": 8
578
+ },
579
+ "model.layers.21.self_attn.q_proj": {
580
+ "group_size": 64,
581
+ "bits": 8,
582
+ "mode": "affine"
583
+ },
584
+ "model.layers.21.self_attn.k_proj": {
585
+ "group_size": 64,
586
+ "bits": 8,
587
+ "mode": "affine"
588
+ },
589
+ "model.layers.21.self_attn.v_proj": {
590
+ "group_size": 64,
591
+ "bits": 8,
592
+ "mode": "affine"
593
+ },
594
+ "model.layers.21.self_attn.o_proj": {
595
+ "group_size": 64,
596
+ "bits": 8,
597
+ "mode": "affine"
598
+ },
599
+ "model.layers.21.mlp.router": {
600
+ "group_size": 64,
601
+ "bits": 8
602
+ },
603
+ "model.layers.22.self_attn.q_proj": {
604
+ "group_size": 64,
605
+ "bits": 8,
606
+ "mode": "affine"
607
+ },
608
+ "model.layers.22.self_attn.k_proj": {
609
+ "group_size": 64,
610
+ "bits": 8,
611
+ "mode": "affine"
612
+ },
613
+ "model.layers.22.self_attn.v_proj": {
614
+ "group_size": 64,
615
+ "bits": 8,
616
+ "mode": "affine"
617
+ },
618
+ "model.layers.22.self_attn.o_proj": {
619
+ "group_size": 64,
620
+ "bits": 8,
621
+ "mode": "affine"
622
+ },
623
+ "model.layers.22.mlp.router": {
624
+ "group_size": 64,
625
+ "bits": 8
626
+ },
627
+ "model.layers.23.self_attn.q_proj": {
628
+ "group_size": 64,
629
+ "bits": 8,
630
+ "mode": "affine"
631
+ },
632
+ "model.layers.23.self_attn.k_proj": {
633
+ "group_size": 64,
634
+ "bits": 8,
635
+ "mode": "affine"
636
+ },
637
+ "model.layers.23.self_attn.v_proj": {
638
+ "group_size": 64,
639
+ "bits": 8,
640
+ "mode": "affine"
641
+ },
642
+ "model.layers.23.self_attn.o_proj": {
643
+ "group_size": 64,
644
+ "bits": 8,
645
+ "mode": "affine"
646
+ },
647
+ "model.layers.23.mlp.router": {
648
+ "group_size": 64,
649
+ "bits": 8
650
+ },
651
+ "model.layers.24.self_attn.q_proj": {
652
+ "group_size": 64,
653
+ "bits": 8,
654
+ "mode": "affine"
655
+ },
656
+ "model.layers.24.self_attn.k_proj": {
657
+ "group_size": 64,
658
+ "bits": 8,
659
+ "mode": "affine"
660
+ },
661
+ "model.layers.24.self_attn.v_proj": {
662
+ "group_size": 64,
663
+ "bits": 8,
664
+ "mode": "affine"
665
+ },
666
+ "model.layers.24.self_attn.o_proj": {
667
+ "group_size": 64,
668
+ "bits": 8,
669
+ "mode": "affine"
670
+ },
671
+ "model.layers.24.mlp.router": {
672
+ "group_size": 64,
673
+ "bits": 8
674
+ },
675
+ "model.layers.25.self_attn.q_proj": {
676
+ "group_size": 64,
677
+ "bits": 8,
678
+ "mode": "affine"
679
+ },
680
+ "model.layers.25.self_attn.k_proj": {
681
+ "group_size": 64,
682
+ "bits": 8,
683
+ "mode": "affine"
684
+ },
685
+ "model.layers.25.self_attn.v_proj": {
686
+ "group_size": 64,
687
+ "bits": 8,
688
+ "mode": "affine"
689
+ },
690
+ "model.layers.25.self_attn.o_proj": {
691
+ "group_size": 64,
692
+ "bits": 8,
693
+ "mode": "affine"
694
+ },
695
+ "model.layers.25.mlp.router": {
696
+ "group_size": 64,
697
+ "bits": 8
698
+ },
699
+ "model.layers.26.self_attn.q_proj": {
700
+ "group_size": 64,
701
+ "bits": 8,
702
+ "mode": "affine"
703
+ },
704
+ "model.layers.26.self_attn.k_proj": {
705
+ "group_size": 64,
706
+ "bits": 8,
707
+ "mode": "affine"
708
+ },
709
+ "model.layers.26.self_attn.v_proj": {
710
+ "group_size": 64,
711
+ "bits": 8,
712
+ "mode": "affine"
713
+ },
714
+ "model.layers.26.self_attn.o_proj": {
715
+ "group_size": 64,
716
+ "bits": 8,
717
+ "mode": "affine"
718
+ },
719
+ "model.layers.26.mlp.router": {
720
+ "group_size": 64,
721
+ "bits": 8
722
+ },
723
+ "model.layers.27.self_attn.q_proj": {
724
+ "group_size": 64,
725
+ "bits": 8,
726
+ "mode": "affine"
727
+ },
728
+ "model.layers.27.self_attn.k_proj": {
729
+ "group_size": 64,
730
+ "bits": 8,
731
+ "mode": "affine"
732
+ },
733
+ "model.layers.27.self_attn.v_proj": {
734
+ "group_size": 64,
735
+ "bits": 8,
736
+ "mode": "affine"
737
+ },
738
+ "model.layers.27.self_attn.o_proj": {
739
+ "group_size": 64,
740
+ "bits": 8,
741
+ "mode": "affine"
742
+ },
743
+ "model.layers.27.mlp.router": {
744
+ "group_size": 64,
745
+ "bits": 8
746
+ },
747
+ "model.layers.28.self_attn.q_proj": {
748
+ "group_size": 64,
749
+ "bits": 8,
750
+ "mode": "affine"
751
+ },
752
+ "model.layers.28.self_attn.k_proj": {
753
+ "group_size": 64,
754
+ "bits": 8,
755
+ "mode": "affine"
756
+ },
757
+ "model.layers.28.self_attn.v_proj": {
758
+ "group_size": 64,
759
+ "bits": 8,
760
+ "mode": "affine"
761
+ },
762
+ "model.layers.28.self_attn.o_proj": {
763
+ "group_size": 64,
764
+ "bits": 8,
765
+ "mode": "affine"
766
+ },
767
+ "model.layers.28.mlp.router": {
768
+ "group_size": 64,
769
+ "bits": 8
770
+ },
771
+ "model.layers.29.self_attn.q_proj": {
772
+ "group_size": 64,
773
+ "bits": 8,
774
+ "mode": "affine"
775
+ },
776
+ "model.layers.29.self_attn.k_proj": {
777
+ "group_size": 64,
778
+ "bits": 8,
779
+ "mode": "affine"
780
+ },
781
+ "model.layers.29.self_attn.v_proj": {
782
+ "group_size": 64,
783
+ "bits": 8,
784
+ "mode": "affine"
785
+ },
786
+ "model.layers.29.self_attn.o_proj": {
787
+ "group_size": 64,
788
+ "bits": 8,
789
+ "mode": "affine"
790
+ },
791
+ "model.layers.29.mlp.router": {
792
+ "group_size": 64,
793
+ "bits": 8
794
+ },
795
+ "model.layers.30.self_attn.q_proj": {
796
+ "group_size": 64,
797
+ "bits": 8,
798
+ "mode": "affine"
799
+ },
800
+ "model.layers.30.self_attn.k_proj": {
801
+ "group_size": 64,
802
+ "bits": 8,
803
+ "mode": "affine"
804
+ },
805
+ "model.layers.30.self_attn.v_proj": {
806
+ "group_size": 64,
807
+ "bits": 8,
808
+ "mode": "affine"
809
+ },
810
+ "model.layers.30.self_attn.o_proj": {
811
+ "group_size": 64,
812
+ "bits": 8,
813
+ "mode": "affine"
814
+ },
815
+ "model.layers.30.mlp.router": {
816
+ "group_size": 64,
817
+ "bits": 8
818
+ },
819
+ "model.layers.31.self_attn.q_proj": {
820
+ "group_size": 64,
821
+ "bits": 8,
822
+ "mode": "affine"
823
+ },
824
+ "model.layers.31.self_attn.k_proj": {
825
+ "group_size": 64,
826
+ "bits": 8,
827
+ "mode": "affine"
828
+ },
829
+ "model.layers.31.self_attn.v_proj": {
830
+ "group_size": 64,
831
+ "bits": 8,
832
+ "mode": "affine"
833
+ },
834
+ "model.layers.31.self_attn.o_proj": {
835
+ "group_size": 64,
836
+ "bits": 8,
837
+ "mode": "affine"
838
+ },
839
+ "model.layers.31.mlp.router": {
840
+ "group_size": 64,
841
+ "bits": 8
842
+ },
843
+ "model.layers.32.self_attn.q_proj": {
844
+ "group_size": 64,
845
+ "bits": 8,
846
+ "mode": "affine"
847
+ },
848
+ "model.layers.32.self_attn.k_proj": {
849
+ "group_size": 64,
850
+ "bits": 8,
851
+ "mode": "affine"
852
+ },
853
+ "model.layers.32.self_attn.v_proj": {
854
+ "group_size": 64,
855
+ "bits": 8,
856
+ "mode": "affine"
857
+ },
858
+ "model.layers.32.self_attn.o_proj": {
859
+ "group_size": 64,
860
+ "bits": 8,
861
+ "mode": "affine"
862
+ },
863
+ "model.layers.32.mlp.router": {
864
+ "group_size": 64,
865
+ "bits": 8
866
+ },
867
+ "model.layers.33.self_attn.q_proj": {
868
+ "group_size": 64,
869
+ "bits": 8,
870
+ "mode": "affine"
871
+ },
872
+ "model.layers.33.self_attn.k_proj": {
873
+ "group_size": 64,
874
+ "bits": 8,
875
+ "mode": "affine"
876
+ },
877
+ "model.layers.33.self_attn.v_proj": {
878
+ "group_size": 64,
879
+ "bits": 8,
880
+ "mode": "affine"
881
+ },
882
+ "model.layers.33.self_attn.o_proj": {
883
+ "group_size": 64,
884
+ "bits": 8,
885
+ "mode": "affine"
886
+ },
887
+ "model.layers.33.mlp.router": {
888
+ "group_size": 64,
889
+ "bits": 8
890
+ },
891
+ "model.layers.34.self_attn.q_proj": {
892
+ "group_size": 64,
893
+ "bits": 8,
894
+ "mode": "affine"
895
+ },
896
+ "model.layers.34.self_attn.k_proj": {
897
+ "group_size": 64,
898
+ "bits": 8,
899
+ "mode": "affine"
900
+ },
901
+ "model.layers.34.self_attn.v_proj": {
902
+ "group_size": 64,
903
+ "bits": 8,
904
+ "mode": "affine"
905
+ },
906
+ "model.layers.34.self_attn.o_proj": {
907
+ "group_size": 64,
908
+ "bits": 8,
909
+ "mode": "affine"
910
+ },
911
+ "model.layers.34.mlp.router": {
912
+ "group_size": 64,
913
+ "bits": 8
914
+ },
915
+ "model.layers.35.self_attn.q_proj": {
916
+ "group_size": 64,
917
+ "bits": 8,
918
+ "mode": "affine"
919
+ },
920
+ "model.layers.35.self_attn.k_proj": {
921
+ "group_size": 64,
922
+ "bits": 8,
923
+ "mode": "affine"
924
+ },
925
+ "model.layers.35.self_attn.v_proj": {
926
+ "group_size": 64,
927
+ "bits": 8,
928
+ "mode": "affine"
929
+ },
930
+ "model.layers.35.self_attn.o_proj": {
931
+ "group_size": 64,
932
+ "bits": 8,
933
+ "mode": "affine"
934
+ },
935
+ "model.layers.35.mlp.router": {
936
+ "group_size": 64,
937
+ "bits": 8
938
+ },
939
+ "lm_head": {
940
+ "group_size": 64,
941
+ "bits": 8,
942
+ "mode": "affine"
943
+ }
944
+ },
945
+ "quantization_config": {
946
+ "group_size": 32,
947
+ "bits": 4,
948
+ "mode": "mxfp4",
949
+ "model.embed_tokens": {
950
+ "group_size": 64,
951
+ "bits": 8,
952
+ "mode": "affine"
953
+ },
954
+ "model.layers.0.self_attn.q_proj": {
955
+ "group_size": 64,
956
+ "bits": 8,
957
+ "mode": "affine"
958
+ },
959
+ "model.layers.0.self_attn.k_proj": {
960
+ "group_size": 64,
961
+ "bits": 8,
962
+ "mode": "affine"
963
+ },
964
+ "model.layers.0.self_attn.v_proj": {
965
+ "group_size": 64,
966
+ "bits": 8,
967
+ "mode": "affine"
968
+ },
969
+ "model.layers.0.self_attn.o_proj": {
970
+ "group_size": 64,
971
+ "bits": 8,
972
+ "mode": "affine"
973
+ },
974
+ "model.layers.0.mlp.router": {
975
+ "group_size": 64,
976
+ "bits": 8
977
+ },
978
+ "model.layers.1.self_attn.q_proj": {
979
+ "group_size": 64,
980
+ "bits": 8,
981
+ "mode": "affine"
982
+ },
983
+ "model.layers.1.self_attn.k_proj": {
984
+ "group_size": 64,
985
+ "bits": 8,
986
+ "mode": "affine"
987
+ },
988
+ "model.layers.1.self_attn.v_proj": {
989
+ "group_size": 64,
990
+ "bits": 8,
991
+ "mode": "affine"
992
+ },
993
+ "model.layers.1.self_attn.o_proj": {
994
+ "group_size": 64,
995
+ "bits": 8,
996
+ "mode": "affine"
997
+ },
998
+ "model.layers.1.mlp.router": {
999
+ "group_size": 64,
1000
+ "bits": 8
1001
+ },
1002
+ "model.layers.2.self_attn.q_proj": {
1003
+ "group_size": 64,
1004
+ "bits": 8,
1005
+ "mode": "affine"
1006
+ },
1007
+ "model.layers.2.self_attn.k_proj": {
1008
+ "group_size": 64,
1009
+ "bits": 8,
1010
+ "mode": "affine"
1011
+ },
1012
+ "model.layers.2.self_attn.v_proj": {
1013
+ "group_size": 64,
1014
+ "bits": 8,
1015
+ "mode": "affine"
1016
+ },
1017
+ "model.layers.2.self_attn.o_proj": {
1018
+ "group_size": 64,
1019
+ "bits": 8,
1020
+ "mode": "affine"
1021
+ },
1022
+ "model.layers.2.mlp.router": {
1023
+ "group_size": 64,
1024
+ "bits": 8
1025
+ },
1026
+ "model.layers.3.self_attn.q_proj": {
1027
+ "group_size": 64,
1028
+ "bits": 8,
1029
+ "mode": "affine"
1030
+ },
1031
+ "model.layers.3.self_attn.k_proj": {
1032
+ "group_size": 64,
1033
+ "bits": 8,
1034
+ "mode": "affine"
1035
+ },
1036
+ "model.layers.3.self_attn.v_proj": {
1037
+ "group_size": 64,
1038
+ "bits": 8,
1039
+ "mode": "affine"
1040
+ },
1041
+ "model.layers.3.self_attn.o_proj": {
1042
+ "group_size": 64,
1043
+ "bits": 8,
1044
+ "mode": "affine"
1045
+ },
1046
+ "model.layers.3.mlp.router": {
1047
+ "group_size": 64,
1048
+ "bits": 8
1049
+ },
1050
+ "model.layers.4.self_attn.q_proj": {
1051
+ "group_size": 64,
1052
+ "bits": 8,
1053
+ "mode": "affine"
1054
+ },
1055
+ "model.layers.4.self_attn.k_proj": {
1056
+ "group_size": 64,
1057
+ "bits": 8,
1058
+ "mode": "affine"
1059
+ },
1060
+ "model.layers.4.self_attn.v_proj": {
1061
+ "group_size": 64,
1062
+ "bits": 8,
1063
+ "mode": "affine"
1064
+ },
1065
+ "model.layers.4.self_attn.o_proj": {
1066
+ "group_size": 64,
1067
+ "bits": 8,
1068
+ "mode": "affine"
1069
+ },
1070
+ "model.layers.4.mlp.router": {
1071
+ "group_size": 64,
1072
+ "bits": 8
1073
+ },
1074
+ "model.layers.5.self_attn.q_proj": {
1075
+ "group_size": 64,
1076
+ "bits": 8,
1077
+ "mode": "affine"
1078
+ },
1079
+ "model.layers.5.self_attn.k_proj": {
1080
+ "group_size": 64,
1081
+ "bits": 8,
1082
+ "mode": "affine"
1083
+ },
1084
+ "model.layers.5.self_attn.v_proj": {
1085
+ "group_size": 64,
1086
+ "bits": 8,
1087
+ "mode": "affine"
1088
+ },
1089
+ "model.layers.5.self_attn.o_proj": {
1090
+ "group_size": 64,
1091
+ "bits": 8,
1092
+ "mode": "affine"
1093
+ },
1094
+ "model.layers.5.mlp.router": {
1095
+ "group_size": 64,
1096
+ "bits": 8
1097
+ },
1098
+ "model.layers.6.self_attn.q_proj": {
1099
+ "group_size": 64,
1100
+ "bits": 8,
1101
+ "mode": "affine"
1102
+ },
1103
+ "model.layers.6.self_attn.k_proj": {
1104
+ "group_size": 64,
1105
+ "bits": 8,
1106
+ "mode": "affine"
1107
+ },
1108
+ "model.layers.6.self_attn.v_proj": {
1109
+ "group_size": 64,
1110
+ "bits": 8,
1111
+ "mode": "affine"
1112
+ },
1113
+ "model.layers.6.self_attn.o_proj": {
1114
+ "group_size": 64,
1115
+ "bits": 8,
1116
+ "mode": "affine"
1117
+ },
1118
+ "model.layers.6.mlp.router": {
1119
+ "group_size": 64,
1120
+ "bits": 8
1121
+ },
1122
+ "model.layers.7.self_attn.q_proj": {
1123
+ "group_size": 64,
1124
+ "bits": 8,
1125
+ "mode": "affine"
1126
+ },
1127
+ "model.layers.7.self_attn.k_proj": {
1128
+ "group_size": 64,
1129
+ "bits": 8,
1130
+ "mode": "affine"
1131
+ },
1132
+ "model.layers.7.self_attn.v_proj": {
1133
+ "group_size": 64,
1134
+ "bits": 8,
1135
+ "mode": "affine"
1136
+ },
1137
+ "model.layers.7.self_attn.o_proj": {
1138
+ "group_size": 64,
1139
+ "bits": 8,
1140
+ "mode": "affine"
1141
+ },
1142
+ "model.layers.7.mlp.router": {
1143
+ "group_size": 64,
1144
+ "bits": 8
1145
+ },
1146
+ "model.layers.8.self_attn.q_proj": {
1147
+ "group_size": 64,
1148
+ "bits": 8,
1149
+ "mode": "affine"
1150
+ },
1151
+ "model.layers.8.self_attn.k_proj": {
1152
+ "group_size": 64,
1153
+ "bits": 8,
1154
+ "mode": "affine"
1155
+ },
1156
+ "model.layers.8.self_attn.v_proj": {
1157
+ "group_size": 64,
1158
+ "bits": 8,
1159
+ "mode": "affine"
1160
+ },
1161
+ "model.layers.8.self_attn.o_proj": {
1162
+ "group_size": 64,
1163
+ "bits": 8,
1164
+ "mode": "affine"
1165
+ },
1166
+ "model.layers.8.mlp.router": {
1167
+ "group_size": 64,
1168
+ "bits": 8
1169
+ },
1170
+ "model.layers.9.self_attn.q_proj": {
1171
+ "group_size": 64,
1172
+ "bits": 8,
1173
+ "mode": "affine"
1174
+ },
1175
+ "model.layers.9.self_attn.k_proj": {
1176
+ "group_size": 64,
1177
+ "bits": 8,
1178
+ "mode": "affine"
1179
+ },
1180
+ "model.layers.9.self_attn.v_proj": {
1181
+ "group_size": 64,
1182
+ "bits": 8,
1183
+ "mode": "affine"
1184
+ },
1185
+ "model.layers.9.self_attn.o_proj": {
1186
+ "group_size": 64,
1187
+ "bits": 8,
1188
+ "mode": "affine"
1189
+ },
1190
+ "model.layers.9.mlp.router": {
1191
+ "group_size": 64,
1192
+ "bits": 8
1193
+ },
1194
+ "model.layers.10.self_attn.q_proj": {
1195
+ "group_size": 64,
1196
+ "bits": 8,
1197
+ "mode": "affine"
1198
+ },
1199
+ "model.layers.10.self_attn.k_proj": {
1200
+ "group_size": 64,
1201
+ "bits": 8,
1202
+ "mode": "affine"
1203
+ },
1204
+ "model.layers.10.self_attn.v_proj": {
1205
+ "group_size": 64,
1206
+ "bits": 8,
1207
+ "mode": "affine"
1208
+ },
1209
+ "model.layers.10.self_attn.o_proj": {
1210
+ "group_size": 64,
1211
+ "bits": 8,
1212
+ "mode": "affine"
1213
+ },
1214
+ "model.layers.10.mlp.router": {
1215
+ "group_size": 64,
1216
+ "bits": 8
1217
+ },
1218
+ "model.layers.11.self_attn.q_proj": {
1219
+ "group_size": 64,
1220
+ "bits": 8,
1221
+ "mode": "affine"
1222
+ },
1223
+ "model.layers.11.self_attn.k_proj": {
1224
+ "group_size": 64,
1225
+ "bits": 8,
1226
+ "mode": "affine"
1227
+ },
1228
+ "model.layers.11.self_attn.v_proj": {
1229
+ "group_size": 64,
1230
+ "bits": 8,
1231
+ "mode": "affine"
1232
+ },
1233
+ "model.layers.11.self_attn.o_proj": {
1234
+ "group_size": 64,
1235
+ "bits": 8,
1236
+ "mode": "affine"
1237
+ },
1238
+ "model.layers.11.mlp.router": {
1239
+ "group_size": 64,
1240
+ "bits": 8
1241
+ },
1242
+ "model.layers.12.self_attn.q_proj": {
1243
+ "group_size": 64,
1244
+ "bits": 8,
1245
+ "mode": "affine"
1246
+ },
1247
+ "model.layers.12.self_attn.k_proj": {
1248
+ "group_size": 64,
1249
+ "bits": 8,
1250
+ "mode": "affine"
1251
+ },
1252
+ "model.layers.12.self_attn.v_proj": {
1253
+ "group_size": 64,
1254
+ "bits": 8,
1255
+ "mode": "affine"
1256
+ },
1257
+ "model.layers.12.self_attn.o_proj": {
1258
+ "group_size": 64,
1259
+ "bits": 8,
1260
+ "mode": "affine"
1261
+ },
1262
+ "model.layers.12.mlp.router": {
1263
+ "group_size": 64,
1264
+ "bits": 8
1265
+ },
1266
+ "model.layers.13.self_attn.q_proj": {
1267
+ "group_size": 64,
1268
+ "bits": 8,
1269
+ "mode": "affine"
1270
+ },
1271
+ "model.layers.13.self_attn.k_proj": {
1272
+ "group_size": 64,
1273
+ "bits": 8,
1274
+ "mode": "affine"
1275
+ },
1276
+ "model.layers.13.self_attn.v_proj": {
1277
+ "group_size": 64,
1278
+ "bits": 8,
1279
+ "mode": "affine"
1280
+ },
1281
+ "model.layers.13.self_attn.o_proj": {
1282
+ "group_size": 64,
1283
+ "bits": 8,
1284
+ "mode": "affine"
1285
+ },
1286
+ "model.layers.13.mlp.router": {
1287
+ "group_size": 64,
1288
+ "bits": 8
1289
+ },
1290
+ "model.layers.14.self_attn.q_proj": {
1291
+ "group_size": 64,
1292
+ "bits": 8,
1293
+ "mode": "affine"
1294
+ },
1295
+ "model.layers.14.self_attn.k_proj": {
1296
+ "group_size": 64,
1297
+ "bits": 8,
1298
+ "mode": "affine"
1299
+ },
1300
+ "model.layers.14.self_attn.v_proj": {
1301
+ "group_size": 64,
1302
+ "bits": 8,
1303
+ "mode": "affine"
1304
+ },
1305
+ "model.layers.14.self_attn.o_proj": {
1306
+ "group_size": 64,
1307
+ "bits": 8,
1308
+ "mode": "affine"
1309
+ },
1310
+ "model.layers.14.mlp.router": {
1311
+ "group_size": 64,
1312
+ "bits": 8
1313
+ },
1314
+ "model.layers.15.self_attn.q_proj": {
1315
+ "group_size": 64,
1316
+ "bits": 8,
1317
+ "mode": "affine"
1318
+ },
1319
+ "model.layers.15.self_attn.k_proj": {
1320
+ "group_size": 64,
1321
+ "bits": 8,
1322
+ "mode": "affine"
1323
+ },
1324
+ "model.layers.15.self_attn.v_proj": {
1325
+ "group_size": 64,
1326
+ "bits": 8,
1327
+ "mode": "affine"
1328
+ },
1329
+ "model.layers.15.self_attn.o_proj": {
1330
+ "group_size": 64,
1331
+ "bits": 8,
1332
+ "mode": "affine"
1333
+ },
1334
+ "model.layers.15.mlp.router": {
1335
+ "group_size": 64,
1336
+ "bits": 8
1337
+ },
1338
+ "model.layers.16.self_attn.q_proj": {
1339
+ "group_size": 64,
1340
+ "bits": 8,
1341
+ "mode": "affine"
1342
+ },
1343
+ "model.layers.16.self_attn.k_proj": {
1344
+ "group_size": 64,
1345
+ "bits": 8,
1346
+ "mode": "affine"
1347
+ },
1348
+ "model.layers.16.self_attn.v_proj": {
1349
+ "group_size": 64,
1350
+ "bits": 8,
1351
+ "mode": "affine"
1352
+ },
1353
+ "model.layers.16.self_attn.o_proj": {
1354
+ "group_size": 64,
1355
+ "bits": 8,
1356
+ "mode": "affine"
1357
+ },
1358
+ "model.layers.16.mlp.router": {
1359
+ "group_size": 64,
1360
+ "bits": 8
1361
+ },
1362
+ "model.layers.17.self_attn.q_proj": {
1363
+ "group_size": 64,
1364
+ "bits": 8,
1365
+ "mode": "affine"
1366
+ },
1367
+ "model.layers.17.self_attn.k_proj": {
1368
+ "group_size": 64,
1369
+ "bits": 8,
1370
+ "mode": "affine"
1371
+ },
1372
+ "model.layers.17.self_attn.v_proj": {
1373
+ "group_size": 64,
1374
+ "bits": 8,
1375
+ "mode": "affine"
1376
+ },
1377
+ "model.layers.17.self_attn.o_proj": {
1378
+ "group_size": 64,
1379
+ "bits": 8,
1380
+ "mode": "affine"
1381
+ },
1382
+ "model.layers.17.mlp.router": {
1383
+ "group_size": 64,
1384
+ "bits": 8
1385
+ },
1386
+ "model.layers.18.self_attn.q_proj": {
1387
+ "group_size": 64,
1388
+ "bits": 8,
1389
+ "mode": "affine"
1390
+ },
1391
+ "model.layers.18.self_attn.k_proj": {
1392
+ "group_size": 64,
1393
+ "bits": 8,
1394
+ "mode": "affine"
1395
+ },
1396
+ "model.layers.18.self_attn.v_proj": {
1397
+ "group_size": 64,
1398
+ "bits": 8,
1399
+ "mode": "affine"
1400
+ },
1401
+ "model.layers.18.self_attn.o_proj": {
1402
+ "group_size": 64,
1403
+ "bits": 8,
1404
+ "mode": "affine"
1405
+ },
1406
+ "model.layers.18.mlp.router": {
1407
+ "group_size": 64,
1408
+ "bits": 8
1409
+ },
1410
+ "model.layers.19.self_attn.q_proj": {
1411
+ "group_size": 64,
1412
+ "bits": 8,
1413
+ "mode": "affine"
1414
+ },
1415
+ "model.layers.19.self_attn.k_proj": {
1416
+ "group_size": 64,
1417
+ "bits": 8,
1418
+ "mode": "affine"
1419
+ },
1420
+ "model.layers.19.self_attn.v_proj": {
1421
+ "group_size": 64,
1422
+ "bits": 8,
1423
+ "mode": "affine"
1424
+ },
1425
+ "model.layers.19.self_attn.o_proj": {
1426
+ "group_size": 64,
1427
+ "bits": 8,
1428
+ "mode": "affine"
1429
+ },
1430
+ "model.layers.19.mlp.router": {
1431
+ "group_size": 64,
1432
+ "bits": 8
1433
+ },
1434
+ "model.layers.20.self_attn.q_proj": {
1435
+ "group_size": 64,
1436
+ "bits": 8,
1437
+ "mode": "affine"
1438
+ },
1439
+ "model.layers.20.self_attn.k_proj": {
1440
+ "group_size": 64,
1441
+ "bits": 8,
1442
+ "mode": "affine"
1443
+ },
1444
+ "model.layers.20.self_attn.v_proj": {
1445
+ "group_size": 64,
1446
+ "bits": 8,
1447
+ "mode": "affine"
1448
+ },
1449
+ "model.layers.20.self_attn.o_proj": {
1450
+ "group_size": 64,
1451
+ "bits": 8,
1452
+ "mode": "affine"
1453
+ },
1454
+ "model.layers.20.mlp.router": {
1455
+ "group_size": 64,
1456
+ "bits": 8
1457
+ },
1458
+ "model.layers.21.self_attn.q_proj": {
1459
+ "group_size": 64,
1460
+ "bits": 8,
1461
+ "mode": "affine"
1462
+ },
1463
+ "model.layers.21.self_attn.k_proj": {
1464
+ "group_size": 64,
1465
+ "bits": 8,
1466
+ "mode": "affine"
1467
+ },
1468
+ "model.layers.21.self_attn.v_proj": {
1469
+ "group_size": 64,
1470
+ "bits": 8,
1471
+ "mode": "affine"
1472
+ },
1473
+ "model.layers.21.self_attn.o_proj": {
1474
+ "group_size": 64,
1475
+ "bits": 8,
1476
+ "mode": "affine"
1477
+ },
1478
+ "model.layers.21.mlp.router": {
1479
+ "group_size": 64,
1480
+ "bits": 8
1481
+ },
1482
+ "model.layers.22.self_attn.q_proj": {
1483
+ "group_size": 64,
1484
+ "bits": 8,
1485
+ "mode": "affine"
1486
+ },
1487
+ "model.layers.22.self_attn.k_proj": {
1488
+ "group_size": 64,
1489
+ "bits": 8,
1490
+ "mode": "affine"
1491
+ },
1492
+ "model.layers.22.self_attn.v_proj": {
1493
+ "group_size": 64,
1494
+ "bits": 8,
1495
+ "mode": "affine"
1496
+ },
1497
+ "model.layers.22.self_attn.o_proj": {
1498
+ "group_size": 64,
1499
+ "bits": 8,
1500
+ "mode": "affine"
1501
+ },
1502
+ "model.layers.22.mlp.router": {
1503
+ "group_size": 64,
1504
+ "bits": 8
1505
+ },
1506
+ "model.layers.23.self_attn.q_proj": {
1507
+ "group_size": 64,
1508
+ "bits": 8,
1509
+ "mode": "affine"
1510
+ },
1511
+ "model.layers.23.self_attn.k_proj": {
1512
+ "group_size": 64,
1513
+ "bits": 8,
1514
+ "mode": "affine"
1515
+ },
1516
+ "model.layers.23.self_attn.v_proj": {
1517
+ "group_size": 64,
1518
+ "bits": 8,
1519
+ "mode": "affine"
1520
+ },
1521
+ "model.layers.23.self_attn.o_proj": {
1522
+ "group_size": 64,
1523
+ "bits": 8,
1524
+ "mode": "affine"
1525
+ },
1526
+ "model.layers.23.mlp.router": {
1527
+ "group_size": 64,
1528
+ "bits": 8
1529
+ },
1530
+ "model.layers.24.self_attn.q_proj": {
1531
+ "group_size": 64,
1532
+ "bits": 8,
1533
+ "mode": "affine"
1534
+ },
1535
+ "model.layers.24.self_attn.k_proj": {
1536
+ "group_size": 64,
1537
+ "bits": 8,
1538
+ "mode": "affine"
1539
+ },
1540
+ "model.layers.24.self_attn.v_proj": {
1541
+ "group_size": 64,
1542
+ "bits": 8,
1543
+ "mode": "affine"
1544
+ },
1545
+ "model.layers.24.self_attn.o_proj": {
1546
+ "group_size": 64,
1547
+ "bits": 8,
1548
+ "mode": "affine"
1549
+ },
1550
+ "model.layers.24.mlp.router": {
1551
+ "group_size": 64,
1552
+ "bits": 8
1553
+ },
1554
+ "model.layers.25.self_attn.q_proj": {
1555
+ "group_size": 64,
1556
+ "bits": 8,
1557
+ "mode": "affine"
1558
+ },
1559
+ "model.layers.25.self_attn.k_proj": {
1560
+ "group_size": 64,
1561
+ "bits": 8,
1562
+ "mode": "affine"
1563
+ },
1564
+ "model.layers.25.self_attn.v_proj": {
1565
+ "group_size": 64,
1566
+ "bits": 8,
1567
+ "mode": "affine"
1568
+ },
1569
+ "model.layers.25.self_attn.o_proj": {
1570
+ "group_size": 64,
1571
+ "bits": 8,
1572
+ "mode": "affine"
1573
+ },
1574
+ "model.layers.25.mlp.router": {
1575
+ "group_size": 64,
1576
+ "bits": 8
1577
+ },
1578
+ "model.layers.26.self_attn.q_proj": {
1579
+ "group_size": 64,
1580
+ "bits": 8,
1581
+ "mode": "affine"
1582
+ },
1583
+ "model.layers.26.self_attn.k_proj": {
1584
+ "group_size": 64,
1585
+ "bits": 8,
1586
+ "mode": "affine"
1587
+ },
1588
+ "model.layers.26.self_attn.v_proj": {
1589
+ "group_size": 64,
1590
+ "bits": 8,
1591
+ "mode": "affine"
1592
+ },
1593
+ "model.layers.26.self_attn.o_proj": {
1594
+ "group_size": 64,
1595
+ "bits": 8,
1596
+ "mode": "affine"
1597
+ },
1598
+ "model.layers.26.mlp.router": {
1599
+ "group_size": 64,
1600
+ "bits": 8
1601
+ },
1602
+ "model.layers.27.self_attn.q_proj": {
1603
+ "group_size": 64,
1604
+ "bits": 8,
1605
+ "mode": "affine"
1606
+ },
1607
+ "model.layers.27.self_attn.k_proj": {
1608
+ "group_size": 64,
1609
+ "bits": 8,
1610
+ "mode": "affine"
1611
+ },
1612
+ "model.layers.27.self_attn.v_proj": {
1613
+ "group_size": 64,
1614
+ "bits": 8,
1615
+ "mode": "affine"
1616
+ },
1617
+ "model.layers.27.self_attn.o_proj": {
1618
+ "group_size": 64,
1619
+ "bits": 8,
1620
+ "mode": "affine"
1621
+ },
1622
+ "model.layers.27.mlp.router": {
1623
+ "group_size": 64,
1624
+ "bits": 8
1625
+ },
1626
+ "model.layers.28.self_attn.q_proj": {
1627
+ "group_size": 64,
1628
+ "bits": 8,
1629
+ "mode": "affine"
1630
+ },
1631
+ "model.layers.28.self_attn.k_proj": {
1632
+ "group_size": 64,
1633
+ "bits": 8,
1634
+ "mode": "affine"
1635
+ },
1636
+ "model.layers.28.self_attn.v_proj": {
1637
+ "group_size": 64,
1638
+ "bits": 8,
1639
+ "mode": "affine"
1640
+ },
1641
+ "model.layers.28.self_attn.o_proj": {
1642
+ "group_size": 64,
1643
+ "bits": 8,
1644
+ "mode": "affine"
1645
+ },
1646
+ "model.layers.28.mlp.router": {
1647
+ "group_size": 64,
1648
+ "bits": 8
1649
+ },
1650
+ "model.layers.29.self_attn.q_proj": {
1651
+ "group_size": 64,
1652
+ "bits": 8,
1653
+ "mode": "affine"
1654
+ },
1655
+ "model.layers.29.self_attn.k_proj": {
1656
+ "group_size": 64,
1657
+ "bits": 8,
1658
+ "mode": "affine"
1659
+ },
1660
+ "model.layers.29.self_attn.v_proj": {
1661
+ "group_size": 64,
1662
+ "bits": 8,
1663
+ "mode": "affine"
1664
+ },
1665
+ "model.layers.29.self_attn.o_proj": {
1666
+ "group_size": 64,
1667
+ "bits": 8,
1668
+ "mode": "affine"
1669
+ },
1670
+ "model.layers.29.mlp.router": {
1671
+ "group_size": 64,
1672
+ "bits": 8
1673
+ },
1674
+ "model.layers.30.self_attn.q_proj": {
1675
+ "group_size": 64,
1676
+ "bits": 8,
1677
+ "mode": "affine"
1678
+ },
1679
+ "model.layers.30.self_attn.k_proj": {
1680
+ "group_size": 64,
1681
+ "bits": 8,
1682
+ "mode": "affine"
1683
+ },
1684
+ "model.layers.30.self_attn.v_proj": {
1685
+ "group_size": 64,
1686
+ "bits": 8,
1687
+ "mode": "affine"
1688
+ },
1689
+ "model.layers.30.self_attn.o_proj": {
1690
+ "group_size": 64,
1691
+ "bits": 8,
1692
+ "mode": "affine"
1693
+ },
1694
+ "model.layers.30.mlp.router": {
1695
+ "group_size": 64,
1696
+ "bits": 8
1697
+ },
1698
+ "model.layers.31.self_attn.q_proj": {
1699
+ "group_size": 64,
1700
+ "bits": 8,
1701
+ "mode": "affine"
1702
+ },
1703
+ "model.layers.31.self_attn.k_proj": {
1704
+ "group_size": 64,
1705
+ "bits": 8,
1706
+ "mode": "affine"
1707
+ },
1708
+ "model.layers.31.self_attn.v_proj": {
1709
+ "group_size": 64,
1710
+ "bits": 8,
1711
+ "mode": "affine"
1712
+ },
1713
+ "model.layers.31.self_attn.o_proj": {
1714
+ "group_size": 64,
1715
+ "bits": 8,
1716
+ "mode": "affine"
1717
+ },
1718
+ "model.layers.31.mlp.router": {
1719
+ "group_size": 64,
1720
+ "bits": 8
1721
+ },
1722
+ "model.layers.32.self_attn.q_proj": {
1723
+ "group_size": 64,
1724
+ "bits": 8,
1725
+ "mode": "affine"
1726
+ },
1727
+ "model.layers.32.self_attn.k_proj": {
1728
+ "group_size": 64,
1729
+ "bits": 8,
1730
+ "mode": "affine"
1731
+ },
1732
+ "model.layers.32.self_attn.v_proj": {
1733
+ "group_size": 64,
1734
+ "bits": 8,
1735
+ "mode": "affine"
1736
+ },
1737
+ "model.layers.32.self_attn.o_proj": {
1738
+ "group_size": 64,
1739
+ "bits": 8,
1740
+ "mode": "affine"
1741
+ },
1742
+ "model.layers.32.mlp.router": {
1743
+ "group_size": 64,
1744
+ "bits": 8
1745
+ },
1746
+ "model.layers.33.self_attn.q_proj": {
1747
+ "group_size": 64,
1748
+ "bits": 8,
1749
+ "mode": "affine"
1750
+ },
1751
+ "model.layers.33.self_attn.k_proj": {
1752
+ "group_size": 64,
1753
+ "bits": 8,
1754
+ "mode": "affine"
1755
+ },
1756
+ "model.layers.33.self_attn.v_proj": {
1757
+ "group_size": 64,
1758
+ "bits": 8,
1759
+ "mode": "affine"
1760
+ },
1761
+ "model.layers.33.self_attn.o_proj": {
1762
+ "group_size": 64,
1763
+ "bits": 8,
1764
+ "mode": "affine"
1765
+ },
1766
+ "model.layers.33.mlp.router": {
1767
+ "group_size": 64,
1768
+ "bits": 8
1769
+ },
1770
+ "model.layers.34.self_attn.q_proj": {
1771
+ "group_size": 64,
1772
+ "bits": 8,
1773
+ "mode": "affine"
1774
+ },
1775
+ "model.layers.34.self_attn.k_proj": {
1776
+ "group_size": 64,
1777
+ "bits": 8,
1778
+ "mode": "affine"
1779
+ },
1780
+ "model.layers.34.self_attn.v_proj": {
1781
+ "group_size": 64,
1782
+ "bits": 8,
1783
+ "mode": "affine"
1784
+ },
1785
+ "model.layers.34.self_attn.o_proj": {
1786
+ "group_size": 64,
1787
+ "bits": 8,
1788
+ "mode": "affine"
1789
+ },
1790
+ "model.layers.34.mlp.router": {
1791
+ "group_size": 64,
1792
+ "bits": 8
1793
+ },
1794
+ "model.layers.35.self_attn.q_proj": {
1795
+ "group_size": 64,
1796
+ "bits": 8,
1797
+ "mode": "affine"
1798
+ },
1799
+ "model.layers.35.self_attn.k_proj": {
1800
+ "group_size": 64,
1801
+ "bits": 8,
1802
+ "mode": "affine"
1803
+ },
1804
+ "model.layers.35.self_attn.v_proj": {
1805
+ "group_size": 64,
1806
+ "bits": 8,
1807
+ "mode": "affine"
1808
+ },
1809
+ "model.layers.35.self_attn.o_proj": {
1810
+ "group_size": 64,
1811
+ "bits": 8,
1812
+ "mode": "affine"
1813
+ },
1814
+ "model.layers.35.mlp.router": {
1815
+ "group_size": 64,
1816
+ "bits": 8
1817
+ },
1818
+ "lm_head": {
1819
+ "group_size": 64,
1820
+ "bits": 8,
1821
+ "mode": "affine"
1822
+ }
1823
+ },
1824
+ "rms_norm_eps": 1e-05,
1825
+ "rope_scaling": {
1826
+ "beta_fast": 32.0,
1827
+ "beta_slow": 1.0,
1828
+ "factor": 32.0,
1829
+ "original_max_position_embeddings": 4096,
1830
+ "rope_type": "yarn",
1831
+ "truncate": false
1832
+ },
1833
+ "rope_theta": 150000,
1834
+ "router_aux_loss_coef": 0.9,
1835
+ "sliding_window": 128,
1836
+ "swiglu_limit": 7.0,
1837
+ "tie_word_embeddings": false,
1838
+ "transformers_version": "4.55.0.dev0",
1839
+ "use_cache": true,
1840
+ "vocab_size": 201088
1841
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 199998,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 200002,
6
+ 199999,
7
+ 200012
8
+ ],
9
+ "pad_token_id": 199999,
10
+ "transformers_version": "4.55.0.dev0"
11
+ }
model-00001-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95d8260eab571297cbbaa64e4e4bb592cebe12b8d5c909a795a8700f8567197c
3
+ size 5218855420
model-00002-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc34b600b8d685b75641fb11c15a80a5fb082b3a4c380beec73e5cd8ac63b404
3
+ size 5168680960
model-00003-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c017774dab5349f6195b0d37f197810c405131532b59856b5e04f6466f6a4fd
3
+ size 5168680922
model-00004-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e99bc06f575d97d0d2de77a7b424c50510f35017d97b43ade57734c534defff
3
+ size 5168680925
model-00005-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c26e8a78937520dfe6d1536866cbd459b2ba37e4c2221666af3033e401cca0
3
+ size 5168681018
model-00006-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb37c0c53c7fce7c0c42bd1db06ebd949f9fc8df3ec226c0f7f46dab83ac6c61
3
+ size 5168680976
model-00007-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1cf271bd613cd15721819b6cb7959dcfe28ee8d55b48d7beb5b89d0e09baa4c
3
+ size 5168681032
model-00008-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:877bfcf90e6abe6548a7d4e299fa0f6ae94d9a8f36885fb90455e21201c29adb
3
+ size 5168680982
model-00009-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d8255c3e5d213bf2734c9e865b8731e6ea6189f4b27efd86cf554169cbb1c34
3
+ size 5168681028
model-00010-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bed6083b95f5a01dcac9dd8b0329126866e9259e72468643986c8e5a26c009ac
3
+ size 5168681056
model-00011-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df1e87b277adf5f3d17b38426fb4867c8c243d82c1f1a9325e0d479db557b90c
3
+ size 5168681064
model-00012-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:841ee8b3487ad01324b28ef083e79a2f942a4e8d1c06cdde314fcb9e754b3350
3
+ size 5168681028
model-00013-of-00013.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9b1b135a69a4de118e7037ad49c3c3cdf53e528697e40c8ddb3ca50b16edbd1
3
+ size 1180490533
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
3
+ size 27868174
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|return|>",
6
+ "is_local": true,
7
+ "model_input_names": [
8
+ "input_ids",
9
+ "attention_mask"
10
+ ],
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "pad_token": "<|endoftext|>",
13
+ "tokenizer_class": "TokenizersBackend"
14
+ }