linzhao-amd commited on
Commit
2f8a9cc
·
verified ·
1 Parent(s): 664721e

upload moe-MXFP4 kv-fp8 model

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. chat_template.jinja +86 -0
  3. config.json +888 -0
  4. generation_config.json +10 -0
  5. model-00001-of-00043.safetensors +3 -0
  6. model-00002-of-00043.safetensors +3 -0
  7. model-00003-of-00043.safetensors +3 -0
  8. model-00004-of-00043.safetensors +3 -0
  9. model-00005-of-00043.safetensors +3 -0
  10. model-00006-of-00043.safetensors +3 -0
  11. model-00007-of-00043.safetensors +3 -0
  12. model-00008-of-00043.safetensors +3 -0
  13. model-00009-of-00043.safetensors +3 -0
  14. model-00010-of-00043.safetensors +3 -0
  15. model-00011-of-00043.safetensors +3 -0
  16. model-00012-of-00043.safetensors +3 -0
  17. model-00013-of-00043.safetensors +3 -0
  18. model-00014-of-00043.safetensors +3 -0
  19. model-00015-of-00043.safetensors +3 -0
  20. model-00016-of-00043.safetensors +3 -0
  21. model-00017-of-00043.safetensors +3 -0
  22. model-00018-of-00043.safetensors +3 -0
  23. model-00019-of-00043.safetensors +3 -0
  24. model-00020-of-00043.safetensors +3 -0
  25. model-00021-of-00043.safetensors +3 -0
  26. model-00022-of-00043.safetensors +3 -0
  27. model-00023-of-00043.safetensors +3 -0
  28. model-00024-of-00043.safetensors +3 -0
  29. model-00025-of-00043.safetensors +3 -0
  30. model-00026-of-00043.safetensors +3 -0
  31. model-00027-of-00043.safetensors +3 -0
  32. model-00028-of-00043.safetensors +3 -0
  33. model-00029-of-00043.safetensors +3 -0
  34. model-00030-of-00043.safetensors +3 -0
  35. model-00031-of-00043.safetensors +3 -0
  36. model-00032-of-00043.safetensors +3 -0
  37. model-00033-of-00043.safetensors +3 -0
  38. model-00034-of-00043.safetensors +3 -0
  39. model-00035-of-00043.safetensors +3 -0
  40. model-00036-of-00043.safetensors +3 -0
  41. model-00037-of-00043.safetensors +3 -0
  42. model-00038-of-00043.safetensors +3 -0
  43. model-00039-of-00043.safetensors +3 -0
  44. model-00040-of-00043.safetensors +3 -0
  45. model-00041-of-00043.safetensors +3 -0
  46. model-00042-of-00043.safetensors +3 -0
  47. model-00043-of-00043.safetensors +3 -0
  48. model.safetensors.index.json +0 -0
  49. special_tokens_map.json +34 -0
  50. tokenizer.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [gMASK]<sop>
2
+ {%- if tools -%}
3
+ <|system|>
4
+ # Tools
5
+
6
+ You may call one or more functions to assist with the user query.
7
+
8
+ You are provided with function signatures within <tools></tools> XML tags:
9
+ <tools>
10
+ {% for tool in tools %}
11
+ {{ tool | tojson(ensure_ascii=False) }}
12
+ {% endfor %}
13
+ </tools>
14
+
15
+ For each function call, output the function name and arguments within the following XML format:
16
+ <tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}
17
+ {%- macro visible_text(content) -%}
18
+ {%- if content is string -%}
19
+ {{- content }}
20
+ {%- elif content is iterable and content is not mapping -%}
21
+ {%- for item in content -%}
22
+ {%- if item is mapping and item.type == 'text' -%}
23
+ {{- item.text }}
24
+ {%- elif item is string -%}
25
+ {{- item }}
26
+ {%- endif -%}
27
+ {%- endfor -%}
28
+ {%- else -%}
29
+ {{- content }}
30
+ {%- endif -%}
31
+ {%- endmacro -%}
32
+ {%- set ns = namespace(last_user_index=-1) %}
33
+ {%- for m in messages %}
34
+ {%- if m.role == 'user' %}
35
+ {% set ns.last_user_index = loop.index0 -%}
36
+ {%- endif %}
37
+ {%- endfor %}
38
+ {% for m in messages %}
39
+ {%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}
40
+ {%- elif m.role == 'assistant' -%}
41
+ <|assistant|>
42
+ {%- set reasoning_content = '' %}
43
+ {%- set content = visible_text(m.content) %}
44
+ {%- if m.reasoning_content is string %}
45
+ {%- set reasoning_content = m.reasoning_content %}
46
+ {%- else %}
47
+ {%- if '</think>' in content %}
48
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
49
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
50
+ {%- endif %}
51
+ {%- endif %}
52
+ {%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}
53
+ {{ '<think>' + reasoning_content.strip() + '</think>'}}
54
+ {%- else -%}
55
+ {{ '</think>' }}
56
+ {%- endif -%}
57
+ {%- if content.strip() -%}
58
+ {{ content.strip() }}
59
+ {%- endif -%}
60
+ {% if m.tool_calls %}
61
+ {% for tc in m.tool_calls %}
62
+ {%- if tc.function %}
63
+ {%- set tc = tc.function %}
64
+ {%- endif %}
65
+ {{- '<tool_call>' + tc.name -}}
66
+ {% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}
67
+ {% endif %}
68
+ {%- elif m.role == 'tool' -%}
69
+ {%- if m.content is string -%}
70
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
71
+ {{- '<|observation|>' }}
72
+ {%- endif %}
73
+ {{- '<tool_response>' }}
74
+ {{- m.content }}
75
+ {{- '</tool_response>' }}
76
+ {%- else -%}
77
+ <|observation|>{% for tr in m.content %}
78
+ <tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}
79
+ {% endif -%}
80
+ {%- elif m.role == 'system' -%}
81
+ <|system|>{{ visible_text(m.content) }}
82
+ {%- endif -%}
83
+ {%- endfor -%}
84
+ {%- if add_generation_prompt -%}
85
+ <|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}}
86
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,888 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Glm4MoeForCausalLM"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": [
9
+ 151329,
10
+ 151336,
11
+ 151338
12
+ ],
13
+ "first_k_dense_replace": 3,
14
+ "head_dim": 128,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 5120,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 12288,
19
+ "max_position_embeddings": 202752,
20
+ "model_type": "glm4_moe",
21
+ "moe_intermediate_size": 1536,
22
+ "n_group": 1,
23
+ "n_routed_experts": 160,
24
+ "n_shared_experts": 1,
25
+ "norm_topk_prob": true,
26
+ "num_attention_heads": 96,
27
+ "num_experts_per_tok": 8,
28
+ "num_hidden_layers": 92,
29
+ "num_key_value_heads": 8,
30
+ "num_nextn_predict_layers": 1,
31
+ "pad_token_id": 151329,
32
+ "partial_rotary_factor": 0.5,
33
+ "quantization_config": {
34
+ "algo_config": null,
35
+ "exclude": [
36
+ "model.layers.0.self_attn.q_proj",
37
+ "model.layers.0.self_attn.k_proj",
38
+ "model.layers.0.self_attn.v_proj",
39
+ "model.layers.0.self_attn.o_proj",
40
+ "model.layers.0.mlp.gate_proj",
41
+ "model.layers.0.mlp.up_proj",
42
+ "model.layers.0.mlp.down_proj",
43
+ "model.layers.1.self_attn.q_proj",
44
+ "model.layers.1.self_attn.k_proj",
45
+ "model.layers.1.self_attn.v_proj",
46
+ "model.layers.1.self_attn.o_proj",
47
+ "model.layers.1.mlp.gate_proj",
48
+ "model.layers.1.mlp.up_proj",
49
+ "model.layers.1.mlp.down_proj",
50
+ "model.layers.2.self_attn.q_proj",
51
+ "model.layers.2.self_attn.k_proj",
52
+ "model.layers.2.self_attn.v_proj",
53
+ "model.layers.2.self_attn.o_proj",
54
+ "model.layers.2.mlp.gate_proj",
55
+ "model.layers.2.mlp.up_proj",
56
+ "model.layers.2.mlp.down_proj",
57
+ "model.layers.3.self_attn.q_proj",
58
+ "model.layers.3.self_attn.k_proj",
59
+ "model.layers.3.self_attn.v_proj",
60
+ "model.layers.3.self_attn.o_proj",
61
+ "model.layers.3.mlp.shared_experts.gate_proj",
62
+ "model.layers.3.mlp.shared_experts.up_proj",
63
+ "model.layers.3.mlp.shared_experts.down_proj",
64
+ "model.layers.4.self_attn.q_proj",
65
+ "model.layers.4.self_attn.k_proj",
66
+ "model.layers.4.self_attn.v_proj",
67
+ "model.layers.4.self_attn.o_proj",
68
+ "model.layers.4.mlp.shared_experts.gate_proj",
69
+ "model.layers.4.mlp.shared_experts.up_proj",
70
+ "model.layers.4.mlp.shared_experts.down_proj",
71
+ "model.layers.5.self_attn.q_proj",
72
+ "model.layers.5.self_attn.k_proj",
73
+ "model.layers.5.self_attn.v_proj",
74
+ "model.layers.5.self_attn.o_proj",
75
+ "model.layers.5.mlp.shared_experts.gate_proj",
76
+ "model.layers.5.mlp.shared_experts.up_proj",
77
+ "model.layers.5.mlp.shared_experts.down_proj",
78
+ "model.layers.6.self_attn.q_proj",
79
+ "model.layers.6.self_attn.k_proj",
80
+ "model.layers.6.self_attn.v_proj",
81
+ "model.layers.6.self_attn.o_proj",
82
+ "model.layers.6.mlp.shared_experts.gate_proj",
83
+ "model.layers.6.mlp.shared_experts.up_proj",
84
+ "model.layers.6.mlp.shared_experts.down_proj",
85
+ "model.layers.7.self_attn.q_proj",
86
+ "model.layers.7.self_attn.k_proj",
87
+ "model.layers.7.self_attn.v_proj",
88
+ "model.layers.7.self_attn.o_proj",
89
+ "model.layers.7.mlp.shared_experts.gate_proj",
90
+ "model.layers.7.mlp.shared_experts.up_proj",
91
+ "model.layers.7.mlp.shared_experts.down_proj",
92
+ "model.layers.8.self_attn.q_proj",
93
+ "model.layers.8.self_attn.k_proj",
94
+ "model.layers.8.self_attn.v_proj",
95
+ "model.layers.8.self_attn.o_proj",
96
+ "model.layers.8.mlp.shared_experts.gate_proj",
97
+ "model.layers.8.mlp.shared_experts.up_proj",
98
+ "model.layers.8.mlp.shared_experts.down_proj",
99
+ "model.layers.9.self_attn.q_proj",
100
+ "model.layers.9.self_attn.k_proj",
101
+ "model.layers.9.self_attn.v_proj",
102
+ "model.layers.9.self_attn.o_proj",
103
+ "model.layers.9.mlp.shared_experts.gate_proj",
104
+ "model.layers.9.mlp.shared_experts.up_proj",
105
+ "model.layers.9.mlp.shared_experts.down_proj",
106
+ "model.layers.10.self_attn.q_proj",
107
+ "model.layers.10.self_attn.k_proj",
108
+ "model.layers.10.self_attn.v_proj",
109
+ "model.layers.10.self_attn.o_proj",
110
+ "model.layers.10.mlp.shared_experts.gate_proj",
111
+ "model.layers.10.mlp.shared_experts.up_proj",
112
+ "model.layers.10.mlp.shared_experts.down_proj",
113
+ "model.layers.11.self_attn.q_proj",
114
+ "model.layers.11.self_attn.k_proj",
115
+ "model.layers.11.self_attn.v_proj",
116
+ "model.layers.11.self_attn.o_proj",
117
+ "model.layers.11.mlp.shared_experts.gate_proj",
118
+ "model.layers.11.mlp.shared_experts.up_proj",
119
+ "model.layers.11.mlp.shared_experts.down_proj",
120
+ "model.layers.12.self_attn.q_proj",
121
+ "model.layers.12.self_attn.k_proj",
122
+ "model.layers.12.self_attn.v_proj",
123
+ "model.layers.12.self_attn.o_proj",
124
+ "model.layers.12.mlp.shared_experts.gate_proj",
125
+ "model.layers.12.mlp.shared_experts.up_proj",
126
+ "model.layers.12.mlp.shared_experts.down_proj",
127
+ "model.layers.13.self_attn.q_proj",
128
+ "model.layers.13.self_attn.k_proj",
129
+ "model.layers.13.self_attn.v_proj",
130
+ "model.layers.13.self_attn.o_proj",
131
+ "model.layers.13.mlp.shared_experts.gate_proj",
132
+ "model.layers.13.mlp.shared_experts.up_proj",
133
+ "model.layers.13.mlp.shared_experts.down_proj",
134
+ "model.layers.14.self_attn.q_proj",
135
+ "model.layers.14.self_attn.k_proj",
136
+ "model.layers.14.self_attn.v_proj",
137
+ "model.layers.14.self_attn.o_proj",
138
+ "model.layers.14.mlp.shared_experts.gate_proj",
139
+ "model.layers.14.mlp.shared_experts.up_proj",
140
+ "model.layers.14.mlp.shared_experts.down_proj",
141
+ "model.layers.15.self_attn.q_proj",
142
+ "model.layers.15.self_attn.k_proj",
143
+ "model.layers.15.self_attn.v_proj",
144
+ "model.layers.15.self_attn.o_proj",
145
+ "model.layers.15.mlp.shared_experts.gate_proj",
146
+ "model.layers.15.mlp.shared_experts.up_proj",
147
+ "model.layers.15.mlp.shared_experts.down_proj",
148
+ "model.layers.16.self_attn.q_proj",
149
+ "model.layers.16.self_attn.k_proj",
150
+ "model.layers.16.self_attn.v_proj",
151
+ "model.layers.16.self_attn.o_proj",
152
+ "model.layers.16.mlp.shared_experts.gate_proj",
153
+ "model.layers.16.mlp.shared_experts.up_proj",
154
+ "model.layers.16.mlp.shared_experts.down_proj",
155
+ "model.layers.17.self_attn.q_proj",
156
+ "model.layers.17.self_attn.k_proj",
157
+ "model.layers.17.self_attn.v_proj",
158
+ "model.layers.17.self_attn.o_proj",
159
+ "model.layers.17.mlp.shared_experts.gate_proj",
160
+ "model.layers.17.mlp.shared_experts.up_proj",
161
+ "model.layers.17.mlp.shared_experts.down_proj",
162
+ "model.layers.18.self_attn.q_proj",
163
+ "model.layers.18.self_attn.k_proj",
164
+ "model.layers.18.self_attn.v_proj",
165
+ "model.layers.18.self_attn.o_proj",
166
+ "model.layers.18.mlp.shared_experts.gate_proj",
167
+ "model.layers.18.mlp.shared_experts.up_proj",
168
+ "model.layers.18.mlp.shared_experts.down_proj",
169
+ "model.layers.19.self_attn.q_proj",
170
+ "model.layers.19.self_attn.k_proj",
171
+ "model.layers.19.self_attn.v_proj",
172
+ "model.layers.19.self_attn.o_proj",
173
+ "model.layers.19.mlp.shared_experts.gate_proj",
174
+ "model.layers.19.mlp.shared_experts.up_proj",
175
+ "model.layers.19.mlp.shared_experts.down_proj",
176
+ "model.layers.20.self_attn.q_proj",
177
+ "model.layers.20.self_attn.k_proj",
178
+ "model.layers.20.self_attn.v_proj",
179
+ "model.layers.20.self_attn.o_proj",
180
+ "model.layers.20.mlp.shared_experts.gate_proj",
181
+ "model.layers.20.mlp.shared_experts.up_proj",
182
+ "model.layers.20.mlp.shared_experts.down_proj",
183
+ "model.layers.21.self_attn.q_proj",
184
+ "model.layers.21.self_attn.k_proj",
185
+ "model.layers.21.self_attn.v_proj",
186
+ "model.layers.21.self_attn.o_proj",
187
+ "model.layers.21.mlp.shared_experts.gate_proj",
188
+ "model.layers.21.mlp.shared_experts.up_proj",
189
+ "model.layers.21.mlp.shared_experts.down_proj",
190
+ "model.layers.22.self_attn.q_proj",
191
+ "model.layers.22.self_attn.k_proj",
192
+ "model.layers.22.self_attn.v_proj",
193
+ "model.layers.22.self_attn.o_proj",
194
+ "model.layers.22.mlp.shared_experts.gate_proj",
195
+ "model.layers.22.mlp.shared_experts.up_proj",
196
+ "model.layers.22.mlp.shared_experts.down_proj",
197
+ "model.layers.23.self_attn.q_proj",
198
+ "model.layers.23.self_attn.k_proj",
199
+ "model.layers.23.self_attn.v_proj",
200
+ "model.layers.23.self_attn.o_proj",
201
+ "model.layers.23.mlp.shared_experts.gate_proj",
202
+ "model.layers.23.mlp.shared_experts.up_proj",
203
+ "model.layers.23.mlp.shared_experts.down_proj",
204
+ "model.layers.24.self_attn.q_proj",
205
+ "model.layers.24.self_attn.k_proj",
206
+ "model.layers.24.self_attn.v_proj",
207
+ "model.layers.24.self_attn.o_proj",
208
+ "model.layers.24.mlp.shared_experts.gate_proj",
209
+ "model.layers.24.mlp.shared_experts.up_proj",
210
+ "model.layers.24.mlp.shared_experts.down_proj",
211
+ "model.layers.25.self_attn.q_proj",
212
+ "model.layers.25.self_attn.k_proj",
213
+ "model.layers.25.self_attn.v_proj",
214
+ "model.layers.25.self_attn.o_proj",
215
+ "model.layers.25.mlp.shared_experts.gate_proj",
216
+ "model.layers.25.mlp.shared_experts.up_proj",
217
+ "model.layers.25.mlp.shared_experts.down_proj",
218
+ "model.layers.26.self_attn.q_proj",
219
+ "model.layers.26.self_attn.k_proj",
220
+ "model.layers.26.self_attn.v_proj",
221
+ "model.layers.26.self_attn.o_proj",
222
+ "model.layers.26.mlp.shared_experts.gate_proj",
223
+ "model.layers.26.mlp.shared_experts.up_proj",
224
+ "model.layers.26.mlp.shared_experts.down_proj",
225
+ "model.layers.27.self_attn.q_proj",
226
+ "model.layers.27.self_attn.k_proj",
227
+ "model.layers.27.self_attn.v_proj",
228
+ "model.layers.27.self_attn.o_proj",
229
+ "model.layers.27.mlp.shared_experts.gate_proj",
230
+ "model.layers.27.mlp.shared_experts.up_proj",
231
+ "model.layers.27.mlp.shared_experts.down_proj",
232
+ "model.layers.28.self_attn.q_proj",
233
+ "model.layers.28.self_attn.k_proj",
234
+ "model.layers.28.self_attn.v_proj",
235
+ "model.layers.28.self_attn.o_proj",
236
+ "model.layers.28.mlp.shared_experts.gate_proj",
237
+ "model.layers.28.mlp.shared_experts.up_proj",
238
+ "model.layers.28.mlp.shared_experts.down_proj",
239
+ "model.layers.29.self_attn.q_proj",
240
+ "model.layers.29.self_attn.k_proj",
241
+ "model.layers.29.self_attn.v_proj",
242
+ "model.layers.29.self_attn.o_proj",
243
+ "model.layers.29.mlp.shared_experts.gate_proj",
244
+ "model.layers.29.mlp.shared_experts.up_proj",
245
+ "model.layers.29.mlp.shared_experts.down_proj",
246
+ "model.layers.30.self_attn.q_proj",
247
+ "model.layers.30.self_attn.k_proj",
248
+ "model.layers.30.self_attn.v_proj",
249
+ "model.layers.30.self_attn.o_proj",
250
+ "model.layers.30.mlp.shared_experts.gate_proj",
251
+ "model.layers.30.mlp.shared_experts.up_proj",
252
+ "model.layers.30.mlp.shared_experts.down_proj",
253
+ "model.layers.31.self_attn.q_proj",
254
+ "model.layers.31.self_attn.k_proj",
255
+ "model.layers.31.self_attn.v_proj",
256
+ "model.layers.31.self_attn.o_proj",
257
+ "model.layers.31.mlp.shared_experts.gate_proj",
258
+ "model.layers.31.mlp.shared_experts.up_proj",
259
+ "model.layers.31.mlp.shared_experts.down_proj",
260
+ "model.layers.32.self_attn.q_proj",
261
+ "model.layers.32.self_attn.k_proj",
262
+ "model.layers.32.self_attn.v_proj",
263
+ "model.layers.32.self_attn.o_proj",
264
+ "model.layers.32.mlp.shared_experts.gate_proj",
265
+ "model.layers.32.mlp.shared_experts.up_proj",
266
+ "model.layers.32.mlp.shared_experts.down_proj",
267
+ "model.layers.33.self_attn.q_proj",
268
+ "model.layers.33.self_attn.k_proj",
269
+ "model.layers.33.self_attn.v_proj",
270
+ "model.layers.33.self_attn.o_proj",
271
+ "model.layers.33.mlp.shared_experts.gate_proj",
272
+ "model.layers.33.mlp.shared_experts.up_proj",
273
+ "model.layers.33.mlp.shared_experts.down_proj",
274
+ "model.layers.34.self_attn.q_proj",
275
+ "model.layers.34.self_attn.k_proj",
276
+ "model.layers.34.self_attn.v_proj",
277
+ "model.layers.34.self_attn.o_proj",
278
+ "model.layers.34.mlp.shared_experts.gate_proj",
279
+ "model.layers.34.mlp.shared_experts.up_proj",
280
+ "model.layers.34.mlp.shared_experts.down_proj",
281
+ "model.layers.35.self_attn.q_proj",
282
+ "model.layers.35.self_attn.k_proj",
283
+ "model.layers.35.self_attn.v_proj",
284
+ "model.layers.35.self_attn.o_proj",
285
+ "model.layers.35.mlp.shared_experts.gate_proj",
286
+ "model.layers.35.mlp.shared_experts.up_proj",
287
+ "model.layers.35.mlp.shared_experts.down_proj",
288
+ "model.layers.36.self_attn.q_proj",
289
+ "model.layers.36.self_attn.k_proj",
290
+ "model.layers.36.self_attn.v_proj",
291
+ "model.layers.36.self_attn.o_proj",
292
+ "model.layers.36.mlp.shared_experts.gate_proj",
293
+ "model.layers.36.mlp.shared_experts.up_proj",
294
+ "model.layers.36.mlp.shared_experts.down_proj",
295
+ "model.layers.37.self_attn.q_proj",
296
+ "model.layers.37.self_attn.k_proj",
297
+ "model.layers.37.self_attn.v_proj",
298
+ "model.layers.37.self_attn.o_proj",
299
+ "model.layers.37.mlp.shared_experts.gate_proj",
300
+ "model.layers.37.mlp.shared_experts.up_proj",
301
+ "model.layers.37.mlp.shared_experts.down_proj",
302
+ "model.layers.38.self_attn.q_proj",
303
+ "model.layers.38.self_attn.k_proj",
304
+ "model.layers.38.self_attn.v_proj",
305
+ "model.layers.38.self_attn.o_proj",
306
+ "model.layers.38.mlp.shared_experts.gate_proj",
307
+ "model.layers.38.mlp.shared_experts.up_proj",
308
+ "model.layers.38.mlp.shared_experts.down_proj",
309
+ "model.layers.39.self_attn.q_proj",
310
+ "model.layers.39.self_attn.k_proj",
311
+ "model.layers.39.self_attn.v_proj",
312
+ "model.layers.39.self_attn.o_proj",
313
+ "model.layers.39.mlp.shared_experts.gate_proj",
314
+ "model.layers.39.mlp.shared_experts.up_proj",
315
+ "model.layers.39.mlp.shared_experts.down_proj",
316
+ "model.layers.40.self_attn.q_proj",
317
+ "model.layers.40.self_attn.k_proj",
318
+ "model.layers.40.self_attn.v_proj",
319
+ "model.layers.40.self_attn.o_proj",
320
+ "model.layers.40.mlp.shared_experts.gate_proj",
321
+ "model.layers.40.mlp.shared_experts.up_proj",
322
+ "model.layers.40.mlp.shared_experts.down_proj",
323
+ "model.layers.41.self_attn.q_proj",
324
+ "model.layers.41.self_attn.k_proj",
325
+ "model.layers.41.self_attn.v_proj",
326
+ "model.layers.41.self_attn.o_proj",
327
+ "model.layers.41.mlp.shared_experts.gate_proj",
328
+ "model.layers.41.mlp.shared_experts.up_proj",
329
+ "model.layers.41.mlp.shared_experts.down_proj",
330
+ "model.layers.42.self_attn.q_proj",
331
+ "model.layers.42.self_attn.k_proj",
332
+ "model.layers.42.self_attn.v_proj",
333
+ "model.layers.42.self_attn.o_proj",
334
+ "model.layers.42.mlp.shared_experts.gate_proj",
335
+ "model.layers.42.mlp.shared_experts.up_proj",
336
+ "model.layers.42.mlp.shared_experts.down_proj",
337
+ "model.layers.43.self_attn.q_proj",
338
+ "model.layers.43.self_attn.k_proj",
339
+ "model.layers.43.self_attn.v_proj",
340
+ "model.layers.43.self_attn.o_proj",
341
+ "model.layers.43.mlp.shared_experts.gate_proj",
342
+ "model.layers.43.mlp.shared_experts.up_proj",
343
+ "model.layers.43.mlp.shared_experts.down_proj",
344
+ "model.layers.44.self_attn.q_proj",
345
+ "model.layers.44.self_attn.k_proj",
346
+ "model.layers.44.self_attn.v_proj",
347
+ "model.layers.44.self_attn.o_proj",
348
+ "model.layers.44.mlp.shared_experts.gate_proj",
349
+ "model.layers.44.mlp.shared_experts.up_proj",
350
+ "model.layers.44.mlp.shared_experts.down_proj",
351
+ "model.layers.45.self_attn.q_proj",
352
+ "model.layers.45.self_attn.k_proj",
353
+ "model.layers.45.self_attn.v_proj",
354
+ "model.layers.45.self_attn.o_proj",
355
+ "model.layers.45.mlp.shared_experts.gate_proj",
356
+ "model.layers.45.mlp.shared_experts.up_proj",
357
+ "model.layers.45.mlp.shared_experts.down_proj",
358
+ "model.layers.46.self_attn.q_proj",
359
+ "model.layers.46.self_attn.k_proj",
360
+ "model.layers.46.self_attn.v_proj",
361
+ "model.layers.46.self_attn.o_proj",
362
+ "model.layers.46.mlp.shared_experts.gate_proj",
363
+ "model.layers.46.mlp.shared_experts.up_proj",
364
+ "model.layers.46.mlp.shared_experts.down_proj",
365
+ "model.layers.47.self_attn.q_proj",
366
+ "model.layers.47.self_attn.k_proj",
367
+ "model.layers.47.self_attn.v_proj",
368
+ "model.layers.47.self_attn.o_proj",
369
+ "model.layers.47.mlp.shared_experts.gate_proj",
370
+ "model.layers.47.mlp.shared_experts.up_proj",
371
+ "model.layers.47.mlp.shared_experts.down_proj",
372
+ "model.layers.48.self_attn.q_proj",
373
+ "model.layers.48.self_attn.k_proj",
374
+ "model.layers.48.self_attn.v_proj",
375
+ "model.layers.48.self_attn.o_proj",
376
+ "model.layers.48.mlp.shared_experts.gate_proj",
377
+ "model.layers.48.mlp.shared_experts.up_proj",
378
+ "model.layers.48.mlp.shared_experts.down_proj",
379
+ "model.layers.49.self_attn.q_proj",
380
+ "model.layers.49.self_attn.k_proj",
381
+ "model.layers.49.self_attn.v_proj",
382
+ "model.layers.49.self_attn.o_proj",
383
+ "model.layers.49.mlp.shared_experts.gate_proj",
384
+ "model.layers.49.mlp.shared_experts.up_proj",
385
+ "model.layers.49.mlp.shared_experts.down_proj",
386
+ "model.layers.50.self_attn.q_proj",
387
+ "model.layers.50.self_attn.k_proj",
388
+ "model.layers.50.self_attn.v_proj",
389
+ "model.layers.50.self_attn.o_proj",
390
+ "model.layers.50.mlp.shared_experts.gate_proj",
391
+ "model.layers.50.mlp.shared_experts.up_proj",
392
+ "model.layers.50.mlp.shared_experts.down_proj",
393
+ "model.layers.51.self_attn.q_proj",
394
+ "model.layers.51.self_attn.k_proj",
395
+ "model.layers.51.self_attn.v_proj",
396
+ "model.layers.51.self_attn.o_proj",
397
+ "model.layers.51.mlp.shared_experts.gate_proj",
398
+ "model.layers.51.mlp.shared_experts.up_proj",
399
+ "model.layers.51.mlp.shared_experts.down_proj",
400
+ "model.layers.52.self_attn.q_proj",
401
+ "model.layers.52.self_attn.k_proj",
402
+ "model.layers.52.self_attn.v_proj",
403
+ "model.layers.52.self_attn.o_proj",
404
+ "model.layers.52.mlp.shared_experts.gate_proj",
405
+ "model.layers.52.mlp.shared_experts.up_proj",
406
+ "model.layers.52.mlp.shared_experts.down_proj",
407
+ "model.layers.53.self_attn.q_proj",
408
+ "model.layers.53.self_attn.k_proj",
409
+ "model.layers.53.self_attn.v_proj",
410
+ "model.layers.53.self_attn.o_proj",
411
+ "model.layers.53.mlp.shared_experts.gate_proj",
412
+ "model.layers.53.mlp.shared_experts.up_proj",
413
+ "model.layers.53.mlp.shared_experts.down_proj",
414
+ "model.layers.54.self_attn.q_proj",
415
+ "model.layers.54.self_attn.k_proj",
416
+ "model.layers.54.self_attn.v_proj",
417
+ "model.layers.54.self_attn.o_proj",
418
+ "model.layers.54.mlp.shared_experts.gate_proj",
419
+ "model.layers.54.mlp.shared_experts.up_proj",
420
+ "model.layers.54.mlp.shared_experts.down_proj",
421
+ "model.layers.55.self_attn.q_proj",
422
+ "model.layers.55.self_attn.k_proj",
423
+ "model.layers.55.self_attn.v_proj",
424
+ "model.layers.55.self_attn.o_proj",
425
+ "model.layers.55.mlp.shared_experts.gate_proj",
426
+ "model.layers.55.mlp.shared_experts.up_proj",
427
+ "model.layers.55.mlp.shared_experts.down_proj",
428
+ "model.layers.56.self_attn.q_proj",
429
+ "model.layers.56.self_attn.k_proj",
430
+ "model.layers.56.self_attn.v_proj",
431
+ "model.layers.56.self_attn.o_proj",
432
+ "model.layers.56.mlp.shared_experts.gate_proj",
433
+ "model.layers.56.mlp.shared_experts.up_proj",
434
+ "model.layers.56.mlp.shared_experts.down_proj",
435
+ "model.layers.57.self_attn.q_proj",
436
+ "model.layers.57.self_attn.k_proj",
437
+ "model.layers.57.self_attn.v_proj",
438
+ "model.layers.57.self_attn.o_proj",
439
+ "model.layers.57.mlp.shared_experts.gate_proj",
440
+ "model.layers.57.mlp.shared_experts.up_proj",
441
+ "model.layers.57.mlp.shared_experts.down_proj",
442
+ "model.layers.58.self_attn.q_proj",
443
+ "model.layers.58.self_attn.k_proj",
444
+ "model.layers.58.self_attn.v_proj",
445
+ "model.layers.58.self_attn.o_proj",
446
+ "model.layers.58.mlp.shared_experts.gate_proj",
447
+ "model.layers.58.mlp.shared_experts.up_proj",
448
+ "model.layers.58.mlp.shared_experts.down_proj",
449
+ "model.layers.59.self_attn.q_proj",
450
+ "model.layers.59.self_attn.k_proj",
451
+ "model.layers.59.self_attn.v_proj",
452
+ "model.layers.59.self_attn.o_proj",
453
+ "model.layers.59.mlp.shared_experts.gate_proj",
454
+ "model.layers.59.mlp.shared_experts.up_proj",
455
+ "model.layers.59.mlp.shared_experts.down_proj",
456
+ "model.layers.60.self_attn.q_proj",
457
+ "model.layers.60.self_attn.k_proj",
458
+ "model.layers.60.self_attn.v_proj",
459
+ "model.layers.60.self_attn.o_proj",
460
+ "model.layers.60.mlp.shared_experts.gate_proj",
461
+ "model.layers.60.mlp.shared_experts.up_proj",
462
+ "model.layers.60.mlp.shared_experts.down_proj",
463
+ "model.layers.61.self_attn.q_proj",
464
+ "model.layers.61.self_attn.k_proj",
465
+ "model.layers.61.self_attn.v_proj",
466
+ "model.layers.61.self_attn.o_proj",
467
+ "model.layers.61.mlp.shared_experts.gate_proj",
468
+ "model.layers.61.mlp.shared_experts.up_proj",
469
+ "model.layers.61.mlp.shared_experts.down_proj",
470
+ "model.layers.62.self_attn.q_proj",
471
+ "model.layers.62.self_attn.k_proj",
472
+ "model.layers.62.self_attn.v_proj",
473
+ "model.layers.62.self_attn.o_proj",
474
+ "model.layers.62.mlp.shared_experts.gate_proj",
475
+ "model.layers.62.mlp.shared_experts.up_proj",
476
+ "model.layers.62.mlp.shared_experts.down_proj",
477
+ "model.layers.63.self_attn.q_proj",
478
+ "model.layers.63.self_attn.k_proj",
479
+ "model.layers.63.self_attn.v_proj",
480
+ "model.layers.63.self_attn.o_proj",
481
+ "model.layers.63.mlp.shared_experts.gate_proj",
482
+ "model.layers.63.mlp.shared_experts.up_proj",
483
+ "model.layers.63.mlp.shared_experts.down_proj",
484
+ "model.layers.64.self_attn.q_proj",
485
+ "model.layers.64.self_attn.k_proj",
486
+ "model.layers.64.self_attn.v_proj",
487
+ "model.layers.64.self_attn.o_proj",
488
+ "model.layers.64.mlp.shared_experts.gate_proj",
489
+ "model.layers.64.mlp.shared_experts.up_proj",
490
+ "model.layers.64.mlp.shared_experts.down_proj",
491
+ "model.layers.65.self_attn.q_proj",
492
+ "model.layers.65.self_attn.k_proj",
493
+ "model.layers.65.self_attn.v_proj",
494
+ "model.layers.65.self_attn.o_proj",
495
+ "model.layers.65.mlp.shared_experts.gate_proj",
496
+ "model.layers.65.mlp.shared_experts.up_proj",
497
+ "model.layers.65.mlp.shared_experts.down_proj",
498
+ "model.layers.66.self_attn.q_proj",
499
+ "model.layers.66.self_attn.k_proj",
500
+ "model.layers.66.self_attn.v_proj",
501
+ "model.layers.66.self_attn.o_proj",
502
+ "model.layers.66.mlp.shared_experts.gate_proj",
503
+ "model.layers.66.mlp.shared_experts.up_proj",
504
+ "model.layers.66.mlp.shared_experts.down_proj",
505
+ "model.layers.67.self_attn.q_proj",
506
+ "model.layers.67.self_attn.k_proj",
507
+ "model.layers.67.self_attn.v_proj",
508
+ "model.layers.67.self_attn.o_proj",
509
+ "model.layers.67.mlp.shared_experts.gate_proj",
510
+ "model.layers.67.mlp.shared_experts.up_proj",
511
+ "model.layers.67.mlp.shared_experts.down_proj",
512
+ "model.layers.68.self_attn.q_proj",
513
+ "model.layers.68.self_attn.k_proj",
514
+ "model.layers.68.self_attn.v_proj",
515
+ "model.layers.68.self_attn.o_proj",
516
+ "model.layers.68.mlp.shared_experts.gate_proj",
517
+ "model.layers.68.mlp.shared_experts.up_proj",
518
+ "model.layers.68.mlp.shared_experts.down_proj",
519
+ "model.layers.69.self_attn.q_proj",
520
+ "model.layers.69.self_attn.k_proj",
521
+ "model.layers.69.self_attn.v_proj",
522
+ "model.layers.69.self_attn.o_proj",
523
+ "model.layers.69.mlp.shared_experts.gate_proj",
524
+ "model.layers.69.mlp.shared_experts.up_proj",
525
+ "model.layers.69.mlp.shared_experts.down_proj",
526
+ "model.layers.70.self_attn.q_proj",
527
+ "model.layers.70.self_attn.k_proj",
528
+ "model.layers.70.self_attn.v_proj",
529
+ "model.layers.70.self_attn.o_proj",
530
+ "model.layers.70.mlp.shared_experts.gate_proj",
531
+ "model.layers.70.mlp.shared_experts.up_proj",
532
+ "model.layers.70.mlp.shared_experts.down_proj",
533
+ "model.layers.71.self_attn.q_proj",
534
+ "model.layers.71.self_attn.k_proj",
535
+ "model.layers.71.self_attn.v_proj",
536
+ "model.layers.71.self_attn.o_proj",
537
+ "model.layers.71.mlp.shared_experts.gate_proj",
538
+ "model.layers.71.mlp.shared_experts.up_proj",
539
+ "model.layers.71.mlp.shared_experts.down_proj",
540
+ "model.layers.72.self_attn.q_proj",
541
+ "model.layers.72.self_attn.k_proj",
542
+ "model.layers.72.self_attn.v_proj",
543
+ "model.layers.72.self_attn.o_proj",
544
+ "model.layers.72.mlp.shared_experts.gate_proj",
545
+ "model.layers.72.mlp.shared_experts.up_proj",
546
+ "model.layers.72.mlp.shared_experts.down_proj",
547
+ "model.layers.73.self_attn.q_proj",
548
+ "model.layers.73.self_attn.k_proj",
549
+ "model.layers.73.self_attn.v_proj",
550
+ "model.layers.73.self_attn.o_proj",
551
+ "model.layers.73.mlp.shared_experts.gate_proj",
552
+ "model.layers.73.mlp.shared_experts.up_proj",
553
+ "model.layers.73.mlp.shared_experts.down_proj",
554
+ "model.layers.74.self_attn.q_proj",
555
+ "model.layers.74.self_attn.k_proj",
556
+ "model.layers.74.self_attn.v_proj",
557
+ "model.layers.74.self_attn.o_proj",
558
+ "model.layers.74.mlp.shared_experts.gate_proj",
559
+ "model.layers.74.mlp.shared_experts.up_proj",
560
+ "model.layers.74.mlp.shared_experts.down_proj",
561
+ "model.layers.75.self_attn.q_proj",
562
+ "model.layers.75.self_attn.k_proj",
563
+ "model.layers.75.self_attn.v_proj",
564
+ "model.layers.75.self_attn.o_proj",
565
+ "model.layers.75.mlp.shared_experts.gate_proj",
566
+ "model.layers.75.mlp.shared_experts.up_proj",
567
+ "model.layers.75.mlp.shared_experts.down_proj",
568
+ "model.layers.76.self_attn.q_proj",
569
+ "model.layers.76.self_attn.k_proj",
570
+ "model.layers.76.self_attn.v_proj",
571
+ "model.layers.76.self_attn.o_proj",
572
+ "model.layers.76.mlp.shared_experts.gate_proj",
573
+ "model.layers.76.mlp.shared_experts.up_proj",
574
+ "model.layers.76.mlp.shared_experts.down_proj",
575
+ "model.layers.77.self_attn.q_proj",
576
+ "model.layers.77.self_attn.k_proj",
577
+ "model.layers.77.self_attn.v_proj",
578
+ "model.layers.77.self_attn.o_proj",
579
+ "model.layers.77.mlp.shared_experts.gate_proj",
580
+ "model.layers.77.mlp.shared_experts.up_proj",
581
+ "model.layers.77.mlp.shared_experts.down_proj",
582
+ "model.layers.78.self_attn.q_proj",
583
+ "model.layers.78.self_attn.k_proj",
584
+ "model.layers.78.self_attn.v_proj",
585
+ "model.layers.78.self_attn.o_proj",
586
+ "model.layers.78.mlp.shared_experts.gate_proj",
587
+ "model.layers.78.mlp.shared_experts.up_proj",
588
+ "model.layers.78.mlp.shared_experts.down_proj",
589
+ "model.layers.79.self_attn.q_proj",
590
+ "model.layers.79.self_attn.k_proj",
591
+ "model.layers.79.self_attn.v_proj",
592
+ "model.layers.79.self_attn.o_proj",
593
+ "model.layers.79.mlp.shared_experts.gate_proj",
594
+ "model.layers.79.mlp.shared_experts.up_proj",
595
+ "model.layers.79.mlp.shared_experts.down_proj",
596
+ "model.layers.80.self_attn.q_proj",
597
+ "model.layers.80.self_attn.k_proj",
598
+ "model.layers.80.self_attn.v_proj",
599
+ "model.layers.80.self_attn.o_proj",
600
+ "model.layers.80.mlp.shared_experts.gate_proj",
601
+ "model.layers.80.mlp.shared_experts.up_proj",
602
+ "model.layers.80.mlp.shared_experts.down_proj",
603
+ "model.layers.81.self_attn.q_proj",
604
+ "model.layers.81.self_attn.k_proj",
605
+ "model.layers.81.self_attn.v_proj",
606
+ "model.layers.81.self_attn.o_proj",
607
+ "model.layers.81.mlp.shared_experts.gate_proj",
608
+ "model.layers.81.mlp.shared_experts.up_proj",
609
+ "model.layers.81.mlp.shared_experts.down_proj",
610
+ "model.layers.82.self_attn.q_proj",
611
+ "model.layers.82.self_attn.k_proj",
612
+ "model.layers.82.self_attn.v_proj",
613
+ "model.layers.82.self_attn.o_proj",
614
+ "model.layers.82.mlp.shared_experts.gate_proj",
615
+ "model.layers.82.mlp.shared_experts.up_proj",
616
+ "model.layers.82.mlp.shared_experts.down_proj",
617
+ "model.layers.83.self_attn.q_proj",
618
+ "model.layers.83.self_attn.k_proj",
619
+ "model.layers.83.self_attn.v_proj",
620
+ "model.layers.83.self_attn.o_proj",
621
+ "model.layers.83.mlp.shared_experts.gate_proj",
622
+ "model.layers.83.mlp.shared_experts.up_proj",
623
+ "model.layers.83.mlp.shared_experts.down_proj",
624
+ "model.layers.84.self_attn.q_proj",
625
+ "model.layers.84.self_attn.k_proj",
626
+ "model.layers.84.self_attn.v_proj",
627
+ "model.layers.84.self_attn.o_proj",
628
+ "model.layers.84.mlp.shared_experts.gate_proj",
629
+ "model.layers.84.mlp.shared_experts.up_proj",
630
+ "model.layers.84.mlp.shared_experts.down_proj",
631
+ "model.layers.85.self_attn.q_proj",
632
+ "model.layers.85.self_attn.k_proj",
633
+ "model.layers.85.self_attn.v_proj",
634
+ "model.layers.85.self_attn.o_proj",
635
+ "model.layers.85.mlp.shared_experts.gate_proj",
636
+ "model.layers.85.mlp.shared_experts.up_proj",
637
+ "model.layers.85.mlp.shared_experts.down_proj",
638
+ "model.layers.86.self_attn.q_proj",
639
+ "model.layers.86.self_attn.k_proj",
640
+ "model.layers.86.self_attn.v_proj",
641
+ "model.layers.86.self_attn.o_proj",
642
+ "model.layers.86.mlp.shared_experts.gate_proj",
643
+ "model.layers.86.mlp.shared_experts.up_proj",
644
+ "model.layers.86.mlp.shared_experts.down_proj",
645
+ "model.layers.87.self_attn.q_proj",
646
+ "model.layers.87.self_attn.k_proj",
647
+ "model.layers.87.self_attn.v_proj",
648
+ "model.layers.87.self_attn.o_proj",
649
+ "model.layers.87.mlp.shared_experts.gate_proj",
650
+ "model.layers.87.mlp.shared_experts.up_proj",
651
+ "model.layers.87.mlp.shared_experts.down_proj",
652
+ "model.layers.88.self_attn.q_proj",
653
+ "model.layers.88.self_attn.k_proj",
654
+ "model.layers.88.self_attn.v_proj",
655
+ "model.layers.88.self_attn.o_proj",
656
+ "model.layers.88.mlp.shared_experts.gate_proj",
657
+ "model.layers.88.mlp.shared_experts.up_proj",
658
+ "model.layers.88.mlp.shared_experts.down_proj",
659
+ "model.layers.89.self_attn.q_proj",
660
+ "model.layers.89.self_attn.k_proj",
661
+ "model.layers.89.self_attn.v_proj",
662
+ "model.layers.89.self_attn.o_proj",
663
+ "model.layers.89.mlp.shared_experts.gate_proj",
664
+ "model.layers.89.mlp.shared_experts.up_proj",
665
+ "model.layers.89.mlp.shared_experts.down_proj",
666
+ "model.layers.90.self_attn.q_proj",
667
+ "model.layers.90.self_attn.k_proj",
668
+ "model.layers.90.self_attn.v_proj",
669
+ "model.layers.90.self_attn.o_proj",
670
+ "model.layers.90.mlp.shared_experts.gate_proj",
671
+ "model.layers.90.mlp.shared_experts.up_proj",
672
+ "model.layers.90.mlp.shared_experts.down_proj",
673
+ "model.layers.91.self_attn.q_proj",
674
+ "model.layers.91.self_attn.k_proj",
675
+ "model.layers.91.self_attn.v_proj",
676
+ "model.layers.91.self_attn.o_proj",
677
+ "model.layers.91.mlp.shared_experts.gate_proj",
678
+ "model.layers.91.mlp.shared_experts.up_proj",
679
+ "model.layers.91.mlp.shared_experts.down_proj",
680
+ "lm_head"
681
+ ],
682
+ "export": {
683
+ "kv_cache_group": [
684
+ "*k_proj",
685
+ "*v_proj"
686
+ ],
687
+ "min_kv_scale": 0.0,
688
+ "pack_method": "reorder",
689
+ "weight_format": "real_quantized",
690
+ "weight_merge_groups": null
691
+ },
692
+ "global_quant_config": {
693
+ "bias": null,
694
+ "input_tensors": {
695
+ "ch_axis": -1,
696
+ "dtype": "fp4",
697
+ "group_size": 32,
698
+ "is_dynamic": true,
699
+ "is_scale_quant": false,
700
+ "mx_element_dtype": null,
701
+ "observer_cls": "PerBlockMXObserver",
702
+ "qscheme": "per_group",
703
+ "round_method": "half_even",
704
+ "scale_calculation_mode": "even",
705
+ "scale_format": "e8m0",
706
+ "scale_type": "float",
707
+ "symmetric": null
708
+ },
709
+ "output_tensors": null,
710
+ "target_device": null,
711
+ "weight": {
712
+ "ch_axis": -1,
713
+ "dtype": "fp4",
714
+ "group_size": 32,
715
+ "is_dynamic": false,
716
+ "is_scale_quant": false,
717
+ "mx_element_dtype": null,
718
+ "observer_cls": "PerBlockMXObserver",
719
+ "qscheme": "per_group",
720
+ "round_method": "half_even",
721
+ "scale_calculation_mode": "even",
722
+ "scale_format": "e8m0",
723
+ "scale_type": "float",
724
+ "symmetric": null
725
+ }
726
+ },
727
+ "kv_cache_post_rope": false,
728
+ "kv_cache_quant_config": {
729
+ "*k_proj": {
730
+ "bias": null,
731
+ "input_tensors": null,
732
+ "output_tensors": {
733
+ "ch_axis": null,
734
+ "dtype": "fp8_e4m3",
735
+ "group_size": null,
736
+ "is_dynamic": false,
737
+ "is_scale_quant": false,
738
+ "mx_element_dtype": null,
739
+ "observer_cls": "PerTensorMinMaxObserver",
740
+ "qscheme": "per_tensor",
741
+ "round_method": "half_even",
742
+ "scale_calculation_mode": null,
743
+ "scale_format": null,
744
+ "scale_type": null,
745
+ "symmetric": true
746
+ },
747
+ "target_device": null,
748
+ "weight": null
749
+ },
750
+ "*v_proj": {
751
+ "bias": null,
752
+ "input_tensors": null,
753
+ "output_tensors": {
754
+ "ch_axis": null,
755
+ "dtype": "fp8_e4m3",
756
+ "group_size": null,
757
+ "is_dynamic": false,
758
+ "is_scale_quant": false,
759
+ "mx_element_dtype": null,
760
+ "observer_cls": "PerTensorMinMaxObserver",
761
+ "qscheme": "per_tensor",
762
+ "round_method": "half_even",
763
+ "scale_calculation_mode": null,
764
+ "scale_format": null,
765
+ "scale_type": null,
766
+ "symmetric": true
767
+ },
768
+ "target_device": null,
769
+ "weight": null
770
+ }
771
+ },
772
+ "layer_quant_config": {
773
+ "*k_proj": {
774
+ "bias": null,
775
+ "input_tensors": {
776
+ "ch_axis": -1,
777
+ "dtype": "fp4",
778
+ "group_size": 32,
779
+ "is_dynamic": true,
780
+ "is_scale_quant": false,
781
+ "mx_element_dtype": null,
782
+ "observer_cls": "PerBlockMXObserver",
783
+ "qscheme": "per_group",
784
+ "round_method": "half_even",
785
+ "scale_calculation_mode": "even",
786
+ "scale_format": "e8m0",
787
+ "scale_type": "float",
788
+ "symmetric": null
789
+ },
790
+ "output_tensors": {
791
+ "ch_axis": null,
792
+ "dtype": "fp8_e4m3",
793
+ "group_size": null,
794
+ "is_dynamic": false,
795
+ "is_scale_quant": false,
796
+ "mx_element_dtype": null,
797
+ "observer_cls": "PerTensorMinMaxObserver",
798
+ "qscheme": "per_tensor",
799
+ "round_method": "half_even",
800
+ "scale_calculation_mode": null,
801
+ "scale_format": null,
802
+ "scale_type": null,
803
+ "symmetric": true
804
+ },
805
+ "target_device": null,
806
+ "weight": {
807
+ "ch_axis": -1,
808
+ "dtype": "fp4",
809
+ "group_size": 32,
810
+ "is_dynamic": false,
811
+ "is_scale_quant": false,
812
+ "mx_element_dtype": null,
813
+ "observer_cls": "PerBlockMXObserver",
814
+ "qscheme": "per_group",
815
+ "round_method": "half_even",
816
+ "scale_calculation_mode": "even",
817
+ "scale_format": "e8m0",
818
+ "scale_type": "float",
819
+ "symmetric": null
820
+ }
821
+ },
822
+ "*v_proj": {
823
+ "bias": null,
824
+ "input_tensors": {
825
+ "ch_axis": -1,
826
+ "dtype": "fp4",
827
+ "group_size": 32,
828
+ "is_dynamic": true,
829
+ "is_scale_quant": false,
830
+ "mx_element_dtype": null,
831
+ "observer_cls": "PerBlockMXObserver",
832
+ "qscheme": "per_group",
833
+ "round_method": "half_even",
834
+ "scale_calculation_mode": "even",
835
+ "scale_format": "e8m0",
836
+ "scale_type": "float",
837
+ "symmetric": null
838
+ },
839
+ "output_tensors": {
840
+ "ch_axis": null,
841
+ "dtype": "fp8_e4m3",
842
+ "group_size": null,
843
+ "is_dynamic": false,
844
+ "is_scale_quant": false,
845
+ "mx_element_dtype": null,
846
+ "observer_cls": "PerTensorMinMaxObserver",
847
+ "qscheme": "per_tensor",
848
+ "round_method": "half_even",
849
+ "scale_calculation_mode": null,
850
+ "scale_format": null,
851
+ "scale_type": null,
852
+ "symmetric": true
853
+ },
854
+ "target_device": null,
855
+ "weight": {
856
+ "ch_axis": -1,
857
+ "dtype": "fp4",
858
+ "group_size": 32,
859
+ "is_dynamic": false,
860
+ "is_scale_quant": false,
861
+ "mx_element_dtype": null,
862
+ "observer_cls": "PerBlockMXObserver",
863
+ "qscheme": "per_group",
864
+ "round_method": "half_even",
865
+ "scale_calculation_mode": "even",
866
+ "scale_format": "e8m0",
867
+ "scale_type": "float",
868
+ "symmetric": null
869
+ }
870
+ }
871
+ },
872
+ "layer_type_quant_config": {},
873
+ "quant_method": "quark",
874
+ "quant_mode": "eager_mode",
875
+ "softmax_quant_spec": null,
876
+ "version": "0.12+86d5498526"
877
+ },
878
+ "rms_norm_eps": 1e-05,
879
+ "rope_scaling": null,
880
+ "rope_theta": 1000000,
881
+ "routed_scaling_factor": 2.5,
882
+ "tie_word_embeddings": false,
883
+ "topk_group": 1,
884
+ "transformers_version": "4.57.3",
885
+ "use_cache": true,
886
+ "use_qk_norm": true,
887
+ "vocab_size": 151552
888
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 151329,
5
+ 151336,
6
+ 151338
7
+ ],
8
+ "pad_token_id": 151329,
9
+ "transformers_version": "4.57.3"
10
+ }
model-00001-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa8a85062a20aa6d545b02208415beb8bfbf56902e5574d385b096cc3820e429
3
+ size 4999262112
model-00002-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51481d582dbca4f2cd38216239c5586728f5cdd923f71e2793f3c2ada3eeaf8f
3
+ size 4996665376
model-00003-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a6c025c5faed469b71068a5e91ef3987f7f212077a4e176044c4b7bde08373f
3
+ size 4996665392
model-00004-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7513f27fa412201ceb3157f5b14c8cafb22105badc3be44f29ca0d6e6821aec9
3
+ size 4945829980
model-00005-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7273faec42bf5506b3e6d8f0bf59f02af22b16ad97de354feda58a349be0234b
3
+ size 4997152264
model-00006-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49c0560c25d58a3431c3a18f9408be9d80e056c7db41b99799a2febbe7c2d765
3
+ size 4996667344
model-00007-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7723d6d4daed1914deb2a4f30bb18ce6661b19ca2c2afd1e32d66e8c39fdbf9
3
+ size 4996667344
model-00008-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31896b6f3ef62393d1c89fe3c3b26c374e2715b84069fab5a5baf10c40a3a45c
3
+ size 4996667344
model-00009-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acd9cf7bacf42508889ea44938131d0207717054a1c2ee14c00e2f285cb31907
3
+ size 4996667504
model-00010-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46e4969e58e04fea0aedf6d0e3a6a21e75788de03f4072d61c090de1b4c73cf8
3
+ size 4996667512
model-00011-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b403d11dccb7740c042700faa59cbdb46d568e740dfdd2693cc31b3ec4f5e7aa
3
+ size 4924941108
model-00012-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8378d961b519a8df1c94391a446dcb906b9acc8bfcb57cf878a7b5ab0ea526
3
+ size 4997152264
model-00013-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c428f2d9a67603bdb1a0c502c9f8163e9bfad5512ceda816a6e9c45c2b997ec2
3
+ size 4996667344
model-00014-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:213857e8ec146d9b5d3fc44b7f5a5a4c23f3e1aaa9c5ce5b3e4bbc290b8b36e9
3
+ size 4996667344
model-00015-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcab04185d210b1a9c3ca0190ddd8c26b0cdfb0f12c9631a96c2a284d51b7997
3
+ size 4996667344
model-00016-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7251724c010d85ec1fb66d5fe8813194e366953fdc5e82ebf2f6a077c3bce6c
3
+ size 4996667504
model-00017-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a7570588e6c57d4a390dbc8a0eb6061d7c0be8900f35b788e06f025d7cbe3c
3
+ size 4996667512
model-00018-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73a2ea8dc14c49bcd374d279bd6c53c4703845bc997fdbae6ca7cca5a366fb81
3
+ size 4924941108
model-00019-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c0f765e6e0619002643bdf46c92b09e9ce6bed40605cd3cd357b3075fca39c
3
+ size 4997152264
model-00020-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fec0e8202fd73e852d180876123ab960328445341b33b3d10931a62bd172dec
3
+ size 4996667344
model-00021-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e83d94fe78e4205cf1ee611e7fb03b33646e32d16b6265060356805d6d83b0d3
3
+ size 4996667344
model-00022-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d96d9134171308283f3294111141d2d1a67a1056448a12886c7215d668e9b28
3
+ size 4996667344
model-00023-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67643a0a9cbc2e36856fdd47dd677493a75e5c7656e970395619e4405689751c
3
+ size 4996667504
model-00024-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58c79ba4ed65166cf874cef171c6e22965a3bc9b30adcd1e4b333acb6bf6de55
3
+ size 4996667512
model-00025-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1f2c7e3322b901c4931571f177d1c2c9d582b14db41a9c58b03dde417d34438
3
+ size 4924941108
model-00026-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d25e5a2501488d6d0d4745949b315cd643baf26ab786957f3ff6ca035006d19
3
+ size 4997152264
model-00027-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfd868c17bd59c84707540cc14a03594504d6d94da13470b216c3645655c3d9c
3
+ size 4996667344
model-00028-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73f261e78b9446ae1103e68079a4c8c50acb8646c74c4b023319b2ee942b73fe
3
+ size 4996667344
model-00029-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d15062b0ab45e9bc85930ea994d19e32266d3c777f5003c8400762fe9102e2d
3
+ size 4996667344
model-00030-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e5aa03de144c6c34e9657f334e3b8a0f2dd699b0dbfa3e50b182bc993551c5a
3
+ size 4996667504
model-00031-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28be47b4cef5ee319d4468f41c6a800a8f3d7111db946e4e4c53ada73b96a0c8
3
+ size 4996667512
model-00032-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66212c0aa3d80f51f9d90996c724d6c5a46b70f4d1012803456a6016ce835fa0
3
+ size 4924941108
model-00033-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5700bc43e28db43246ec345399656b59d44bcaada3a40cb127c8176fd8e1bf6
3
+ size 4997152264
model-00034-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9c6ce5b990427ef437ccf26361c378f6d2774ddf02068fef07ed11d1a58f88a
3
+ size 4996667344
model-00035-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8343f4dc2421768a2f0141a62bf5b9584b00b5d9220b658369a1f4bfb958ddc5
3
+ size 4996667344
model-00036-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a1824b8e6366be26a62d06fd01bd1113558177fa076cdfecd732176f0eb3c5b
3
+ size 4996667344
model-00037-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36bc26ce64e706174888ea5320077dd7da06a08709a06e3c2235fbd42a4dc2d9
3
+ size 4996667504
model-00038-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f40ab1a2568c2382e341a019b28b761d70c199cf18cb9a9ecbea0c3b7c9b1420
3
+ size 4996667512
model-00039-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edd4d023883afbd38a9af3afb3cb70f66acb4dfd2b6fbeb5a520d59dfaac5027
3
+ size 4924941108
model-00040-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ed9c48d2c1336a38a4b4f77c410e9bfb06e6486cd4817c72f6aa8f38c739acf
3
+ size 4997152264
model-00041-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7ec2aab5238068627a4ab98aef9a7f0b7476980f282ead76bd2f77a5808c3d6
3
+ size 4996667344
model-00042-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e1aaaff029839e6fdc2c690d0d6c245f9aa8b079ffe63c2f87c2024d597d3cf
3
+ size 4996667344
model-00043-of-00043.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a2ac56fc0d97a7ade5134400fbaf340b76fb7f7b8f2d1ce685a0e6738180a5c
3
+ size 2703786720
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "[MASK]",
5
+ "[gMASK]",
6
+ "[sMASK]",
7
+ "<sop>",
8
+ "<eop>",
9
+ "<|system|>",
10
+ "<|user|>",
11
+ "<|assistant|>",
12
+ "<|observation|>",
13
+ "<|begin_of_image|>",
14
+ "<|end_of_image|>",
15
+ "<|begin_of_video|>",
16
+ "<|end_of_video|>",
17
+ "<|begin_of_audio|>",
18
+ "<|end_of_audio|>",
19
+ "<|begin_of_transcription|>",
20
+ "<|end_of_transcription|>",
21
+ "<|code_prefix|>",
22
+ "<|code_middle|>",
23
+ "<|code_suffix|>",
24
+ "/nothink"
25
+ ],
26
+ "eos_token": {
27
+ "content": "<|endoftext|>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "pad_token": "<|endoftext|>"
34
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8193dbf0c325b82710856f752ac57dc957338fb85d6a23bb388fc110b8b58b76
3
+ size 19970965