baseten-admin committed on
Commit
9167693
·
verified ·
1 Parent(s): 79638ea

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. chat_template.jinja +108 -0
  3. config.json +398 -0
  4. configuration_deepseek.py +212 -0
  5. generation_config.json +4 -0
  6. hf_quant_config.json +319 -0
  7. model-00001-of-00123.safetensors +3 -0
  8. model-00002-of-00123.safetensors +3 -0
  9. model-00003-of-00123.safetensors +3 -0
  10. model-00004-of-00123.safetensors +3 -0
  11. model-00005-of-00123.safetensors +3 -0
  12. model-00006-of-00123.safetensors +3 -0
  13. model-00007-of-00123.safetensors +3 -0
  14. model-00008-of-00123.safetensors +3 -0
  15. model-00009-of-00123.safetensors +3 -0
  16. model-00010-of-00123.safetensors +3 -0
  17. model-00011-of-00123.safetensors +3 -0
  18. model-00012-of-00123.safetensors +3 -0
  19. model-00013-of-00123.safetensors +3 -0
  20. model-00014-of-00123.safetensors +3 -0
  21. model-00015-of-00123.safetensors +3 -0
  22. model-00016-of-00123.safetensors +3 -0
  23. model-00017-of-00123.safetensors +3 -0
  24. model-00018-of-00123.safetensors +3 -0
  25. model-00019-of-00123.safetensors +3 -0
  26. model-00020-of-00123.safetensors +3 -0
  27. model-00021-of-00123.safetensors +3 -0
  28. model-00022-of-00123.safetensors +3 -0
  29. model-00023-of-00123.safetensors +3 -0
  30. model-00024-of-00123.safetensors +3 -0
  31. model-00025-of-00123.safetensors +3 -0
  32. model-00026-of-00123.safetensors +3 -0
  33. model-00027-of-00123.safetensors +3 -0
  34. model-00028-of-00123.safetensors +3 -0
  35. model-00029-of-00123.safetensors +3 -0
  36. model-00030-of-00123.safetensors +3 -0
  37. model-00031-of-00123.safetensors +3 -0
  38. model-00032-of-00123.safetensors +3 -0
  39. model-00033-of-00123.safetensors +3 -0
  40. model-00034-of-00123.safetensors +3 -0
  41. model-00035-of-00123.safetensors +3 -0
  42. model-00036-of-00123.safetensors +3 -0
  43. model-00037-of-00123.safetensors +3 -0
  44. model-00038-of-00123.safetensors +3 -0
  45. model-00039-of-00123.safetensors +3 -0
  46. model-00040-of-00123.safetensors +3 -0
  47. model-00041-of-00123.safetensors +3 -0
  48. model-00042-of-00123.safetensors +3 -0
  49. model-00043-of-00123.safetensors +3 -0
  50. model-00044-of-00123.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro render_content(msg) -%}
2
+ {%- set c = msg.get('content') -%}
3
+ {%- if c is string -%}
4
+ {{ c }}
5
+ {%- elif c is not none -%}
6
+ {% for content in c -%}
7
+ {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
8
+ <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
9
+ {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
10
+ <|kimi_k25_video_placeholder|>
11
+ {% else -%}
12
+ {{ content['text'] }}
13
+ {%- endif -%}
14
+ {%- endfor -%}
15
+ {%- endif -%}
16
+ {%- endmacro -%}
17
+
18
+ {% macro set_roles(message) -%}
19
+ {%- set role_name = message.get('name') or message['role'] -%}
20
+ {%- if message['role'] == 'user' -%}
21
+ <|im_user|>{{role_name}}<|im_middle|>
22
+ {%- elif message['role'] == 'assistant' -%}
23
+ <|im_assistant|>{{role_name}}<|im_middle|>
24
+ {%- else -%}
25
+ <|im_system|>{{role_name}}<|im_middle|>
26
+ {%- endif -%}
27
+ {%- endmacro -%}
28
+
29
+
30
+ {%- macro render_toolcalls(message) -%}
31
+ <|tool_calls_section_begin|>
32
+ {%- for tool_call in message['tool_calls'] -%}
33
+ {%- set formatted_id = tool_call['id'] -%}
34
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
35
+ {%- endfor -%}
36
+ <|tool_calls_section_end|>
37
+ {%- endmacro -%}
38
+
39
+
40
+ {# Find last non-tool-call assistant message #}
41
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
42
+ {%- for idx in range(messages|length-1, -1, -1) -%}
43
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
44
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
45
+ {%- break -%}
46
+ {%- endif -%}
47
+ {%- endfor -%}
48
+
49
+ {# Split all messages into history & suffix; reasoning_content in suffix should be preserved. #}
50
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
51
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
52
+
53
+ {%- if tools -%}
54
+ {%- if tools_ts_str -%}
55
+ <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
56
+ {%- else -%}
57
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+
61
+ {%- for message in hist_msgs -%}
62
+ {{set_roles(message)}}
63
+ {%- if message['role'] == 'assistant' -%}
64
+ <think></think>{{render_content(message)}}
65
+ {%- if message.get('tool_calls') -%}
66
+ {{render_toolcalls(message)}}
67
+ {%- endif -%}
68
+ {%- elif message['role'] == 'tool' -%}
69
+ {%- set tool_call_id = message.tool_call_id -%}
70
+ ## Return of {{ tool_call_id }}
71
+ {{render_content(message)}}
72
+ {%- elif message['content'] is not none -%}
73
+ {{render_content(message)}}
74
+ {%- endif -%}
75
+ <|im_end|>
76
+ {%- endfor -%}
77
+
78
+ {%- for message in suffix_msgs -%}
79
+ {{set_roles(message)}}
80
+ {%- if message['role'] == 'assistant' -%}
81
+ {%- if thinking is defined and thinking is false -%}
82
+ <think></think>{{render_content(message)}}
83
+ {%- else -%}
84
+ {%- set rc = message.get('reasoning_content', '') -%}
85
+ <think>{{rc}}</think>{{render_content(message)}}
86
+ {%- endif -%}
87
+ {%- if message.get('tool_calls') -%}
88
+ {{render_toolcalls(message)}}
89
+ {%- endif -%}
90
+ {%- elif message['role'] == 'tool' -%}
91
+ {%- set tool_call_id = message.tool_call_id -%}
92
+ ## Return of {{ tool_call_id }}
93
+ {{render_content(message)}}
94
+ {%- elif message['content'] is not none -%}
95
+ {{render_content(message)}}
96
+ {%- endif -%}
97
+ <|im_end|>
98
+ {%- endfor -%}
99
+
100
+
101
+ {%- if add_generation_prompt -%}
102
+ <|im_assistant|>assistant<|im_middle|>
103
+ {%- if thinking is defined and thinking is false -%}
104
+ <think></think>
105
+ {%- else -%}
106
+ <think>
107
+ {%- endif -%}
108
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": false,
3
+ "architectures": [
4
+ "DeepseekV3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
10
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
11
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
12
+ },
13
+ "aux_loss_alpha": 0.001,
14
+ "bos_token_id": 163584,
15
+ "dtype": "bfloat16",
16
+ "eos_token_id": 163586,
17
+ "ep_size": 1,
18
+ "first_k_dense_replace": 1,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 7168,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 18432,
23
+ "kv_lora_rank": 512,
24
+ "max_position_embeddings": 262144,
25
+ "model_type": "deepseek_v3",
26
+ "moe_intermediate_size": 2048,
27
+ "moe_layer_freq": 1,
28
+ "n_group": 1,
29
+ "n_routed_experts": 384,
30
+ "n_shared_experts": 1,
31
+ "norm_topk_prob": true,
32
+ "num_attention_heads": 64,
33
+ "num_experts_per_tok": 8,
34
+ "num_hidden_layers": 61,
35
+ "num_key_value_heads": 64,
36
+ "num_nextn_predict_layers": 0,
37
+ "pad_token_id": 163839,
38
+ "pretraining_tp": 1,
39
+ "q_lora_rank": 1536,
40
+ "qk_nope_head_dim": 128,
41
+ "qk_rope_head_dim": 64,
42
+ "rms_norm_eps": 1e-05,
43
+ "rope_scaling": {
44
+ "beta_fast": 1.0,
45
+ "beta_slow": 1.0,
46
+ "factor": 64.0,
47
+ "mscale": 1.0,
48
+ "mscale_all_dim": 1.0,
49
+ "original_max_position_embeddings": 4096,
50
+ "type": "yarn"
51
+ },
52
+ "rope_theta": 50000.0,
53
+ "routed_scaling_factor": 2.827,
54
+ "scoring_func": "sigmoid",
55
+ "seq_aux": true,
56
+ "tie_word_embeddings": false,
57
+ "topk_group": 1,
58
+ "topk_method": "noaux_tc",
59
+ "transformers_version": "4.57.6",
60
+ "use_cache": true,
61
+ "v_head_dim": 128,
62
+ "vocab_size": 163840,
63
+ "quantization_config": {
64
+ "config_groups": {
65
+ "group_0": {
66
+ "input_activations": {
67
+ "dynamic": false,
68
+ "num_bits": 4,
69
+ "type": "float",
70
+ "group_size": 16
71
+ },
72
+ "weights": {
73
+ "dynamic": false,
74
+ "num_bits": 4,
75
+ "type": "float",
76
+ "group_size": 16
77
+ },
78
+ "targets": [
79
+ "Linear"
80
+ ]
81
+ }
82
+ },
83
+ "ignore": [
84
+ "lm_head",
85
+ "model.layers.0.mlp*",
86
+ "model.layers.0.self_attn.kv_a_proj_with_mqa",
87
+ "model.layers.0.self_attn.kv_b_proj",
88
+ "model.layers.0.self_attn.q_a_proj",
89
+ "model.layers.0.self_attn.q_b_proj",
90
+ "model.layers.1.mlp.shared_experts*",
91
+ "model.layers.1.self_attn.kv_a_proj_with_mqa",
92
+ "model.layers.1.self_attn.kv_b_proj",
93
+ "model.layers.1.self_attn.q_a_proj",
94
+ "model.layers.1.self_attn.q_b_proj",
95
+ "model.layers.10.mlp.shared_experts*",
96
+ "model.layers.10.self_attn.kv_a_proj_with_mqa",
97
+ "model.layers.10.self_attn.kv_b_proj",
98
+ "model.layers.10.self_attn.q_a_proj",
99
+ "model.layers.10.self_attn.q_b_proj",
100
+ "model.layers.11.mlp.shared_experts*",
101
+ "model.layers.11.self_attn.kv_a_proj_with_mqa",
102
+ "model.layers.11.self_attn.kv_b_proj",
103
+ "model.layers.11.self_attn.q_a_proj",
104
+ "model.layers.11.self_attn.q_b_proj",
105
+ "model.layers.12.mlp.shared_experts*",
106
+ "model.layers.12.self_attn.kv_a_proj_with_mqa",
107
+ "model.layers.12.self_attn.kv_b_proj",
108
+ "model.layers.12.self_attn.q_a_proj",
109
+ "model.layers.12.self_attn.q_b_proj",
110
+ "model.layers.13.mlp.shared_experts*",
111
+ "model.layers.13.self_attn.kv_a_proj_with_mqa",
112
+ "model.layers.13.self_attn.kv_b_proj",
113
+ "model.layers.13.self_attn.q_a_proj",
114
+ "model.layers.13.self_attn.q_b_proj",
115
+ "model.layers.14.mlp.shared_experts*",
116
+ "model.layers.14.self_attn.kv_a_proj_with_mqa",
117
+ "model.layers.14.self_attn.kv_b_proj",
118
+ "model.layers.14.self_attn.q_a_proj",
119
+ "model.layers.14.self_attn.q_b_proj",
120
+ "model.layers.15.mlp.shared_experts*",
121
+ "model.layers.15.self_attn.kv_a_proj_with_mqa",
122
+ "model.layers.15.self_attn.kv_b_proj",
123
+ "model.layers.15.self_attn.q_a_proj",
124
+ "model.layers.15.self_attn.q_b_proj",
125
+ "model.layers.16.mlp.shared_experts*",
126
+ "model.layers.16.self_attn.kv_a_proj_with_mqa",
127
+ "model.layers.16.self_attn.kv_b_proj",
128
+ "model.layers.16.self_attn.q_a_proj",
129
+ "model.layers.16.self_attn.q_b_proj",
130
+ "model.layers.17.mlp.shared_experts*",
131
+ "model.layers.17.self_attn.kv_a_proj_with_mqa",
132
+ "model.layers.17.self_attn.kv_b_proj",
133
+ "model.layers.17.self_attn.q_a_proj",
134
+ "model.layers.17.self_attn.q_b_proj",
135
+ "model.layers.18.mlp.shared_experts*",
136
+ "model.layers.18.self_attn.kv_a_proj_with_mqa",
137
+ "model.layers.18.self_attn.kv_b_proj",
138
+ "model.layers.18.self_attn.q_a_proj",
139
+ "model.layers.18.self_attn.q_b_proj",
140
+ "model.layers.19.mlp.shared_experts*",
141
+ "model.layers.19.self_attn.kv_a_proj_with_mqa",
142
+ "model.layers.19.self_attn.kv_b_proj",
143
+ "model.layers.19.self_attn.q_a_proj",
144
+ "model.layers.19.self_attn.q_b_proj",
145
+ "model.layers.2.mlp.shared_experts*",
146
+ "model.layers.2.self_attn.kv_a_proj_with_mqa",
147
+ "model.layers.2.self_attn.kv_b_proj",
148
+ "model.layers.2.self_attn.q_a_proj",
149
+ "model.layers.2.self_attn.q_b_proj",
150
+ "model.layers.20.mlp.shared_experts*",
151
+ "model.layers.20.self_attn.kv_a_proj_with_mqa",
152
+ "model.layers.20.self_attn.kv_b_proj",
153
+ "model.layers.20.self_attn.q_a_proj",
154
+ "model.layers.20.self_attn.q_b_proj",
155
+ "model.layers.21.mlp.shared_experts*",
156
+ "model.layers.21.self_attn.kv_a_proj_with_mqa",
157
+ "model.layers.21.self_attn.kv_b_proj",
158
+ "model.layers.21.self_attn.q_a_proj",
159
+ "model.layers.21.self_attn.q_b_proj",
160
+ "model.layers.22.mlp.shared_experts*",
161
+ "model.layers.22.self_attn.kv_a_proj_with_mqa",
162
+ "model.layers.22.self_attn.kv_b_proj",
163
+ "model.layers.22.self_attn.q_a_proj",
164
+ "model.layers.22.self_attn.q_b_proj",
165
+ "model.layers.23.mlp.shared_experts*",
166
+ "model.layers.23.self_attn.kv_a_proj_with_mqa",
167
+ "model.layers.23.self_attn.kv_b_proj",
168
+ "model.layers.23.self_attn.q_a_proj",
169
+ "model.layers.23.self_attn.q_b_proj",
170
+ "model.layers.24.mlp.shared_experts*",
171
+ "model.layers.24.self_attn.kv_a_proj_with_mqa",
172
+ "model.layers.24.self_attn.kv_b_proj",
173
+ "model.layers.24.self_attn.q_a_proj",
174
+ "model.layers.24.self_attn.q_b_proj",
175
+ "model.layers.25.mlp.shared_experts*",
176
+ "model.layers.25.self_attn.kv_a_proj_with_mqa",
177
+ "model.layers.25.self_attn.kv_b_proj",
178
+ "model.layers.25.self_attn.q_a_proj",
179
+ "model.layers.25.self_attn.q_b_proj",
180
+ "model.layers.26.mlp.shared_experts*",
181
+ "model.layers.26.self_attn.kv_a_proj_with_mqa",
182
+ "model.layers.26.self_attn.kv_b_proj",
183
+ "model.layers.26.self_attn.q_a_proj",
184
+ "model.layers.26.self_attn.q_b_proj",
185
+ "model.layers.27.mlp.shared_experts*",
186
+ "model.layers.27.self_attn.kv_a_proj_with_mqa",
187
+ "model.layers.27.self_attn.kv_b_proj",
188
+ "model.layers.27.self_attn.q_a_proj",
189
+ "model.layers.27.self_attn.q_b_proj",
190
+ "model.layers.28.mlp.shared_experts*",
191
+ "model.layers.28.self_attn.kv_a_proj_with_mqa",
192
+ "model.layers.28.self_attn.kv_b_proj",
193
+ "model.layers.28.self_attn.q_a_proj",
194
+ "model.layers.28.self_attn.q_b_proj",
195
+ "model.layers.29.mlp.shared_experts*",
196
+ "model.layers.29.self_attn.kv_a_proj_with_mqa",
197
+ "model.layers.29.self_attn.kv_b_proj",
198
+ "model.layers.29.self_attn.q_a_proj",
199
+ "model.layers.29.self_attn.q_b_proj",
200
+ "model.layers.3.mlp.shared_experts*",
201
+ "model.layers.3.self_attn.kv_a_proj_with_mqa",
202
+ "model.layers.3.self_attn.kv_b_proj",
203
+ "model.layers.3.self_attn.q_a_proj",
204
+ "model.layers.3.self_attn.q_b_proj",
205
+ "model.layers.30.mlp.shared_experts*",
206
+ "model.layers.30.self_attn.kv_a_proj_with_mqa",
207
+ "model.layers.30.self_attn.kv_b_proj",
208
+ "model.layers.30.self_attn.q_a_proj",
209
+ "model.layers.30.self_attn.q_b_proj",
210
+ "model.layers.31.mlp.shared_experts*",
211
+ "model.layers.31.self_attn.kv_a_proj_with_mqa",
212
+ "model.layers.31.self_attn.kv_b_proj",
213
+ "model.layers.31.self_attn.q_a_proj",
214
+ "model.layers.31.self_attn.q_b_proj",
215
+ "model.layers.32.mlp.shared_experts*",
216
+ "model.layers.32.self_attn.kv_a_proj_with_mqa",
217
+ "model.layers.32.self_attn.kv_b_proj",
218
+ "model.layers.32.self_attn.q_a_proj",
219
+ "model.layers.32.self_attn.q_b_proj",
220
+ "model.layers.33.mlp.shared_experts*",
221
+ "model.layers.33.self_attn.kv_a_proj_with_mqa",
222
+ "model.layers.33.self_attn.kv_b_proj",
223
+ "model.layers.33.self_attn.q_a_proj",
224
+ "model.layers.33.self_attn.q_b_proj",
225
+ "model.layers.34.mlp.shared_experts*",
226
+ "model.layers.34.self_attn.kv_a_proj_with_mqa",
227
+ "model.layers.34.self_attn.kv_b_proj",
228
+ "model.layers.34.self_attn.q_a_proj",
229
+ "model.layers.34.self_attn.q_b_proj",
230
+ "model.layers.35.mlp.shared_experts*",
231
+ "model.layers.35.self_attn.kv_a_proj_with_mqa",
232
+ "model.layers.35.self_attn.kv_b_proj",
233
+ "model.layers.35.self_attn.q_a_proj",
234
+ "model.layers.35.self_attn.q_b_proj",
235
+ "model.layers.36.mlp.shared_experts*",
236
+ "model.layers.36.self_attn.kv_a_proj_with_mqa",
237
+ "model.layers.36.self_attn.kv_b_proj",
238
+ "model.layers.36.self_attn.q_a_proj",
239
+ "model.layers.36.self_attn.q_b_proj",
240
+ "model.layers.37.mlp.shared_experts*",
241
+ "model.layers.37.self_attn.kv_a_proj_with_mqa",
242
+ "model.layers.37.self_attn.kv_b_proj",
243
+ "model.layers.37.self_attn.q_a_proj",
244
+ "model.layers.37.self_attn.q_b_proj",
245
+ "model.layers.38.mlp.shared_experts*",
246
+ "model.layers.38.self_attn.kv_a_proj_with_mqa",
247
+ "model.layers.38.self_attn.kv_b_proj",
248
+ "model.layers.38.self_attn.q_a_proj",
249
+ "model.layers.38.self_attn.q_b_proj",
250
+ "model.layers.39.mlp.shared_experts*",
251
+ "model.layers.39.self_attn.kv_a_proj_with_mqa",
252
+ "model.layers.39.self_attn.kv_b_proj",
253
+ "model.layers.39.self_attn.q_a_proj",
254
+ "model.layers.39.self_attn.q_b_proj",
255
+ "model.layers.4.mlp.shared_experts*",
256
+ "model.layers.4.self_attn.kv_a_proj_with_mqa",
257
+ "model.layers.4.self_attn.kv_b_proj",
258
+ "model.layers.4.self_attn.q_a_proj",
259
+ "model.layers.4.self_attn.q_b_proj",
260
+ "model.layers.40.mlp.shared_experts*",
261
+ "model.layers.40.self_attn.kv_a_proj_with_mqa",
262
+ "model.layers.40.self_attn.kv_b_proj",
263
+ "model.layers.40.self_attn.q_a_proj",
264
+ "model.layers.40.self_attn.q_b_proj",
265
+ "model.layers.41.mlp.shared_experts*",
266
+ "model.layers.41.self_attn.kv_a_proj_with_mqa",
267
+ "model.layers.41.self_attn.kv_b_proj",
268
+ "model.layers.41.self_attn.q_a_proj",
269
+ "model.layers.41.self_attn.q_b_proj",
270
+ "model.layers.42.mlp.shared_experts*",
271
+ "model.layers.42.self_attn.kv_a_proj_with_mqa",
272
+ "model.layers.42.self_attn.kv_b_proj",
273
+ "model.layers.42.self_attn.q_a_proj",
274
+ "model.layers.42.self_attn.q_b_proj",
275
+ "model.layers.43.mlp.shared_experts*",
276
+ "model.layers.43.self_attn.kv_a_proj_with_mqa",
277
+ "model.layers.43.self_attn.kv_b_proj",
278
+ "model.layers.43.self_attn.q_a_proj",
279
+ "model.layers.43.self_attn.q_b_proj",
280
+ "model.layers.44.mlp.shared_experts*",
281
+ "model.layers.44.self_attn.kv_a_proj_with_mqa",
282
+ "model.layers.44.self_attn.kv_b_proj",
283
+ "model.layers.44.self_attn.q_a_proj",
284
+ "model.layers.44.self_attn.q_b_proj",
285
+ "model.layers.45.mlp.shared_experts*",
286
+ "model.layers.45.self_attn.kv_a_proj_with_mqa",
287
+ "model.layers.45.self_attn.kv_b_proj",
288
+ "model.layers.45.self_attn.q_a_proj",
289
+ "model.layers.45.self_attn.q_b_proj",
290
+ "model.layers.46.mlp.shared_experts*",
291
+ "model.layers.46.self_attn.kv_a_proj_with_mqa",
292
+ "model.layers.46.self_attn.kv_b_proj",
293
+ "model.layers.46.self_attn.q_a_proj",
294
+ "model.layers.46.self_attn.q_b_proj",
295
+ "model.layers.47.mlp.shared_experts*",
296
+ "model.layers.47.self_attn.kv_a_proj_with_mqa",
297
+ "model.layers.47.self_attn.kv_b_proj",
298
+ "model.layers.47.self_attn.q_a_proj",
299
+ "model.layers.47.self_attn.q_b_proj",
300
+ "model.layers.48.mlp.shared_experts*",
301
+ "model.layers.48.self_attn.kv_a_proj_with_mqa",
302
+ "model.layers.48.self_attn.kv_b_proj",
303
+ "model.layers.48.self_attn.q_a_proj",
304
+ "model.layers.48.self_attn.q_b_proj",
305
+ "model.layers.49.mlp.shared_experts*",
306
+ "model.layers.49.self_attn.kv_a_proj_with_mqa",
307
+ "model.layers.49.self_attn.kv_b_proj",
308
+ "model.layers.49.self_attn.q_a_proj",
309
+ "model.layers.49.self_attn.q_b_proj",
310
+ "model.layers.5.mlp.shared_experts*",
311
+ "model.layers.5.self_attn.kv_a_proj_with_mqa",
312
+ "model.layers.5.self_attn.kv_b_proj",
313
+ "model.layers.5.self_attn.q_a_proj",
314
+ "model.layers.5.self_attn.q_b_proj",
315
+ "model.layers.50.mlp.shared_experts*",
316
+ "model.layers.50.self_attn.kv_a_proj_with_mqa",
317
+ "model.layers.50.self_attn.kv_b_proj",
318
+ "model.layers.50.self_attn.q_a_proj",
319
+ "model.layers.50.self_attn.q_b_proj",
320
+ "model.layers.51.mlp.shared_experts*",
321
+ "model.layers.51.self_attn.kv_a_proj_with_mqa",
322
+ "model.layers.51.self_attn.kv_b_proj",
323
+ "model.layers.51.self_attn.q_a_proj",
324
+ "model.layers.51.self_attn.q_b_proj",
325
+ "model.layers.52.mlp.shared_experts*",
326
+ "model.layers.52.self_attn.kv_a_proj_with_mqa",
327
+ "model.layers.52.self_attn.kv_b_proj",
328
+ "model.layers.52.self_attn.q_a_proj",
329
+ "model.layers.52.self_attn.q_b_proj",
330
+ "model.layers.53.mlp.shared_experts*",
331
+ "model.layers.53.self_attn.kv_a_proj_with_mqa",
332
+ "model.layers.53.self_attn.kv_b_proj",
333
+ "model.layers.53.self_attn.q_a_proj",
334
+ "model.layers.53.self_attn.q_b_proj",
335
+ "model.layers.54.mlp.shared_experts*",
336
+ "model.layers.54.self_attn.kv_a_proj_with_mqa",
337
+ "model.layers.54.self_attn.kv_b_proj",
338
+ "model.layers.54.self_attn.q_a_proj",
339
+ "model.layers.54.self_attn.q_b_proj",
340
+ "model.layers.55.mlp.shared_experts*",
341
+ "model.layers.55.self_attn.kv_a_proj_with_mqa",
342
+ "model.layers.55.self_attn.kv_b_proj",
343
+ "model.layers.55.self_attn.q_a_proj",
344
+ "model.layers.55.self_attn.q_b_proj",
345
+ "model.layers.56.mlp.shared_experts*",
346
+ "model.layers.56.self_attn.kv_a_proj_with_mqa",
347
+ "model.layers.56.self_attn.kv_b_proj",
348
+ "model.layers.56.self_attn.q_a_proj",
349
+ "model.layers.56.self_attn.q_b_proj",
350
+ "model.layers.57.mlp.shared_experts*",
351
+ "model.layers.57.self_attn.kv_a_proj_with_mqa",
352
+ "model.layers.57.self_attn.kv_b_proj",
353
+ "model.layers.57.self_attn.q_a_proj",
354
+ "model.layers.57.self_attn.q_b_proj",
355
+ "model.layers.58.mlp.shared_experts*",
356
+ "model.layers.58.self_attn.kv_a_proj_with_mqa",
357
+ "model.layers.58.self_attn.kv_b_proj",
358
+ "model.layers.58.self_attn.q_a_proj",
359
+ "model.layers.58.self_attn.q_b_proj",
360
+ "model.layers.59.mlp.shared_experts*",
361
+ "model.layers.59.self_attn.kv_a_proj_with_mqa",
362
+ "model.layers.59.self_attn.kv_b_proj",
363
+ "model.layers.59.self_attn.q_a_proj",
364
+ "model.layers.59.self_attn.q_b_proj",
365
+ "model.layers.6.mlp.shared_experts*",
366
+ "model.layers.6.self_attn.kv_a_proj_with_mqa",
367
+ "model.layers.6.self_attn.kv_b_proj",
368
+ "model.layers.6.self_attn.q_a_proj",
369
+ "model.layers.6.self_attn.q_b_proj",
370
+ "model.layers.60.mlp*",
371
+ "model.layers.60.self_attn.kv_a_proj_with_mqa",
372
+ "model.layers.60.self_attn.kv_b_proj",
373
+ "model.layers.60.self_attn.q_a_proj",
374
+ "model.layers.60.self_attn.q_b_proj",
375
+ "model.layers.7.mlp.shared_experts*",
376
+ "model.layers.7.self_attn.kv_a_proj_with_mqa",
377
+ "model.layers.7.self_attn.kv_b_proj",
378
+ "model.layers.7.self_attn.q_a_proj",
379
+ "model.layers.7.self_attn.q_b_proj",
380
+ "model.layers.8.mlp.shared_experts*",
381
+ "model.layers.8.self_attn.kv_a_proj_with_mqa",
382
+ "model.layers.8.self_attn.kv_b_proj",
383
+ "model.layers.8.self_attn.q_a_proj",
384
+ "model.layers.8.self_attn.q_b_proj",
385
+ "model.layers.9.mlp.shared_experts*",
386
+ "model.layers.9.self_attn.kv_a_proj_with_mqa",
387
+ "model.layers.9.self_attn.kv_b_proj",
388
+ "model.layers.9.self_attn.q_a_proj",
389
+ "model.layers.9.self_attn.q_b_proj"
390
+ ],
391
+ "quant_algo": "NVFP4",
392
+ "producer": {
393
+ "name": "modelopt",
394
+ "version": "0.41.0rc2.dev63+g2a467531f.d20260203"
395
+ },
396
+ "quant_method": "modelopt"
397
+ }
398
+ }
configuration_deepseek.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__)
7
+
8
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
9
+ class DeepseekV3Config(PretrainedConfig):
10
+ r"""
11
+ This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
12
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
13
+ defaults will yield a similar configuration to that of the DeepSeek-V3.
14
+
15
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
16
+ documentation from [`PretrainedConfig`] for more information.
17
+
18
+
19
+ Args:
20
+ vocab_size (`int`, *optional*, defaults to 129280):
21
+ Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
22
+ `inputs_ids` passed when calling [`DeepseekV3Model`]
23
+ hidden_size (`int`, *optional*, defaults to 4096):
24
+ Dimension of the hidden representations.
25
+ intermediate_size (`int`, *optional*, defaults to 11008):
26
+ Dimension of the MLP representations.
27
+ moe_intermediate_size (`int`, *optional*, defaults to 1407):
28
+ Dimension of the MoE representations.
29
+ num_hidden_layers (`int`, *optional*, defaults to 32):
30
+ Number of hidden layers in the Transformer decoder.
31
+ num_nextn_predict_layers (`int`, *optional*, defaults to 1):
32
+ Number of nextn predict layers in the DeepSeekV3 Model.
33
+ num_attention_heads (`int`, *optional*, defaults to 32):
34
+ Number of attention heads for each attention layer in the Transformer decoder.
35
+ n_shared_experts (`int`, *optional*, defaults to None):
36
+ Number of shared experts, None means dense model.
37
+ n_routed_experts (`int`, *optional*, defaults to None):
38
+ Number of routed experts, None means dense model.
39
+ routed_scaling_factor (`float`, *optional*, defaults to 1.0):
40
+ Scaling factor for routed experts.
41
+ topk_method (`str`, *optional*, defaults to `greedy`):
42
+ Topk method used in routed gate.
43
+ n_group (`int`, *optional*, defaults to None):
44
+ Number of groups for routed experts.
45
+ topk_group (`int`, *optional*, defaults to None):
46
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
47
+ num_experts_per_tok (`int`, *optional*, defaults to None):
48
+ Number of selected experts, None means dense model.
49
+ moe_layer_freq (`int`, *optional*, defaults to 1):
50
+ The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
51
+ first_k_dense_replace (`int`, *optional*, defaults to 0):
52
+ Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
53
+ \--k dense layers--/
54
+ norm_topk_prob (`bool`, *optional*, defaults to False):
55
+ Whether to normalize the weights of the routed experts.
56
+ scoring_func (`str`, *optional*, defaults to 'softmax'):
57
+ Method of computing expert weights.
58
+ aux_loss_alpha (`float`, *optional*, defaults to 0.001):
59
+ Auxiliary loss weight coefficient.
60
+ seq_aux (`bool`, *optional*, defaults to True):
61
+ Whether to compute the auxiliary loss for each individual sample.
62
+ num_key_value_heads (`int`, *optional*):
63
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
64
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
65
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
66
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
67
+ by meanpooling all the original heads within that group. For more details checkout [this
68
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
69
+ `num_attention_heads`.
70
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
71
+ The non-linear activation function (function or string) in the decoder.
72
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
73
+ The maximum sequence length that this model might ever be used with.
74
+ initializer_range (`float`, *optional*, defaults to 0.02):
75
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
76
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
77
+ The epsilon used by the rms normalization layers.
78
+ use_cache (`bool`, *optional*, defaults to `True`):
79
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
80
+ relevant if `config.is_decoder=True`.
81
+ pad_token_id (`int`, *optional*):
82
+ Padding token id.
83
+ bos_token_id (`int`, *optional*, defaults to 1):
84
+ Beginning of stream token id.
85
+ eos_token_id (`int`, *optional*, defaults to 2):
86
+ End of stream token id.
87
+ pretraining_tp (`int`, *optional*, defaults to 1):
88
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
89
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
90
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
91
+ issue](https://github.com/pytorch/pytorch/issues/76232).
92
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
93
+ Whether to tie weight embeddings
94
+ rope_theta (`float`, *optional*, defaults to 10000.0):
95
+ The base period of the RoPE embeddings.
96
+ rope_scaling (`Dict`, *optional*):
97
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
98
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
99
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
100
+ `max_position_embeddings` to the expected new maximum.
101
+ attention_bias (`bool`, *optional*, defaults to `False`):
102
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
103
+ attention_dropout (`float`, *optional*, defaults to 0.0):
104
+ The dropout ratio for the attention probabilities.
105
+
106
+ ```python
107
+ >>> from transformers import DeepseekV3Model, DeepseekV3Config
108
+
109
+ >>> # Initializing a Deepseek-V3 style configuration
110
+ >>> configuration = DeepseekV3Config()
111
+
112
+ >>> # Accessing the model configuration
113
+ >>> configuration = model.config
114
+ ```"""
115
+
116
+ model_type = "deepseek_v3"
117
+ keys_to_ignore_at_inference = ["past_key_values"]
118
+
119
+ def __init__(
120
+ self,
121
+ vocab_size=129280,
122
+ hidden_size=7168,
123
+ intermediate_size=18432,
124
+ moe_intermediate_size = 2048,
125
+ num_hidden_layers=61,
126
+ num_nextn_predict_layers=1,
127
+ num_attention_heads=128,
128
+ num_key_value_heads=128,
129
+ n_shared_experts = 1,
130
+ n_routed_experts = 256,
131
+ ep_size = 1,
132
+ routed_scaling_factor = 2.5,
133
+ kv_lora_rank = 512,
134
+ q_lora_rank = 1536,
135
+ qk_rope_head_dim = 64,
136
+ v_head_dim = 128,
137
+ qk_nope_head_dim = 128,
138
+ topk_method = 'noaux_tc',
139
+ n_group = 8,
140
+ topk_group = 4,
141
+ num_experts_per_tok = 8,
142
+ moe_layer_freq = 1,
143
+ first_k_dense_replace = 3,
144
+ norm_topk_prob = True,
145
+ scoring_func = 'sigmoid',
146
+ aux_loss_alpha = 0.001,
147
+ seq_aux = True,
148
+ hidden_act="silu",
149
+ max_position_embeddings=4096,
150
+ initializer_range=0.02,
151
+ rms_norm_eps=1e-6,
152
+ use_cache=True,
153
+ pad_token_id=None,
154
+ bos_token_id=0,
155
+ eos_token_id=1,
156
+ pretraining_tp=1,
157
+ tie_word_embeddings=False,
158
+ rope_theta=10000.0,
159
+ rope_scaling=None,
160
+ attention_bias=False,
161
+ attention_dropout=0.0,
162
+ **kwargs,
163
+ ):
164
+ self.vocab_size = vocab_size
165
+ self.max_position_embeddings = max_position_embeddings
166
+ self.hidden_size = hidden_size
167
+ self.intermediate_size = intermediate_size
168
+ self.moe_intermediate_size = moe_intermediate_size
169
+ self.num_hidden_layers = num_hidden_layers
170
+ self.num_nextn_predict_layers = num_nextn_predict_layers
171
+ self.num_attention_heads = num_attention_heads
172
+ self.n_shared_experts = n_shared_experts
173
+ self.n_routed_experts = n_routed_experts
174
+ self.ep_size = ep_size
175
+ self.routed_scaling_factor = routed_scaling_factor
176
+ self.kv_lora_rank = kv_lora_rank
177
+ self.q_lora_rank = q_lora_rank
178
+ self.qk_rope_head_dim = qk_rope_head_dim
179
+ self.v_head_dim = v_head_dim
180
+ self.qk_nope_head_dim = qk_nope_head_dim
181
+ self.topk_method = topk_method
182
+ self.n_group = n_group
183
+ self.topk_group = topk_group
184
+ self.num_experts_per_tok = num_experts_per_tok
185
+ self.moe_layer_freq = moe_layer_freq
186
+ self.first_k_dense_replace = first_k_dense_replace
187
+ self.norm_topk_prob = norm_topk_prob
188
+ self.scoring_func = scoring_func
189
+ self.aux_loss_alpha = aux_loss_alpha
190
+ self.seq_aux = seq_aux
191
+ # for backward compatibility
192
+ if num_key_value_heads is None:
193
+ num_key_value_heads = num_attention_heads
194
+
195
+ self.num_key_value_heads = num_key_value_heads
196
+ self.hidden_act = hidden_act
197
+ self.initializer_range = initializer_range
198
+ self.rms_norm_eps = rms_norm_eps
199
+ self.pretraining_tp = pretraining_tp
200
+ self.use_cache = use_cache
201
+ self.rope_theta = rope_theta
202
+ self.rope_scaling = rope_scaling
203
+ self.attention_bias = attention_bias
204
+ self.attention_dropout = attention_dropout
205
+
206
+ super().__init__(
207
+ pad_token_id=pad_token_id,
208
+ bos_token_id=bos_token_id,
209
+ eos_token_id=eos_token_id,
210
+ tie_word_embeddings=tie_word_embeddings,
211
+ **kwargs,
212
+ )
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_length": 262144,
3
+ "eos_token_id": 163586
4
+ }
hf_quant_config.json ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "producer": {
3
+ "name": "modelopt",
4
+ "version": "0.41.0rc2.dev63+g2a467531f.d20260203"
5
+ },
6
+ "quantization": {
7
+ "quant_algo": "NVFP4",
8
+ "kv_cache_quant_algo": null,
9
+ "group_size": 16,
10
+ "exclude_modules": [
11
+ "lm_head",
12
+ "model.layers.0.mlp*",
13
+ "model.layers.0.self_attn.kv_a_proj_with_mqa",
14
+ "model.layers.0.self_attn.kv_b_proj",
15
+ "model.layers.0.self_attn.q_a_proj",
16
+ "model.layers.0.self_attn.q_b_proj",
17
+ "model.layers.1.mlp.shared_experts*",
18
+ "model.layers.1.self_attn.kv_a_proj_with_mqa",
19
+ "model.layers.1.self_attn.kv_b_proj",
20
+ "model.layers.1.self_attn.q_a_proj",
21
+ "model.layers.1.self_attn.q_b_proj",
22
+ "model.layers.10.mlp.shared_experts*",
23
+ "model.layers.10.self_attn.kv_a_proj_with_mqa",
24
+ "model.layers.10.self_attn.kv_b_proj",
25
+ "model.layers.10.self_attn.q_a_proj",
26
+ "model.layers.10.self_attn.q_b_proj",
27
+ "model.layers.11.mlp.shared_experts*",
28
+ "model.layers.11.self_attn.kv_a_proj_with_mqa",
29
+ "model.layers.11.self_attn.kv_b_proj",
30
+ "model.layers.11.self_attn.q_a_proj",
31
+ "model.layers.11.self_attn.q_b_proj",
32
+ "model.layers.12.mlp.shared_experts*",
33
+ "model.layers.12.self_attn.kv_a_proj_with_mqa",
34
+ "model.layers.12.self_attn.kv_b_proj",
35
+ "model.layers.12.self_attn.q_a_proj",
36
+ "model.layers.12.self_attn.q_b_proj",
37
+ "model.layers.13.mlp.shared_experts*",
38
+ "model.layers.13.self_attn.kv_a_proj_with_mqa",
39
+ "model.layers.13.self_attn.kv_b_proj",
40
+ "model.layers.13.self_attn.q_a_proj",
41
+ "model.layers.13.self_attn.q_b_proj",
42
+ "model.layers.14.mlp.shared_experts*",
43
+ "model.layers.14.self_attn.kv_a_proj_with_mqa",
44
+ "model.layers.14.self_attn.kv_b_proj",
45
+ "model.layers.14.self_attn.q_a_proj",
46
+ "model.layers.14.self_attn.q_b_proj",
47
+ "model.layers.15.mlp.shared_experts*",
48
+ "model.layers.15.self_attn.kv_a_proj_with_mqa",
49
+ "model.layers.15.self_attn.kv_b_proj",
50
+ "model.layers.15.self_attn.q_a_proj",
51
+ "model.layers.15.self_attn.q_b_proj",
52
+ "model.layers.16.mlp.shared_experts*",
53
+ "model.layers.16.self_attn.kv_a_proj_with_mqa",
54
+ "model.layers.16.self_attn.kv_b_proj",
55
+ "model.layers.16.self_attn.q_a_proj",
56
+ "model.layers.16.self_attn.q_b_proj",
57
+ "model.layers.17.mlp.shared_experts*",
58
+ "model.layers.17.self_attn.kv_a_proj_with_mqa",
59
+ "model.layers.17.self_attn.kv_b_proj",
60
+ "model.layers.17.self_attn.q_a_proj",
61
+ "model.layers.17.self_attn.q_b_proj",
62
+ "model.layers.18.mlp.shared_experts*",
63
+ "model.layers.18.self_attn.kv_a_proj_with_mqa",
64
+ "model.layers.18.self_attn.kv_b_proj",
65
+ "model.layers.18.self_attn.q_a_proj",
66
+ "model.layers.18.self_attn.q_b_proj",
67
+ "model.layers.19.mlp.shared_experts*",
68
+ "model.layers.19.self_attn.kv_a_proj_with_mqa",
69
+ "model.layers.19.self_attn.kv_b_proj",
70
+ "model.layers.19.self_attn.q_a_proj",
71
+ "model.layers.19.self_attn.q_b_proj",
72
+ "model.layers.2.mlp.shared_experts*",
73
+ "model.layers.2.self_attn.kv_a_proj_with_mqa",
74
+ "model.layers.2.self_attn.kv_b_proj",
75
+ "model.layers.2.self_attn.q_a_proj",
76
+ "model.layers.2.self_attn.q_b_proj",
77
+ "model.layers.20.mlp.shared_experts*",
78
+ "model.layers.20.self_attn.kv_a_proj_with_mqa",
79
+ "model.layers.20.self_attn.kv_b_proj",
80
+ "model.layers.20.self_attn.q_a_proj",
81
+ "model.layers.20.self_attn.q_b_proj",
82
+ "model.layers.21.mlp.shared_experts*",
83
+ "model.layers.21.self_attn.kv_a_proj_with_mqa",
84
+ "model.layers.21.self_attn.kv_b_proj",
85
+ "model.layers.21.self_attn.q_a_proj",
86
+ "model.layers.21.self_attn.q_b_proj",
87
+ "model.layers.22.mlp.shared_experts*",
88
+ "model.layers.22.self_attn.kv_a_proj_with_mqa",
89
+ "model.layers.22.self_attn.kv_b_proj",
90
+ "model.layers.22.self_attn.q_a_proj",
91
+ "model.layers.22.self_attn.q_b_proj",
92
+ "model.layers.23.mlp.shared_experts*",
93
+ "model.layers.23.self_attn.kv_a_proj_with_mqa",
94
+ "model.layers.23.self_attn.kv_b_proj",
95
+ "model.layers.23.self_attn.q_a_proj",
96
+ "model.layers.23.self_attn.q_b_proj",
97
+ "model.layers.24.mlp.shared_experts*",
98
+ "model.layers.24.self_attn.kv_a_proj_with_mqa",
99
+ "model.layers.24.self_attn.kv_b_proj",
100
+ "model.layers.24.self_attn.q_a_proj",
101
+ "model.layers.24.self_attn.q_b_proj",
102
+ "model.layers.25.mlp.shared_experts*",
103
+ "model.layers.25.self_attn.kv_a_proj_with_mqa",
104
+ "model.layers.25.self_attn.kv_b_proj",
105
+ "model.layers.25.self_attn.q_a_proj",
106
+ "model.layers.25.self_attn.q_b_proj",
107
+ "model.layers.26.mlp.shared_experts*",
108
+ "model.layers.26.self_attn.kv_a_proj_with_mqa",
109
+ "model.layers.26.self_attn.kv_b_proj",
110
+ "model.layers.26.self_attn.q_a_proj",
111
+ "model.layers.26.self_attn.q_b_proj",
112
+ "model.layers.27.mlp.shared_experts*",
113
+ "model.layers.27.self_attn.kv_a_proj_with_mqa",
114
+ "model.layers.27.self_attn.kv_b_proj",
115
+ "model.layers.27.self_attn.q_a_proj",
116
+ "model.layers.27.self_attn.q_b_proj",
117
+ "model.layers.28.mlp.shared_experts*",
118
+ "model.layers.28.self_attn.kv_a_proj_with_mqa",
119
+ "model.layers.28.self_attn.kv_b_proj",
120
+ "model.layers.28.self_attn.q_a_proj",
121
+ "model.layers.28.self_attn.q_b_proj",
122
+ "model.layers.29.mlp.shared_experts*",
123
+ "model.layers.29.self_attn.kv_a_proj_with_mqa",
124
+ "model.layers.29.self_attn.kv_b_proj",
125
+ "model.layers.29.self_attn.q_a_proj",
126
+ "model.layers.29.self_attn.q_b_proj",
127
+ "model.layers.3.mlp.shared_experts*",
128
+ "model.layers.3.self_attn.kv_a_proj_with_mqa",
129
+ "model.layers.3.self_attn.kv_b_proj",
130
+ "model.layers.3.self_attn.q_a_proj",
131
+ "model.layers.3.self_attn.q_b_proj",
132
+ "model.layers.30.mlp.shared_experts*",
133
+ "model.layers.30.self_attn.kv_a_proj_with_mqa",
134
+ "model.layers.30.self_attn.kv_b_proj",
135
+ "model.layers.30.self_attn.q_a_proj",
136
+ "model.layers.30.self_attn.q_b_proj",
137
+ "model.layers.31.mlp.shared_experts*",
138
+ "model.layers.31.self_attn.kv_a_proj_with_mqa",
139
+ "model.layers.31.self_attn.kv_b_proj",
140
+ "model.layers.31.self_attn.q_a_proj",
141
+ "model.layers.31.self_attn.q_b_proj",
142
+ "model.layers.32.mlp.shared_experts*",
143
+ "model.layers.32.self_attn.kv_a_proj_with_mqa",
144
+ "model.layers.32.self_attn.kv_b_proj",
145
+ "model.layers.32.self_attn.q_a_proj",
146
+ "model.layers.32.self_attn.q_b_proj",
147
+ "model.layers.33.mlp.shared_experts*",
148
+ "model.layers.33.self_attn.kv_a_proj_with_mqa",
149
+ "model.layers.33.self_attn.kv_b_proj",
150
+ "model.layers.33.self_attn.q_a_proj",
151
+ "model.layers.33.self_attn.q_b_proj",
152
+ "model.layers.34.mlp.shared_experts*",
153
+ "model.layers.34.self_attn.kv_a_proj_with_mqa",
154
+ "model.layers.34.self_attn.kv_b_proj",
155
+ "model.layers.34.self_attn.q_a_proj",
156
+ "model.layers.34.self_attn.q_b_proj",
157
+ "model.layers.35.mlp.shared_experts*",
158
+ "model.layers.35.self_attn.kv_a_proj_with_mqa",
159
+ "model.layers.35.self_attn.kv_b_proj",
160
+ "model.layers.35.self_attn.q_a_proj",
161
+ "model.layers.35.self_attn.q_b_proj",
162
+ "model.layers.36.mlp.shared_experts*",
163
+ "model.layers.36.self_attn.kv_a_proj_with_mqa",
164
+ "model.layers.36.self_attn.kv_b_proj",
165
+ "model.layers.36.self_attn.q_a_proj",
166
+ "model.layers.36.self_attn.q_b_proj",
167
+ "model.layers.37.mlp.shared_experts*",
168
+ "model.layers.37.self_attn.kv_a_proj_with_mqa",
169
+ "model.layers.37.self_attn.kv_b_proj",
170
+ "model.layers.37.self_attn.q_a_proj",
171
+ "model.layers.37.self_attn.q_b_proj",
172
+ "model.layers.38.mlp.shared_experts*",
173
+ "model.layers.38.self_attn.kv_a_proj_with_mqa",
174
+ "model.layers.38.self_attn.kv_b_proj",
175
+ "model.layers.38.self_attn.q_a_proj",
176
+ "model.layers.38.self_attn.q_b_proj",
177
+ "model.layers.39.mlp.shared_experts*",
178
+ "model.layers.39.self_attn.kv_a_proj_with_mqa",
179
+ "model.layers.39.self_attn.kv_b_proj",
180
+ "model.layers.39.self_attn.q_a_proj",
181
+ "model.layers.39.self_attn.q_b_proj",
182
+ "model.layers.4.mlp.shared_experts*",
183
+ "model.layers.4.self_attn.kv_a_proj_with_mqa",
184
+ "model.layers.4.self_attn.kv_b_proj",
185
+ "model.layers.4.self_attn.q_a_proj",
186
+ "model.layers.4.self_attn.q_b_proj",
187
+ "model.layers.40.mlp.shared_experts*",
188
+ "model.layers.40.self_attn.kv_a_proj_with_mqa",
189
+ "model.layers.40.self_attn.kv_b_proj",
190
+ "model.layers.40.self_attn.q_a_proj",
191
+ "model.layers.40.self_attn.q_b_proj",
192
+ "model.layers.41.mlp.shared_experts*",
193
+ "model.layers.41.self_attn.kv_a_proj_with_mqa",
194
+ "model.layers.41.self_attn.kv_b_proj",
195
+ "model.layers.41.self_attn.q_a_proj",
196
+ "model.layers.41.self_attn.q_b_proj",
197
+ "model.layers.42.mlp.shared_experts*",
198
+ "model.layers.42.self_attn.kv_a_proj_with_mqa",
199
+ "model.layers.42.self_attn.kv_b_proj",
200
+ "model.layers.42.self_attn.q_a_proj",
201
+ "model.layers.42.self_attn.q_b_proj",
202
+ "model.layers.43.mlp.shared_experts*",
203
+ "model.layers.43.self_attn.kv_a_proj_with_mqa",
204
+ "model.layers.43.self_attn.kv_b_proj",
205
+ "model.layers.43.self_attn.q_a_proj",
206
+ "model.layers.43.self_attn.q_b_proj",
207
+ "model.layers.44.mlp.shared_experts*",
208
+ "model.layers.44.self_attn.kv_a_proj_with_mqa",
209
+ "model.layers.44.self_attn.kv_b_proj",
210
+ "model.layers.44.self_attn.q_a_proj",
211
+ "model.layers.44.self_attn.q_b_proj",
212
+ "model.layers.45.mlp.shared_experts*",
213
+ "model.layers.45.self_attn.kv_a_proj_with_mqa",
214
+ "model.layers.45.self_attn.kv_b_proj",
215
+ "model.layers.45.self_attn.q_a_proj",
216
+ "model.layers.45.self_attn.q_b_proj",
217
+ "model.layers.46.mlp.shared_experts*",
218
+ "model.layers.46.self_attn.kv_a_proj_with_mqa",
219
+ "model.layers.46.self_attn.kv_b_proj",
220
+ "model.layers.46.self_attn.q_a_proj",
221
+ "model.layers.46.self_attn.q_b_proj",
222
+ "model.layers.47.mlp.shared_experts*",
223
+ "model.layers.47.self_attn.kv_a_proj_with_mqa",
224
+ "model.layers.47.self_attn.kv_b_proj",
225
+ "model.layers.47.self_attn.q_a_proj",
226
+ "model.layers.47.self_attn.q_b_proj",
227
+ "model.layers.48.mlp.shared_experts*",
228
+ "model.layers.48.self_attn.kv_a_proj_with_mqa",
229
+ "model.layers.48.self_attn.kv_b_proj",
230
+ "model.layers.48.self_attn.q_a_proj",
231
+ "model.layers.48.self_attn.q_b_proj",
232
+ "model.layers.49.mlp.shared_experts*",
233
+ "model.layers.49.self_attn.kv_a_proj_with_mqa",
234
+ "model.layers.49.self_attn.kv_b_proj",
235
+ "model.layers.49.self_attn.q_a_proj",
236
+ "model.layers.49.self_attn.q_b_proj",
237
+ "model.layers.5.mlp.shared_experts*",
238
+ "model.layers.5.self_attn.kv_a_proj_with_mqa",
239
+ "model.layers.5.self_attn.kv_b_proj",
240
+ "model.layers.5.self_attn.q_a_proj",
241
+ "model.layers.5.self_attn.q_b_proj",
242
+ "model.layers.50.mlp.shared_experts*",
243
+ "model.layers.50.self_attn.kv_a_proj_with_mqa",
244
+ "model.layers.50.self_attn.kv_b_proj",
245
+ "model.layers.50.self_attn.q_a_proj",
246
+ "model.layers.50.self_attn.q_b_proj",
247
+ "model.layers.51.mlp.shared_experts*",
248
+ "model.layers.51.self_attn.kv_a_proj_with_mqa",
249
+ "model.layers.51.self_attn.kv_b_proj",
250
+ "model.layers.51.self_attn.q_a_proj",
251
+ "model.layers.51.self_attn.q_b_proj",
252
+ "model.layers.52.mlp.shared_experts*",
253
+ "model.layers.52.self_attn.kv_a_proj_with_mqa",
254
+ "model.layers.52.self_attn.kv_b_proj",
255
+ "model.layers.52.self_attn.q_a_proj",
256
+ "model.layers.52.self_attn.q_b_proj",
257
+ "model.layers.53.mlp.shared_experts*",
258
+ "model.layers.53.self_attn.kv_a_proj_with_mqa",
259
+ "model.layers.53.self_attn.kv_b_proj",
260
+ "model.layers.53.self_attn.q_a_proj",
261
+ "model.layers.53.self_attn.q_b_proj",
262
+ "model.layers.54.mlp.shared_experts*",
263
+ "model.layers.54.self_attn.kv_a_proj_with_mqa",
264
+ "model.layers.54.self_attn.kv_b_proj",
265
+ "model.layers.54.self_attn.q_a_proj",
266
+ "model.layers.54.self_attn.q_b_proj",
267
+ "model.layers.55.mlp.shared_experts*",
268
+ "model.layers.55.self_attn.kv_a_proj_with_mqa",
269
+ "model.layers.55.self_attn.kv_b_proj",
270
+ "model.layers.55.self_attn.q_a_proj",
271
+ "model.layers.55.self_attn.q_b_proj",
272
+ "model.layers.56.mlp.shared_experts*",
273
+ "model.layers.56.self_attn.kv_a_proj_with_mqa",
274
+ "model.layers.56.self_attn.kv_b_proj",
275
+ "model.layers.56.self_attn.q_a_proj",
276
+ "model.layers.56.self_attn.q_b_proj",
277
+ "model.layers.57.mlp.shared_experts*",
278
+ "model.layers.57.self_attn.kv_a_proj_with_mqa",
279
+ "model.layers.57.self_attn.kv_b_proj",
280
+ "model.layers.57.self_attn.q_a_proj",
281
+ "model.layers.57.self_attn.q_b_proj",
282
+ "model.layers.58.mlp.shared_experts*",
283
+ "model.layers.58.self_attn.kv_a_proj_with_mqa",
284
+ "model.layers.58.self_attn.kv_b_proj",
285
+ "model.layers.58.self_attn.q_a_proj",
286
+ "model.layers.58.self_attn.q_b_proj",
287
+ "model.layers.59.mlp.shared_experts*",
288
+ "model.layers.59.self_attn.kv_a_proj_with_mqa",
289
+ "model.layers.59.self_attn.kv_b_proj",
290
+ "model.layers.59.self_attn.q_a_proj",
291
+ "model.layers.59.self_attn.q_b_proj",
292
+ "model.layers.6.mlp.shared_experts*",
293
+ "model.layers.6.self_attn.kv_a_proj_with_mqa",
294
+ "model.layers.6.self_attn.kv_b_proj",
295
+ "model.layers.6.self_attn.q_a_proj",
296
+ "model.layers.6.self_attn.q_b_proj",
297
+ "model.layers.60.mlp*",
298
+ "model.layers.60.self_attn.kv_a_proj_with_mqa",
299
+ "model.layers.60.self_attn.kv_b_proj",
300
+ "model.layers.60.self_attn.q_a_proj",
301
+ "model.layers.60.self_attn.q_b_proj",
302
+ "model.layers.7.mlp.shared_experts*",
303
+ "model.layers.7.self_attn.kv_a_proj_with_mqa",
304
+ "model.layers.7.self_attn.kv_b_proj",
305
+ "model.layers.7.self_attn.q_a_proj",
306
+ "model.layers.7.self_attn.q_b_proj",
307
+ "model.layers.8.mlp.shared_experts*",
308
+ "model.layers.8.self_attn.kv_a_proj_with_mqa",
309
+ "model.layers.8.self_attn.kv_b_proj",
310
+ "model.layers.8.self_attn.q_a_proj",
311
+ "model.layers.8.self_attn.q_b_proj",
312
+ "model.layers.9.mlp.shared_experts*",
313
+ "model.layers.9.self_attn.kv_a_proj_with_mqa",
314
+ "model.layers.9.self_attn.kv_b_proj",
315
+ "model.layers.9.self_attn.q_a_proj",
316
+ "model.layers.9.self_attn.q_b_proj"
317
+ ]
318
+ }
319
+ }
model-00001-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:701f6a0b2f455a562af6e107da275359e4f270a7a23658d22aed701be28d3c8c
3
+ size 4995811616
model-00002-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91bb60b9868d48fd7aaa1f6adc8af55fef243e0573f7439f0923596781b090d8
3
+ size 4996099800
model-00003-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7672515158e53a0f23f0281a2c8e8b69a31b849ed99cfc5ae176930d02a5323
3
+ size 5000185776
model-00004-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bd025bc9f0c1950d2e096d9271f39902de74306a6244f0ef9a40caf0c3542a3
3
+ size 4997017792
model-00005-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e066cfc215e066514a795f8c2dacb33ef1a8dc122e240adaad6a8fffcc4f8800
3
+ size 5000185648
model-00006-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:943030ed1f1e5fb864659707f89e8fe70f7e57feda8ff62ff83285ca62338856
3
+ size 4997017928
model-00007-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cd1839399ac4b4812eeabf7264d7c7c6671555516c39acf0e4240e918b6634e
3
+ size 5000185512
model-00008-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93920aaa7280401685c148240ad47804e3a1e0831c1889a3f98d65c76d31bf87
3
+ size 4997018056
model-00009-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80b17be3f9a444f4f07a9e5151da4d788bf094f9acc74bc6dedd72f7ec8a2f9
3
+ size 5000185488
model-00010-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69c37caa48f15073749d48a2648b2900689b371637deb1b16c9ecce8e0d782ec
3
+ size 4997018080
model-00011-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9b5ea94018b4933795c06bc8270313003fbd2e9df045f74d994f9ae026629d
3
+ size 5000185488
model-00012-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e3a5b57614c7abe990c9d4a012e968754f3c42e055c9aa4478342b7022fb8eb
3
+ size 4997018080
model-00013-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fee4d67b48fb59ac5b3cad150b8430086b0afc4dccca996d454ed60e8e422553
3
+ size 5000185488
model-00014-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b45a33f750d79d09a8d05b1d9cba3ee7a6d1033a84cea4ad44da7c68b26f93
3
+ size 4997018080
model-00015-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58c313dc9741527a93411066f2b1813d99b7010483baf4bde33f41dadb5df316
3
+ size 5000185488
model-00016-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d23ff9770afeed1ec329e740b8a0892bab68384d0c35d9e7e02ec7e864e61591
3
+ size 4997018080
model-00017-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41856796d1c52aba359e7c2338a26ec0b76ed26c5164789d28fe1b37d25259ed
3
+ size 5000185488
model-00018-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04ec05203b7b1abf8e03ba6e93778bbf0d18387507f39890d81da522b3b4218e
3
+ size 4997018080
model-00019-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71340dbad8f627bc2f9ffed8298acde0c77337885d468ec94e875c2ac3994367
3
+ size 5000187472
model-00020-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:849fb04b2fd9d7f286cce5ff03ee646be069f5dfdd24a108ffd844c3ae88e63e
3
+ size 4997020504
model-00021-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf168c5407fa2a1d4353f777eb4c52a29881e772ead2ed7d60e0f5ad9f960d4b
3
+ size 5000187824
model-00022-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19eacc88762a3fcec1100b36c0fb1355b0af1315f73ec099544a16a13ec10d0b
3
+ size 4997020504
model-00023-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add1dea0ba0c8b46a050416a02b6617ff648b05fde13d5da62af69294fbaf261
3
+ size 5000187824
model-00024-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9c4f7793dbb75499ba1fa0814ab23b8f866564fdf9426cf54f5e0c768579957
3
+ size 4991540144
model-00025-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:174972d6a09ce930a044cf654114893c3877783a5947ec7c1c0428e6dbbaf759
3
+ size 4998328208
model-00026-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:090683e616106f9e3998a711877af28e57474eb98bf9ea55b0ebb1f904d8d0fa
3
+ size 5000189000
model-00027-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5d465c31be51b189e1a8e828d3e616308b912db83a5344549752a0f61f0ced4
3
+ size 4997019328
model-00028-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1989e804121fccbd961e85fdb9a16c549a61672a46271a4b3642f0fda48710da
3
+ size 5000188824
model-00029-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58fa25c270601e5a4f874d0b0577538b93cd93fa449f2bb8d4eb106dd71497db
3
+ size 4997019512
model-00030-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecf6d8ecd39d5eeff8d54aa37602b32329c0a38333bb69930afa1e3506e9bb9a
3
+ size 5000188688
model-00031-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a1ebd7423289b81033b79444841a045ce1d186267ce69982ee78f485ee2bc1c
3
+ size 4997019640
model-00032-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1879efa5ab69876bafe023b8edc6ba67810a612137d247173b31441413ea844
3
+ size 5000188560
model-00033-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04452e463917eea39202b50ecba189f0426af3860896cdebb0ab41c1b71a7205
3
+ size 4997019776
model-00034-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecab83d3559e5a9450bf1eed41e362df4192f7b7118a89f9ac42e59197e1ee4f
3
+ size 5000188424
model-00035-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27c95af63d38683b5a002db5fdb31a5a9867cdcdc4dd8dc2312d24a8f03e8f60
3
+ size 4997019904
model-00036-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0fa682bdb39418d80c8dc20e280ccfdb135ccaccae3284293a9cfc61b3a7577
3
+ size 5000188296
model-00037-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d751412dd052c9096a357e33ccdb53863cc5c1e9f1ac663082b1d67b6c09bd0f
3
+ size 4997020040
model-00038-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30e325e2568a9624ca0e021daf5e00649401f05be3ebc5b8f14aa25681211c3a
3
+ size 5000188160
model-00039-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79cc2cc4e5dd2d9c6e7a3133b7e5591fbba3a67ad55b2d9f81876321819c73f
3
+ size 4997020168
model-00040-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec85e67beb8604011de19a9117d26cec438034896878ca480d1252fcf3cc6ad5
3
+ size 5000188032
model-00041-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e79e6f4f11d77edc36f1f571a603ec4429087e511a83ca3737fbb936cea0d73b
3
+ size 4997020304
model-00042-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29e8be30ee64cb172bb7262fe73da38bbd64618d006a2750142ccd8f189e91f4
3
+ size 5000187896
model-00043-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4e7f9ebc61351c63735e8e8dd970060712a61cf503d3c049946363dfc0c3541
3
+ size 4997020432
model-00044-of-00123.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:346ded48737b311e99588d8596f316bf3fe5a98ad295ff666cc1c60df6e6e60a
3
+ size 5000187824