aspctu committed on
Commit
bfdcc64
·
verified ·
1 Parent(s): 55f3e55

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +208 -246
config.json CHANGED
@@ -1,252 +1,214 @@
1
  {
2
- "architectures": [
3
- "KimiK25ForConditionalGeneration"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_kimi_k25.KimiK25Config",
7
- "AutoModel": "modeling_kimi_k25.KimiK25ForConditionalGeneration",
8
- "AutoModelForCausalLM": "modeling_kimi_k25.KimiK25ForConditionalGeneration"
9
- },
10
- "bos_token_id": 163584,
11
- "dtype": "bfloat16",
12
- "eos_token_id": 163585,
13
- "ignore_index": -100,
14
- "media_placeholder_token_id": 163605,
15
- "model_type": "kimi_k25",
16
- "pad_token_id": 163839,
17
- "text_config": {
18
- "_name_or_path": "",
19
- "add_cross_attention": false,
20
- "architectures": [
21
- "DeepseekV3ForCausalLM"
22
- ],
23
- "attention_bias": false,
24
- "attention_dropout": 0.0,
25
- "auto_map": {
26
- "AutoConfig": "configuration_deepseek.DeepseekV3Config",
27
- "AutoModel": "modeling_deepseek.DeepseekV3Model",
28
- "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
29
- },
30
- "aux_loss_alpha": 0.001,
31
- "bad_words_ids": null,
32
- "begin_suppress_tokens": null,
33
- "bos_token_id": 163584,
34
- "chunk_size_feed_forward": 0,
35
- "cross_attention_hidden_size": null,
36
- "decoder_start_token_id": null,
37
- "diversity_penalty": 0.0,
38
- "do_sample": false,
39
- "dtype": "bfloat16",
40
- "early_stopping": false,
41
- "encoder_no_repeat_ngram_size": 0,
42
- "eos_token_id": 163585,
43
- "ep_size": 1,
44
- "exponential_decay_length_penalty": null,
45
- "finetuning_task": null,
46
- "first_k_dense_replace": 1,
47
- "forced_bos_token_id": null,
48
- "forced_eos_token_id": null,
49
- "hidden_act": "silu",
50
- "hidden_size": 7168,
51
- "id2label": {
52
- "0": "LABEL_0",
53
- "1": "LABEL_1"
54
- },
55
- "initializer_range": 0.02,
56
- "intermediate_size": 18432,
57
- "is_decoder": false,
58
- "is_encoder_decoder": false,
59
- "kv_lora_rank": 512,
60
- "label2id": {
61
- "LABEL_0": 0,
62
- "LABEL_1": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  },
64
- "length_penalty": 1.0,
65
- "max_length": 20,
66
- "max_position_embeddings": 262144,
67
- "min_length": 0,
68
- "model_type": "deepseek_v3",
69
- "moe_intermediate_size": 2048,
70
- "moe_layer_freq": 1,
71
- "n_group": 1,
72
- "n_routed_experts": 384,
73
- "n_shared_experts": 1,
74
- "no_repeat_ngram_size": 0,
75
- "norm_topk_prob": true,
76
- "num_attention_heads": 64,
77
- "num_beam_groups": 1,
78
- "num_beams": 1,
79
- "num_experts_per_tok": 8,
80
- "num_hidden_layers": 61,
81
- "num_key_value_heads": 64,
82
- "num_nextn_predict_layers": 0,
83
- "num_return_sequences": 1,
84
- "output_attentions": false,
85
- "output_hidden_states": false,
86
- "output_scores": false,
87
- "pad_token_id": 163839,
88
- "prefix": null,
89
- "pretraining_tp": 1,
90
- "problem_type": null,
91
- "pruned_heads": {},
92
- "q_lora_rank": 1536,
93
- "qk_nope_head_dim": 128,
94
- "qk_rope_head_dim": 64,
95
- "remove_invalid_values": false,
96
- "repetition_penalty": 1.0,
97
- "return_dict": true,
98
- "return_dict_in_generate": false,
99
- "rms_norm_eps": 1e-05,
100
- "rope_scaling": {
101
- "beta_fast": 32.0,
102
- "beta_slow": 1.0,
103
- "factor": 64.0,
104
- "mscale": 1.0,
105
- "mscale_all_dim": 1.0,
106
- "original_max_position_embeddings": 4096,
107
- "type": "yarn"
108
  },
109
- "rope_theta": 50000.0,
110
- "routed_scaling_factor": 2.827,
111
- "scoring_func": "sigmoid",
112
- "sep_token_id": null,
113
- "seq_aux": true,
114
- "suppress_tokens": null,
115
- "task_specific_params": null,
116
- "temperature": 1.0,
117
- "tf_legacy_loss": false,
118
- "tie_encoder_decoder": false,
119
- "tie_word_embeddings": false,
120
- "tokenizer_class": null,
121
- "top_k": 50,
122
- "top_p": 1.0,
123
- "topk_group": 1,
124
- "topk_method": "noaux_tc",
125
- "torchscript": false,
126
- "typical_p": 1.0,
127
- "use_bfloat16": false,
128
- "use_cache": true,
129
- "v_head_dim": 128,
130
- "vocab_size": 163840
131
  },
132
- "tie_word_embeddings": false,
133
- "transformers_version": "4.57.1",
134
- "use_unified_vision_chunk": true,
135
- "video_placeholder": "<|kimi_k25_video_placeholder|>",
136
- "vision_config": {
137
- "init_pos_emb_height": 64,
138
- "init_pos_emb_time": 4,
139
- "init_pos_emb_width": 64,
140
- "merge_kernel_size": [
141
- 2,
142
- 2
143
- ],
144
- "merge_type": "sd2_tpool",
145
- "mm_hidden_size": 1152,
146
- "mm_projector_type": "patchmerger",
147
- "model_type": "",
148
- "patch_size": 14,
149
- "pos_emb_type": "divided_fixed",
150
- "projector_hidden_act": "gelu",
151
- "projector_ln_eps": 1e-05,
152
- "text_hidden_size": 7168,
153
- "video_attn_type": "spatial_temporal",
154
- "vt_hidden_size": 1152,
155
- "vt_intermediate_size": 4304,
156
- "vt_num_attention_heads": 16,
157
- "vt_num_hidden_layers": 27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  },
159
- "quantization_config": {
160
- "config_groups": {
161
- "group_0": {
162
- "input_activations": {
163
- "dynamic": false,
164
- "num_bits": 4,
165
- "type": "float",
166
- "group_size": 16
167
- },
168
- "weights": {
169
- "dynamic": false,
170
- "num_bits": 4,
171
- "type": "float",
172
- "group_size": 16
173
- },
174
- "targets": [
175
- "Linear"
176
- ]
177
- }
178
- },
179
- "ignore": [
180
- "language_model.lm_head",
181
- "language_model.model.layers.0.self_attn*",
182
- "language_model.model.layers.1.self_attn*",
183
- "language_model.model.layers.10.self_attn*",
184
- "language_model.model.layers.11.self_attn*",
185
- "language_model.model.layers.12.self_attn*",
186
- "language_model.model.layers.13.self_attn*",
187
- "language_model.model.layers.14.self_attn*",
188
- "language_model.model.layers.15.self_attn*",
189
- "language_model.model.layers.16.self_attn*",
190
- "language_model.model.layers.17.self_attn*",
191
- "language_model.model.layers.18.self_attn*",
192
- "language_model.model.layers.19.self_attn*",
193
- "language_model.model.layers.2.self_attn*",
194
- "language_model.model.layers.20.self_attn*",
195
- "language_model.model.layers.21.self_attn*",
196
- "language_model.model.layers.22.self_attn*",
197
- "language_model.model.layers.23.self_attn*",
198
- "language_model.model.layers.24.self_attn*",
199
- "language_model.model.layers.25.self_attn*",
200
- "language_model.model.layers.26.self_attn*",
201
- "language_model.model.layers.27.self_attn*",
202
- "language_model.model.layers.28.self_attn*",
203
- "language_model.model.layers.29.self_attn*",
204
- "language_model.model.layers.3.self_attn*",
205
- "language_model.model.layers.30.self_attn*",
206
- "language_model.model.layers.31.self_attn*",
207
- "language_model.model.layers.32.self_attn*",
208
- "language_model.model.layers.33.self_attn*",
209
- "language_model.model.layers.34.self_attn*",
210
- "language_model.model.layers.35.self_attn*",
211
- "language_model.model.layers.36.self_attn*",
212
- "language_model.model.layers.37.self_attn*",
213
- "language_model.model.layers.38.self_attn*",
214
- "language_model.model.layers.39.self_attn*",
215
- "language_model.model.layers.4.self_attn*",
216
- "language_model.model.layers.40.self_attn*",
217
- "language_model.model.layers.41.self_attn*",
218
- "language_model.model.layers.42.self_attn*",
219
- "language_model.model.layers.43.self_attn*",
220
- "language_model.model.layers.44.self_attn*",
221
- "language_model.model.layers.45.self_attn*",
222
- "language_model.model.layers.46.self_attn*",
223
- "language_model.model.layers.47.self_attn*",
224
- "language_model.model.layers.48.self_attn*",
225
- "language_model.model.layers.49.self_attn*",
226
- "language_model.model.layers.5.self_attn*",
227
- "language_model.model.layers.50.self_attn*",
228
- "language_model.model.layers.51.self_attn*",
229
- "language_model.model.layers.52.self_attn*",
230
- "language_model.model.layers.53.self_attn*",
231
- "language_model.model.layers.54.self_attn*",
232
- "language_model.model.layers.55.self_attn*",
233
- "language_model.model.layers.56.self_attn*",
234
- "language_model.model.layers.57.self_attn*",
235
- "language_model.model.layers.58.self_attn*",
236
- "language_model.model.layers.59.self_attn*",
237
- "language_model.model.layers.6.self_attn*",
238
- "language_model.model.layers.60.self_attn*",
239
- "language_model.model.layers.7.self_attn*",
240
- "language_model.model.layers.8.self_attn*",
241
- "language_model.model.layers.9.self_attn*",
242
- "mm_projector*",
243
- "vision_tower*"
244
- ],
245
- "quant_algo": "NVFP4",
246
- "producer": {
247
- "name": "modelopt",
248
- "version": "0.41.0rc2.dev54+g99912fbdf"
249
- },
250
- "quant_method": "modelopt"
251
- }
252
  }
 
1
  {
2
+ "_name_or_path": "",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "DeepseekV3ForCausalLM"
6
+ ],
7
+ "attention_bias": false,
8
+ "attention_dropout": 0,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_deepseek.DeepseekV3Config",
11
+ "AutoModel": "modeling_deepseek.DeepseekV3Model",
12
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
13
+ },
14
+ "aux_loss_alpha": 0.001,
15
+ "bad_words_ids": null,
16
+ "begin_suppress_tokens": null,
17
+ "bos_token_id": 163584,
18
+ "chunk_size_feed_forward": 0,
19
+ "cross_attention_hidden_size": null,
20
+ "decoder_start_token_id": null,
21
+ "diversity_penalty": 0,
22
+ "do_sample": false,
23
+ "dtype": "bfloat16",
24
+ "early_stopping": false,
25
+ "encoder_no_repeat_ngram_size": 0,
26
+ "eos_token_id": 163585,
27
+ "ep_size": 1,
28
+ "exponential_decay_length_penalty": null,
29
+ "finetuning_task": null,
30
+ "first_k_dense_replace": 1,
31
+ "forced_bos_token_id": null,
32
+ "forced_eos_token_id": null,
33
+ "hidden_act": "silu",
34
+ "hidden_size": 7168,
35
+ "id2label": {
36
+ "0": "LABEL_0",
37
+ "1": "LABEL_1"
38
+ },
39
+ "initializer_range": 0.02,
40
+ "intermediate_size": 18432,
41
+ "is_decoder": false,
42
+ "is_encoder_decoder": false,
43
+ "kv_lora_rank": 512,
44
+ "label2id": {
45
+ "LABEL_0": 0,
46
+ "LABEL_1": 1
47
+ },
48
+ "length_penalty": 1,
49
+ "max_length": 20,
50
+ "max_position_embeddings": 262144,
51
+ "min_length": 0,
52
+ "model_type": "deepseek_v3",
53
+ "moe_intermediate_size": 2048,
54
+ "moe_layer_freq": 1,
55
+ "n_group": 1,
56
+ "n_routed_experts": 384,
57
+ "n_shared_experts": 1,
58
+ "no_repeat_ngram_size": 0,
59
+ "norm_topk_prob": true,
60
+ "num_attention_heads": 64,
61
+ "num_beam_groups": 1,
62
+ "num_beams": 1,
63
+ "num_experts_per_tok": 8,
64
+ "num_hidden_layers": 61,
65
+ "num_key_value_heads": 64,
66
+ "num_nextn_predict_layers": 0,
67
+ "num_return_sequences": 1,
68
+ "output_attentions": false,
69
+ "output_hidden_states": false,
70
+ "output_scores": false,
71
+ "pad_token_id": 163839,
72
+ "prefix": null,
73
+ "pretraining_tp": 1,
74
+ "problem_type": null,
75
+ "pruned_heads": {},
76
+ "q_lora_rank": 1536,
77
+ "qk_nope_head_dim": 128,
78
+ "qk_rope_head_dim": 64,
79
+ "quantization_config": {
80
+ "config_groups": {
81
+ "group_0": {
82
+ "input_activations": {
83
+ "dynamic": false,
84
+ "num_bits": 4,
85
+ "type": "float",
86
+ "group_size": 16
87
  },
88
+ "weights": {
89
+ "dynamic": false,
90
+ "num_bits": 4,
91
+ "type": "float",
92
+ "group_size": 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  },
94
+ "targets": [
95
+ "Linear"
96
+ ]
97
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  },
99
+ "ignore": [
100
+ "lm_head",
101
+ "model.layers.0.self_attn*",
102
+ "model.layers.1.self_attn*",
103
+ "model.layers.10.self_attn*",
104
+ "model.layers.11.self_attn*",
105
+ "model.layers.12.self_attn*",
106
+ "model.layers.13.self_attn*",
107
+ "model.layers.14.self_attn*",
108
+ "model.layers.15.self_attn*",
109
+ "model.layers.16.self_attn*",
110
+ "model.layers.17.self_attn*",
111
+ "model.layers.18.self_attn*",
112
+ "model.layers.19.self_attn*",
113
+ "model.layers.2.self_attn*",
114
+ "model.layers.20.self_attn*",
115
+ "model.layers.21.self_attn*",
116
+ "model.layers.22.self_attn*",
117
+ "model.layers.23.self_attn*",
118
+ "model.layers.24.self_attn*",
119
+ "model.layers.25.self_attn*",
120
+ "model.layers.26.self_attn*",
121
+ "model.layers.27.self_attn*",
122
+ "model.layers.28.self_attn*",
123
+ "model.layers.29.self_attn*",
124
+ "model.layers.3.self_attn*",
125
+ "model.layers.30.self_attn*",
126
+ "model.layers.31.self_attn*",
127
+ "model.layers.32.self_attn*",
128
+ "model.layers.33.self_attn*",
129
+ "model.layers.34.self_attn*",
130
+ "model.layers.35.self_attn*",
131
+ "model.layers.36.self_attn*",
132
+ "model.layers.37.self_attn*",
133
+ "model.layers.38.self_attn*",
134
+ "model.layers.39.self_attn*",
135
+ "model.layers.4.self_attn*",
136
+ "model.layers.40.self_attn*",
137
+ "model.layers.41.self_attn*",
138
+ "model.layers.42.self_attn*",
139
+ "model.layers.43.self_attn*",
140
+ "model.layers.44.self_attn*",
141
+ "model.layers.45.self_attn*",
142
+ "model.layers.46.self_attn*",
143
+ "model.layers.47.self_attn*",
144
+ "model.layers.48.self_attn*",
145
+ "model.layers.49.self_attn*",
146
+ "model.layers.5.self_attn*",
147
+ "model.layers.50.self_attn*",
148
+ "model.layers.51.self_attn*",
149
+ "model.layers.52.self_attn*",
150
+ "model.layers.53.self_attn*",
151
+ "model.layers.54.self_attn*",
152
+ "model.layers.55.self_attn*",
153
+ "model.layers.56.self_attn*",
154
+ "model.layers.57.self_attn*",
155
+ "model.layers.58.self_attn*",
156
+ "model.layers.59.self_attn*",
157
+ "model.layers.6.self_attn*",
158
+ "model.layers.60.self_attn*",
159
+ "model.layers.7.self_attn*",
160
+ "model.layers.8.self_attn*",
161
+ "model.layers.9.self_attn*",
162
+ "mm_projector*",
163
+ "vision_tower*"
164
+ ],
165
+ "quant_algo": "NVFP4",
166
+ "kv_cache_scheme": {
167
+ "dynamic": false,
168
+ "num_bits": 8,
169
+ "type": "float"
170
  },
171
+ "producer": {
172
+ "name": "modelopt",
173
+ "version": "0.41.0rc2.dev54+g99912fbdf"
174
+ },
175
+ "quant_method": "modelopt"
176
+ },
177
+ "remove_invalid_values": false,
178
+ "repetition_penalty": 1,
179
+ "return_dict": true,
180
+ "return_dict_in_generate": false,
181
+ "rms_norm_eps": 0.00001,
182
+ "rope_scaling": {
183
+ "beta_fast": 32,
184
+ "beta_slow": 1,
185
+ "factor": 64,
186
+ "mscale": 1,
187
+ "mscale_all_dim": 1,
188
+ "original_max_position_embeddings": 4096,
189
+ "type": "yarn"
190
+ },
191
+ "rope_theta": 50000,
192
+ "routed_scaling_factor": 2.827,
193
+ "scoring_func": "sigmoid",
194
+ "sep_token_id": null,
195
+ "seq_aux": true,
196
+ "suppress_tokens": null,
197
+ "task_specific_params": null,
198
+ "temperature": 1,
199
+ "tf_legacy_loss": false,
200
+ "tie_encoder_decoder": false,
201
+ "tie_word_embeddings": false,
202
+ "tokenizer_class": null,
203
+ "top_k": 50,
204
+ "top_p": 1,
205
+ "topk_group": 1,
206
+ "topk_method": "noaux_tc",
207
+ "torchscript": false,
208
+ "transformers_version": "4.56.2",
209
+ "typical_p": 1,
210
+ "use_bfloat16": false,
211
+ "use_cache": true,
212
+ "v_head_dim": 128,
213
+ "vocab_size": 163840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  }