Text Generation
Transformers
Safetensors
step3p5
custom_code
Files changed (1) hide show
  1. config.json +10 -22
config.json CHANGED
@@ -3,17 +3,22 @@
3
  "Step3p5ForCausalLM"
4
  ],
5
  "model_type": "step3p5",
 
 
 
 
6
  "hidden_size": 4096,
7
  "intermediate_size": 11264,
8
  "num_hidden_layers": 45,
9
- "max_seq_len": 262144,
 
10
  "vocab_size": 128815,
11
  "torch_dtype": "bfloat16",
12
- "use_qk_norm": false,
13
  "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
14
- "use_mfa": false,
15
  "num_attention_heads": 64,
16
  "num_attention_groups": 8,
 
17
  "head_dim": 128,
18
  "use_moe": true,
19
  "moe_num_experts": 288,
@@ -72,8 +77,6 @@
72
  10000.0,
73
  10000.0,
74
  1000000.0,
75
- 10000.0,
76
- 10000.0,
77
  10000.0
78
  ],
79
  "use_head_wise_attn_gate": true,
@@ -127,12 +130,9 @@
127
  "sliding_attention",
128
  "sliding_attention",
129
  "full_attention",
130
- "sliding_attention",
131
- "sliding_attention",
132
  "sliding_attention"
133
  ],
134
  "use_rope_layers": [],
135
- "num_nextn_predict_layers": 3,
136
  "partial_rotary_factors": [
137
  0.5,
138
  1.0,
@@ -179,14 +179,11 @@
179
  1.0,
180
  1.0,
181
  0.5,
182
- 1.0,
183
- 1.0,
184
  1.0
185
  ],
186
  "eos_token_id": [
187
  1,
188
- 2,
189
- 128007
190
  ],
191
  "bos_token_id": 0,
192
  "attention_other_setting": {
@@ -242,8 +239,6 @@
242
  0.0,
243
  7,
244
  7,
245
- 0.0,
246
- 0.0,
247
  0.0
248
  ],
249
  "swiglu_limits_shared": [
@@ -292,14 +287,7 @@
292
  0.0,
293
  0.0,
294
  16,
295
- 0.0,
296
- 0.0,
297
  0.0
298
  ],
299
- "zero_centered": true,
300
- "max_position_embeddings": 262144,
301
- "yarn_only_types": [
302
- "full_attention"
303
- ],
304
- "model_max_lengths": 262144
305
  }
 
3
  "Step3p5ForCausalLM"
4
  ],
5
  "model_type": "step3p5",
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_step3p5.Step3p5Config",
8
+ "AutoModelForCausalLM": "modeling_step3p5.Step3p5ForCausalLM"
9
+ },
10
  "hidden_size": 4096,
11
  "intermediate_size": 11264,
12
  "num_hidden_layers": 45,
13
+ "max_seq_len": 32768,
14
+ "max_position_embeddings": 32768,
15
  "vocab_size": 128815,
16
  "torch_dtype": "bfloat16",
17
+ "use_qk_norm": true,
18
  "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
 
19
  "num_attention_heads": 64,
20
  "num_attention_groups": 8,
21
+ "num_nextn_predict_layers": 1,
22
  "head_dim": 128,
23
  "use_moe": true,
24
  "moe_num_experts": 288,
 
77
  10000.0,
78
  10000.0,
79
  1000000.0,
 
 
80
  10000.0
81
  ],
82
  "use_head_wise_attn_gate": true,
 
130
  "sliding_attention",
131
  "sliding_attention",
132
  "full_attention",
 
 
133
  "sliding_attention"
134
  ],
135
  "use_rope_layers": [],
 
136
  "partial_rotary_factors": [
137
  0.5,
138
  1.0,
 
179
  1.0,
180
  1.0,
181
  0.5,
 
 
182
  1.0
183
  ],
184
  "eos_token_id": [
185
  1,
186
+ 2
 
187
  ],
188
  "bos_token_id": 0,
189
  "attention_other_setting": {
 
239
  0.0,
240
  7,
241
  7,
 
 
242
  0.0
243
  ],
244
  "swiglu_limits_shared": [
 
287
  0.0,
288
  0.0,
289
  16,
 
 
290
  0.0
291
  ],
292
+ "zero_centered": true
 
 
 
 
 
293
  }