Kilinskiy committed on
Commit
acad00d
·
verified ·
1 Parent(s): d18839b

Upload config.json

Browse files
Files changed (1) hide show
  1. config.json +264 -0
config.json ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Step3p5ForCausalLM"
4
+ ],
5
+ "att_impl_type": "GQA",
6
+ "attention_other_setting": {
7
+ "attention_type": "sliding_attention",
8
+ "head_dim": 128,
9
+ "num_attention_groups": 8,
10
+ "num_attention_heads": 96,
11
+ "true_head_dim": 128
12
+ },
13
+ "auto_map": {
14
+ "AutoConfig": "configuration_step3p5.Step3p5Config",
15
+ "AutoModelForCausalLM": "modeling_step3p5.Step3p5ForCausalLM"
16
+ },
17
+ "bos_token_id": 0,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 1,
20
+ "head_dim": 128,
21
+ "hidden_size": 4096,
22
+ "intermediate_size": 11264,
23
+ "layer_types": [
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "full_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "full_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "full_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "sliding_attention"
72
+ ],
73
+ "max_position_embeddings": 262144,
74
+ "max_seq_len": 262144,
75
+ "model_type": "step3p5",
76
+ "moe_every_n_layer": 1,
77
+ "moe_intermediate_size": 1280,
78
+ "moe_layer_offset": 0,
79
+ "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
80
+ "moe_num_experts": 288,
81
+ "moe_router_activation": "sigmoid",
82
+ "moe_router_scaling_factor": 3.0,
83
+ "moe_top_k": 8,
84
+ "need_fp32_gate": true,
85
+ "norm_expert_weight": true,
86
+ "num_attention_groups": 8,
87
+ "num_attention_heads": 64,
88
+ "num_hidden_layers": 45,
89
+ "num_nextn_predict_layers": 3,
90
+ "output_hidden_states": true,
91
+ "pad_token_id": 1,
92
+ "partial_rotary_factor": 0.5,
93
+ "partial_rotary_factors": [
94
+ 0.5,
95
+ 1.0,
96
+ 1.0,
97
+ 1.0,
98
+ 0.5,
99
+ 1.0,
100
+ 1.0,
101
+ 1.0,
102
+ 0.5,
103
+ 1.0,
104
+ 1.0,
105
+ 1.0,
106
+ 0.5,
107
+ 1.0,
108
+ 1.0,
109
+ 1.0,
110
+ 0.5,
111
+ 1.0,
112
+ 1.0,
113
+ 1.0,
114
+ 0.5,
115
+ 1.0,
116
+ 1.0,
117
+ 1.0,
118
+ 0.5,
119
+ 1.0,
120
+ 1.0,
121
+ 1.0,
122
+ 0.5,
123
+ 1.0,
124
+ 1.0,
125
+ 1.0,
126
+ 0.5,
127
+ 1.0,
128
+ 1.0,
129
+ 1.0,
130
+ 0.5,
131
+ 1.0,
132
+ 1.0,
133
+ 1.0,
134
+ 0.5,
135
+ 1.0,
136
+ 1.0,
137
+ 1.0,
138
+ 0.5,
139
+ 1.0,
140
+ 1.0,
141
+ 1.0
142
+ ],
143
+ "rms_norm_eps": 1e-05,
144
+ "rope_parameters": null,
145
+ "rope_theta": 5000000.0,
146
+ "share_expert_dim": 1280,
147
+ "sink": false,
148
+ "sliding_window": 512,
149
+ "swiglu_limits": [
150
+ 0.0,
151
+ 0.0,
152
+ 0.0,
153
+ 0.0,
154
+ 0.0,
155
+ 0.0,
156
+ 0.0,
157
+ 0.0,
158
+ 0.0,
159
+ 0.0,
160
+ 0.0,
161
+ 0.0,
162
+ 0.0,
163
+ 0.0,
164
+ 0.0,
165
+ 0.0,
166
+ 0.0,
167
+ 0.0,
168
+ 0.0,
169
+ 0.0,
170
+ 0.0,
171
+ 0.0,
172
+ 0.0,
173
+ 0.0,
174
+ 0.0,
175
+ 0.0,
176
+ 0.0,
177
+ 0.0,
178
+ 0.0,
179
+ 0.0,
180
+ 0.0,
181
+ 0.0,
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0,
186
+ 0.0,
187
+ 0.0,
188
+ 0.0,
189
+ 0.0,
190
+ 0.0,
191
+ 0.0,
192
+ 0.0,
193
+ 7.0,
194
+ 7.0,
195
+ 0.0,
196
+ 0.0,
197
+ 0.0
198
+ ],
199
+ "swiglu_limits_shared": [
200
+ 0.0,
201
+ 0.0,
202
+ 0.0,
203
+ 0.0,
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0,
208
+ 0.0,
209
+ 0.0,
210
+ 0.0,
211
+ 0.0,
212
+ 0.0,
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0,
220
+ 0.0,
221
+ 0.0,
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0,
229
+ 0.0,
230
+ 0.0,
231
+ 0.0,
232
+ 0.0,
233
+ 0.0,
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0,
238
+ 0.0,
239
+ 0.0,
240
+ 0.0,
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 16.0,
245
+ 0.0,
246
+ 0.0,
247
+ 0.0
248
+ ],
249
+ "transformers_version": "5.1.0",
250
+ "use_cache": false,
251
+ "use_head_wise_attn_gate": true,
252
+ "use_moe": true,
253
+ "use_moe_router_bias": true,
254
+ "use_qk_norm": true,
255
+ "use_return_dict": true,
256
+ "use_rope_layers": [],
257
+ "vocab_size": 128896,
258
+ "yarn_only_types": [
259
+ "full_attention"
260
+ ],
261
+ "zero_centered": true,
262
+ "num_key_value_heads": 8,
263
+ "expert_intermediate_size": 1280
264
+ }