Text Generation
Transformers
Safetensors
qwen3_next
conversational
Entity-27th committed on
Commit
e46f2e8
·
1 Parent(s): 31402d0

Upload Qwen3NextForCausalLM

Browse files
Files changed (1) hide show
  1. config.json +63 -26
config.json CHANGED
@@ -1,41 +1,78 @@
1
  {
2
  "architectures": [
3
- "CustomTransformerForCausalLM"
4
  ],
 
5
  "attention_dropout": 0.0,
 
6
  "dtype": "float32",
7
- "enable_checkpointing": false,
8
  "gate_alpha": 0.8,
9
  "gate_normalize": true,
10
- "gate_temperature": 1.5,
11
- "gated_attention_min_tokens": 0.05,
12
- "gated_attention_reduction": 16,
13
- "gated_attention_threshold": 0.25,
14
- "gated_delta_net_bias": true,
15
- "gated_delta_net_min_tokens": 0.05,
16
- "gated_delta_net_reduction": 16,
17
- "gated_delta_net_threshold": 0.25,
18
- "hidden_size": 2560,
19
- "intermediate_size": 6144,
20
- "kv_cast_dtype": "bfloat16",
21
- "max_position_embeddings": 128000,
22
- "model_type": "ISAC-V0",
23
- "num_attention_heads": 40,
24
- "num_hidden_layers": 40,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  "num_key_value_heads": 10,
 
 
26
  "pr_chunk_size": 2048,
27
  "pr_degree": 2,
28
- "rms_norm_eps": 1e-05,
29
  "rope_scaling": {
30
- "factor": 31.25,
31
- "original_max_position_embeddings": 4096,
32
- "type": "ntk"
33
  },
34
  "rope_theta": 10000.0,
35
- "sliding_window": null,
 
 
36
  "transformers_version": "4.57.1",
37
- "use_cache": false,
38
- "use_gated_attention": true,
39
- "use_gated_delta_net": true,
40
- "vocab_size": 200019
41
  }
 
1
  {
2
  "architectures": [
3
+ "Qwen3NextForCausalLM"
4
  ],
5
+ "attention_bias": false,
6
  "attention_dropout": 0.0,
7
+ "decoder_sparse_step": 0,
8
  "dtype": "float32",
 
9
  "gate_alpha": 0.8,
10
  "gate_normalize": true,
11
+ "gate_temperature": 1.0,
12
+ "head_dim": 64,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 1280,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3328,
17
+ "layer_types": [
18
+ "linear_attention",
19
+ "linear_attention",
20
+ "linear_attention",
21
+ "full_attention",
22
+ "linear_attention",
23
+ "linear_attention",
24
+ "linear_attention",
25
+ "full_attention",
26
+ "linear_attention",
27
+ "linear_attention",
28
+ "linear_attention",
29
+ "full_attention",
30
+ "linear_attention",
31
+ "linear_attention",
32
+ "linear_attention",
33
+ "full_attention",
34
+ "linear_attention",
35
+ "linear_attention",
36
+ "linear_attention",
37
+ "full_attention",
38
+ "linear_attention",
39
+ "linear_attention",
40
+ "linear_attention",
41
+ "full_attention",
42
+ "linear_attention",
43
+ "linear_attention"
44
+ ],
45
+ "linear_conv_kernel_dim": 4,
46
+ "linear_key_head_dim": 128,
47
+ "linear_num_key_heads": 16,
48
+ "linear_num_value_heads": 32,
49
+ "linear_value_head_dim": 128,
50
+ "max_position_embeddings": 1048576,
51
+ "mlp_only_layers": [],
52
+ "model_type": "qwen3_next",
53
+ "moe_intermediate_size": 0,
54
+ "norm_topk_prob": true,
55
+ "num_attention_heads": 20,
56
+ "num_experts": 0,
57
+ "num_experts_per_tok": 0,
58
+ "num_hidden_layers": 26,
59
  "num_key_value_heads": 10,
60
+ "output_router_logits": false,
61
+ "partial_rotary_factor": 0.25,
62
  "pr_chunk_size": 2048,
63
  "pr_degree": 2,
64
+ "rms_norm_eps": 1e-06,
65
  "rope_scaling": {
66
+ "factor": 32.0,
67
+ "original_max_position_embeddings": 32768,
68
+ "rope_type": "yarn"
69
  },
70
  "rope_theta": 10000.0,
71
+ "router_aux_loss_coef": 0.001,
72
+ "shared_expert_intermediate_size": 0,
73
+ "token_route_threshold": 0.2,
74
  "transformers_version": "4.57.1",
75
+ "use_cache": true,
76
+ "use_power_retention": true,
77
+ "vocab_size": 151936
 
78
  }