EpistemeAI
/

gpt-oss-20b-stem-4bit

@@ -1,107 +1,69 @@
 {
-  "architectures": [
-    "GptOssForCausalLM"
-  ],
-  "attention_bias": true,
-  "attention_dropout": 0.0,
-  "eos_token_id": 200002,
-  "experts_per_token": 4,
-  "head_dim": 64,
-  "hidden_act": "silu",
-  "hidden_size": 2880,
-  "initial_context_length": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 2880,
-  "layer_types": [
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 131072,
-  "model_type": "gpt_oss",
-  "num_attention_heads": 64,
-  "num_experts_per_tok": 4,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 8,
-  "num_local_experts": 32,
-  "output_router_logits": false,
-  "pad_token_id": 199999,
-  "quantization_config": {
-    "bnb_4bit_compute_dtype": "float16",
-    "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
-    "llm_int8_enable_fp32_cpu_offload": false,
-    "llm_int8_has_fp16_weight": false,
-    "llm_int8_skip_modules": [
-      "model.layers.0.mlp.router.linear",
-      "model.layers.1.mlp.router.linear",
-      "model.layers.2.mlp.router.linear",
-      "model.layers.3.mlp.router.linear",
-      "model.layers.4.mlp.router.linear",
-      "model.layers.5.mlp.router.linear",
-      "model.layers.6.mlp.router.linear",
-      "model.layers.7.mlp.router.linear",
-      "model.layers.8.mlp.router.linear",
-      "model.layers.9.mlp.router.linear",
-      "model.layers.10.mlp.router.linear",
-      "model.layers.11.mlp.router.linear",
-      "model.layers.12.mlp.router.linear",
-      "model.layers.13.mlp.router.linear",
-      "model.layers.14.mlp.router.linear",
-      "model.layers.15.mlp.router.linear",
-      "model.layers.16.mlp.router.linear",
-      "model.layers.17.mlp.router.linear",
-      "model.layers.18.mlp.router.linear",
-      "model.layers.19.mlp.router.linear",
-      "model.layers.20.mlp.router.linear",
-      "model.layers.21.mlp.router.linear",
-      "model.layers.22.mlp.router.linear",
-      "model.layers.23.mlp.router.linear",
-      "lm_head"
     ],
-    "llm_int8_threshold": 6.0,
-    "load_in_4bit": true,
-    "load_in_8bit": false,
-    "quant_method": "bitsandbytes"
-  },
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": {
-    "beta_fast": 32.0,
-    "beta_slow": 1.0,
-    "factor": 32.0,
-    "original_max_position_embeddings": 4096,
-    "rope_type": "yarn",
-    "truncate": false
-  },
-  "rope_theta": 150000,
-  "router_aux_loss_coef": 0.9,
-  "sliding_window": 128,
-  "swiglu_limit": 7.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float16",
-  "transformers_version": "4.55.4",
-  "unsloth_version": "2025.9.5",
-  "use_cache": true,
-  "vocab_size": 201088
-}

 {
+    "architectures": [
+        "GptOssForCausalLM"
     ],
+    "attention_bias": true,
+    "attention_dropout": 0.0,
+    "eos_token_id": 200002,
+    "experts_per_token": 4,
+    "head_dim": 64,
+    "hidden_act": "silu",
+    "hidden_size": 2880,
+    "initial_context_length": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 2880,
+    "layer_types": [
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gpt_oss",
+    "num_attention_heads": 64,
+    "num_experts_per_tok": 4,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 8,
+    "num_local_experts": 32,
+    "output_router_logits": false,
+    "pad_token_id": 199999,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+        "beta_fast": 32.0,
+        "beta_slow": 1.0,
+        "factor": 32.0,
+        "original_max_position_embeddings": 4096,
+        "rope_type": "yarn",
+        "truncate": false
+    },
+    "rope_theta": 150000,
+    "router_aux_loss_coef": 0.9,
+    "sliding_window": 128,
+    "swiglu_limit": 7.0,
+    "tie_word_embeddings": false,
+    "torch_dtype": "float16",
+    "transformers_version": "4.55.4",
+    "unsloth_version": "2025.9.5",
+    "use_cache": true,
+    "vocab_size": 201088
+}