Trained challenger (hash=3f969c4b7f1a3eb9)

Browse files

Files changed (3) hide show

config.json +25 -23
model.safetensors +2 -2
tokenizer_config.json +1 -1

config.json CHANGED Viewed

@@ -1,20 +1,22 @@
 {
-  "_sliding_window_pattern": 6,
   "architectures": [
-    "Gemma3ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
-  "attn_logit_softcapping": null,
   "bos_token_id": 2,
   "dtype": "bfloat16",
-  "eos_token_id": 106,
   "final_logit_softcapping": null,
   "head_dim": 256,
   "hidden_activation": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
   "initializer_range": 0.02,
-  "intermediate_size": 6400,
   "layer_types": [
     "sliding_attention",
     "sliding_attention",
@@ -33,26 +35,24 @@
     "sliding_attention",
     "sliding_attention",
     "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
     "full_attention"
   ],
-  "max_position_embeddings": 32768,
-  "model_type": "gemma3_text",
-  "num_attention_heads": 4,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 1,
   "pad_token_id": 0,
-  "query_pre_attn_scalar": 256,
   "rms_norm_eps": 1e-06,
   "rope_parameters": {
     "full_attention": {
       "rope_theta": 1000000.0,
-      "rope_type": "default"
     },
     "sliding_attention": {
       "rope_theta": 10000.0,
@@ -60,10 +60,12 @@
     }
   },
   "sliding_window": 512,
-  "sliding_window_pattern": 6,
   "tie_word_embeddings": true,
   "transformers_version": "5.5.4",
-  "use_bidirectional_attention": false,
-  "use_cache": false,
-  "vocab_size": 262144
 }

 {
   "architectures": [
+    "Gemma4ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
+  "attention_k_eq_v": false,
   "bos_token_id": 2,
   "dtype": "bfloat16",
+  "enable_moe_block": false,
+  "eos_token_id": 1,
   "final_logit_softcapping": null,
+  "global_head_dim": 512,
   "head_dim": 256,
   "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2304,
+  "hidden_size_per_layer_input": 256,
   "initializer_range": 0.02,
+  "intermediate_size": 9216,
   "layer_types": [
     "sliding_attention",
     "sliding_attention",
     "sliding_attention",
     "sliding_attention",
     "sliding_attention",
     "full_attention"
   ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma4_text",
+  "moe_intermediate_size": null,
+  "num_attention_heads": 8,
+  "num_experts": null,
+  "num_global_key_value_heads": null,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 4,
+  "num_kv_shared_layers": 0,
   "pad_token_id": 0,
   "rms_norm_eps": 1e-06,
   "rope_parameters": {
     "full_attention": {
+      "partial_rotary_factor": 0.25,
       "rope_theta": 1000000.0,
+      "rope_type": "proportional"
     },
     "sliding_attention": {
       "rope_theta": 10000.0,
     }
   },
   "sliding_window": 512,
   "tie_word_embeddings": true,
+  "top_k_experts": null,
   "transformers_version": "5.5.4",
+  "use_bidirectional_attention": null,
+  "use_cache": true,
+  "use_double_wide_mlp": false,
+  "vocab_size": 262144,
+  "vocab_size_per_layer_input": 262144
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54db4aaa5be5b7013480f0b3bb62a51cd80e6cc1d08f7e29151badc86c08d54d
-size 1807505032

 version https://git-lfs.github.com/spec/v1
+oid sha256:3f969c4b7f1a3eb9c231c2752cceea47e0ce74c18a5314be75fb8e314356745c
+size 6576171052

tokenizer_config.json CHANGED Viewed

@@ -6,7 +6,7 @@
   "eoi_token": "<end_of_image>",
   "eos_token": "<end_of_turn>",
   "image_token": "<image_soft_token>",
-  "is_local": false,
   "mask_token": "<mask>",
   "model_max_length": 32768,
   "model_specific_special_tokens": {

   "eoi_token": "<end_of_image>",
   "eos_token": "<end_of_turn>",
   "image_token": "<image_soft_token>",
+  "is_local": true,
   "mask_token": "<mask>",
   "model_max_length": 32768,
   "model_specific_special_tokens": {