zddos committed on
Commit
fc11ad8
·
verified ·
1 Parent(s): 4bd69a0

Trained challenger (hash=3f969c4b7f1a3eb9)

Browse files
Files changed (3) hide show
  1. config.json +25 -23
  2. model.safetensors +2 -2
  3. tokenizer_config.json +1 -1
config.json CHANGED
@@ -1,20 +1,22 @@
1
  {
2
- "_sliding_window_pattern": 6,
3
  "architectures": [
4
- "Gemma3ForCausalLM"
5
  ],
6
  "attention_bias": false,
7
  "attention_dropout": 0.0,
8
- "attn_logit_softcapping": null,
9
  "bos_token_id": 2,
10
  "dtype": "bfloat16",
11
- "eos_token_id": 106,
 
12
  "final_logit_softcapping": null,
 
13
  "head_dim": 256,
14
  "hidden_activation": "gelu_pytorch_tanh",
15
- "hidden_size": 1152,
 
16
  "initializer_range": 0.02,
17
- "intermediate_size": 6400,
18
  "layer_types": [
19
  "sliding_attention",
20
  "sliding_attention",
@@ -33,26 +35,24 @@
33
  "sliding_attention",
34
  "sliding_attention",
35
  "sliding_attention",
36
- "full_attention",
37
- "sliding_attention",
38
- "sliding_attention",
39
- "sliding_attention",
40
- "sliding_attention",
41
- "sliding_attention",
42
  "full_attention"
43
  ],
44
- "max_position_embeddings": 32768,
45
- "model_type": "gemma3_text",
46
- "num_attention_heads": 4,
47
- "num_hidden_layers": 24,
48
- "num_key_value_heads": 1,
 
 
 
 
49
  "pad_token_id": 0,
50
- "query_pre_attn_scalar": 256,
51
  "rms_norm_eps": 1e-06,
52
  "rope_parameters": {
53
  "full_attention": {
 
54
  "rope_theta": 1000000.0,
55
- "rope_type": "default"
56
  },
57
  "sliding_attention": {
58
  "rope_theta": 10000.0,
@@ -60,10 +60,12 @@
60
  }
61
  },
62
  "sliding_window": 512,
63
- "sliding_window_pattern": 6,
64
  "tie_word_embeddings": true,
 
65
  "transformers_version": "5.5.4",
66
- "use_bidirectional_attention": false,
67
- "use_cache": false,
68
- "vocab_size": 262144
 
 
69
  }
 
1
  {
 
2
  "architectures": [
3
+ "Gemma4ForCausalLM"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
+ "attention_k_eq_v": false,
8
  "bos_token_id": 2,
9
  "dtype": "bfloat16",
10
+ "enable_moe_block": false,
11
+ "eos_token_id": 1,
12
  "final_logit_softcapping": null,
13
+ "global_head_dim": 512,
14
  "head_dim": 256,
15
  "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2304,
17
+ "hidden_size_per_layer_input": 256,
18
  "initializer_range": 0.02,
19
+ "intermediate_size": 9216,
20
  "layer_types": [
21
  "sliding_attention",
22
  "sliding_attention",
 
35
  "sliding_attention",
36
  "sliding_attention",
37
  "sliding_attention",
 
 
 
 
 
 
38
  "full_attention"
39
  ],
40
+ "max_position_embeddings": 131072,
41
+ "model_type": "gemma4_text",
42
+ "moe_intermediate_size": null,
43
+ "num_attention_heads": 8,
44
+ "num_experts": null,
45
+ "num_global_key_value_heads": null,
46
+ "num_hidden_layers": 18,
47
+ "num_key_value_heads": 4,
48
+ "num_kv_shared_layers": 0,
49
  "pad_token_id": 0,
 
50
  "rms_norm_eps": 1e-06,
51
  "rope_parameters": {
52
  "full_attention": {
53
+ "partial_rotary_factor": 0.25,
54
  "rope_theta": 1000000.0,
55
+ "rope_type": "proportional"
56
  },
57
  "sliding_attention": {
58
  "rope_theta": 10000.0,
 
60
  }
61
  },
62
  "sliding_window": 512,
 
63
  "tie_word_embeddings": true,
64
+ "top_k_experts": null,
65
  "transformers_version": "5.5.4",
66
+ "use_bidirectional_attention": null,
67
+ "use_cache": true,
68
+ "use_double_wide_mlp": false,
69
+ "vocab_size": 262144,
70
+ "vocab_size_per_layer_input": 262144
71
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54db4aaa5be5b7013480f0b3bb62a51cd80e6cc1d08f7e29151badc86c08d54d
3
- size 1807505032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f969c4b7f1a3eb9c231c2752cceea47e0ce74c18a5314be75fb8e314356745c
3
+ size 6576171052
tokenizer_config.json CHANGED
@@ -6,7 +6,7 @@
6
  "eoi_token": "<end_of_image>",
7
  "eos_token": "<end_of_turn>",
8
  "image_token": "<image_soft_token>",
9
- "is_local": false,
10
  "mask_token": "<mask>",
11
  "model_max_length": 32768,
12
  "model_specific_special_tokens": {
 
6
  "eoi_token": "<end_of_image>",
7
  "eos_token": "<end_of_turn>",
8
  "image_token": "<image_soft_token>",
9
+ "is_local": true,
10
  "mask_token": "<mask>",
11
  "model_max_length": 32768,
12
  "model_specific_special_tokens": {