Maxtimer97 committed
Commit 48af17f · verified · 1 Parent(s): 52551dd

Upload HymbaForCausalLM

Files changed (3)
  1. config.json +22 -22
  2. generation_config.json +2 -1
  3. model.safetensors +2 -2
config.json CHANGED
@@ -4,7 +4,7 @@
   ],
   "attention_dropout": 0.0,
   "attn_factor": 0.5,
-  "attn_hidden_size": -1,
+  "attn_hidden_size": 1024,
   "attn_implementation": "flash_attention_2",
   "attn_implementation_new": "flash_attention_2",
   "attn_only_wo_proj": true,
@@ -15,19 +15,19 @@
   "bos_token_id": 128000,
   "calc_logits_for_entire_prompt": false,
   "conv_dim": {
-    "0": 3200,
-    "1": 3200,
-    "10": 3200,
-    "11": 3200,
-    "12": 3200,
-    "13": 3200,
-    "14": 3200,
-    "15": 3200,
+    "0": 256,
+    "1": 256,
+    "10": 256,
+    "11": 256,
+    "12": 256,
+    "13": 256,
+    "14": 256,
+    "15": 256,
     "16": 3200,
     "17": 3200,
     "18": 3200,
     "19": 3200,
-    "2": 3200,
+    "2": 256,
     "20": 3200,
     "21": 3200,
     "22": 3200,
@@ -38,15 +38,15 @@
     "27": 3200,
     "28": 3200,
     "29": 3200,
-    "3": 3200,
+    "3": 256,
     "30": 3200,
     "31": 3200,
-    "4": 3200,
-    "5": 3200,
-    "6": 3200,
-    "7": 3200,
-    "8": 3200,
-    "9": 3200
+    "4": 256,
+    "5": 256,
+    "6": 256,
+    "7": 256,
+    "8": 256,
+    "9": 256
   },
   "eos_token_id": [
     128001,
@@ -58,7 +58,7 @@
     8,
     15
   ],
-  "head_dim": 128,
+  "head_dim": 64,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "initializer_range": 0.02,
@@ -107,7 +107,7 @@
   "orig_max_position_embeddings": 2048,
   "output_router_logits": false,
   "pad_token_id": null,
-  "pure_attn": true,
+  "pure_attn": false,
   "rms_norm_eps": 1e-05,
   "rope": true,
   "rope_scaling": {
@@ -121,12 +121,12 @@
   "rope_type": "ntk",
   "router_aux_loss_coef": 0.001,
   "seq_length": 8192,
-  "sliding_window": 256,
+  "sliding_window": 1024,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.53.0",
-  "use_cache": true,
+  "use_cache": false,
   "use_mamba_kernels": true,
-  "v_head_dim": 128,
+  "v_head_dim": 64,
   "vocab_size": 128256
 }
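The config change reshapes the attention geometry: attn_hidden_size goes from the -1 sentinel to an explicit 1024, head_dim and v_head_dim drop from 128 to 64, conv_dim for layers 0-15 shrinks from 3200 to 256, sliding_window grows from 256 to 1024, and pure_attn / use_cache are switched off. A minimal sketch (not part of the commit) for reading these fields back, assuming a placeholder repo id and that Hymba's custom config class, loaded via trust_remote_code, exposes them as plain attributes:

# Minimal sketch: load the updated config and inspect the fields this commit touches.
# "<user>/<hymba-repo>" is a placeholder; substitute the actual repository path.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("<user>/<hymba-repo>", trust_remote_code=True)

print(config.attn_hidden_size)                       # 1024 after this commit (was -1)
print(config.head_dim, config.v_head_dim)            # 64 64 (was 128 128)
print(config.sliding_window)                         # 1024 (was 256)
print(config.conv_dim["0"], config.conv_dim["16"])   # 256 3200 (only layers 0-15 shrank)
print(config.pure_attn, config.use_cache)            # False False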
generation_config.json CHANGED
@@ -6,5 +6,6 @@
     128008,
     128009
   ],
-  "transformers_version": "4.53.0"
+  "transformers_version": "4.53.0",
+  "use_cache": false
 }
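generation_config.json now carries "use_cache": false alongside the transformers_version entry, so generation runs without the KV cache by default. A minimal usage sketch, assuming a placeholder repo id and the standard transformers behavior that per-call kwargs override the stored generation config:

# Minimal sketch: generate with the defaults from generation_config.json
# (use_cache is now false), then override the flag for a single call.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "<user>/<hymba-repo>"  # placeholder repo id
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("Hello", return_tensors="pt")
out_default = model.generate(**inputs, max_new_tokens=16)                 # honors use_cache=False
out_cached = model.generate(**inputs, max_new_tokens=16, use_cache=True)  # per-call override
print(tokenizer.decode(out_default[0], skip_special_tokens=True))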
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8059870eeb7862b85cd13ba766340d59fcd331bd3c3b9fba8e827fe243e1463e
-size 2471642200
+oid sha256:af241d8ca1c5bbc2b53d02935fa490c3a3bfeb6a06153da202ba212974c0a556
+size 2549222456
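The model.safetensors entry is a Git LFS pointer, so the diff records only the new content hash and byte size. A minimal sketch for checking that a locally resolved weights file matches this commit's pointer, relying on the standard LFS convention that oid is the SHA-256 of the file contents and size is its length in bytes:

# Minimal sketch: verify a downloaded model.safetensors against the LFS pointer
# values recorded in this commit.
import hashlib
from pathlib import Path

EXPECTED_OID = "af241d8ca1c5bbc2b53d02935fa490c3a3bfeb6a06153da202ba212974c0a556"
EXPECTED_SIZE = 2549222456

path = Path("model.safetensors")  # the resolved weights file, not the pointer text
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")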