Transformers
PyTorch
English
simarora commited on
Commit
d7359c4
·
verified ·
1 Parent(s): 1399e2e

Update config.json

Browse files
Files changed (1) hide show
  1. config.json +36 -6
config.json CHANGED
@@ -1,7 +1,37 @@
1
  {
2
- "d_model": 1808,
3
- "n_head": 16,
4
- "n_layer": 38,
5
- "residual_in_fp32": true,
6
- "pad_vocab_size_multiple": 8
7
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "n_embd": 1792,
3
+ "n_inner": 3584,
4
+ "n_head": 16,
5
+ "n_layer": 36,
6
+
7
+ "activation_function": "swiglu",
8
+ "resid_pdrop": 0.0,
9
+ "residual_in_fp32": true,
10
+ "pad_vocab_size_multiple": 8,
11
+ "use_flash_attn": true,
12
+ "special_initializer": true,
13
+ "max_position_embeddings": 0,
14
+
15
+ "alt_mixer_layers": [1, 6, 11, 16, 21, 27, 33],
16
+ "alt_2_mixer_layers": [2, 7, 12, 17, 22, 28, 34],
17
+ "mixer": {
18
+ "_target_": "based.models.mixers.base_conv.BaseConvWithSiLU4",
19
+ "expand_proj": 4,
20
+ "l_max": 2048,
21
+ "kernel_sizes": [3]
22
+ },
23
+ "alt_mixer": {
24
+ "_target_": "based.models.mixers.linear_attn.LinearAttention",
25
+ "feature_dim": 16,
26
+ "l_max": 2048,
27
+ "num_heads": 16,
28
+ "num_key_value_heads": 16,
29
+ "train_view": "linear"
30
+ },
31
+ "alt_mixer_2": {
32
+ "_target_": "based.models.mixers.slide_fa2.SlidingsMHA",
33
+ "causal": true,
34
+ "num_heads": 16,
35
+ "window_size": 128
36
+ }
37
+ }