Update config.json
config.json (+13 -2)
@@ -1,28 +1,39 @@
-{
+config_check = {
     "n_embd": 1792,
     "n_inner": 3584,
     "n_head": 16,
     "n_layer": 36,
 
+    "mlp_fc1_bias": False,
+    "mlp_fc2_bias": False,
+    "out_proj_bias": False,
+    "qkv_proj_bias": False,
+    "reorder_and_upcast_attn": False,
+    "scale_attn_by_inverse_layer_idx": False,
+
     "activation_function": "swiglu",
     "resid_pdrop": 0.0,
+    "rms_norm": True,
     "residual_in_fp32": True,
     "pad_vocab_size_multiple": 8,
     "use_flash_attn": True,
     "special_initializer": True,
+    "rotary_emb_fraction": 1,
     "max_position_embeddings": 0,
 
     "alt_mixer_layers": [1, 6, 11, 16, 21, 27, 33],
-"
+    "alt_mixer_2_layers": [2, 7, 12, 17, 22, 28, 34],
     "mixer": {
         "_target_": "based.models.mixers.base_conv.BaseConvWithSiLU4",
         "expand_proj": 4,
         "l_max": 2048,
         "kernel_sizes": [3],
+        "use_bias": True,
     },
     "alt_mixer": {
         "_target_": "based.models.mixers.linear_attn.LinearAttention",
         "feature_dim": 16,
+        "feature_name": "taylor_exp",
         "l_max": 2048,
         "num_heads": 16,
         "num_key_value_heads": 16,
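
For reference, the "_target_" entries in the mixer blocks follow Hydra's instantiation convention. A minimal sketch of resolving one such block, assuming Hydra/OmegaConf are installed and the based package is importable; the loader below is illustrative, not the repo's actual entry point:

    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    # Mirror the "alt_mixer" block from the diff above.
    alt_mixer_cfg = OmegaConf.create({
        "_target_": "based.models.mixers.linear_attn.LinearAttention",
        "feature_dim": 16,
        "feature_name": "taylor_exp",
        "l_max": 2048,
        "num_heads": 16,
        "num_key_value_heads": 16,
    })

    # instantiate() imports the _target_ class and calls it with the
    # remaining keys as keyword arguments.
    mixer = instantiate(alt_mixer_cfg)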