arcinstitute
/

evo2_20b

Model card Files Files and versions

gbrixi commited on Feb 6

Commit

205923d

·

verified ·

1 Parent(s): 46ea8db

Add model configuration

Files changed (1) hide show

config.yml +66 -0

config.yml ADDED Viewed

	@@ -0,0 +1,66 @@

+model_name: shc-evo2-20b
+vocab_size: 512
+hidden_size: 8192
+# Number of independent filters in Hyena-LI
+num_filters: 8192
+hcl_layer_idxs: [2,6,9,13,16,20,23]
+hcm_layer_idxs: [1,5,8,12,15,19,22]
+hcs_layer_idxs: [0,4,7,11,14,18,21]
+attn_layer_idxs: [3,10,17]
+hcm_filter_length: 128
+hcl_filter_groups: 8192
+hcm_filter_groups: 512
+hcs_filter_groups: 512
+hcs_filter_length: 7
+num_layers: 24
+# Length of the short, depthwise FIR applied to input projections
+short_filter_length: 3
+num_attention_heads: 64
+short_filter_bias: false # add bias to FIR
+mlp_init_method: torch.nn.init.zeros_
+mlp_output_init_method: torch.nn.init.zeros_
+eps: 0.000001
+state_size: 16
+rotary_emb_base: 1000000
+rotary_emb_scaling_factor: 128
+use_interpolated_rotary_pos_emb: True
+make_vocab_size_divisible_by: 8
+inner_size_multiple_of: 128  # force GLU inner_size to be a multiple of
+inner_mlp_size: 22528
+log_intermediate_values: False
+# Number of groups in GQA
+proj_groups: 1
+# Number of groups in grouped
+hyena_filter_groups: 1
+# Split strategy for channels
+column_split_hyena: False
+column_split: True
+interleave: True
+# Layer > 0 nn.identity activation
+evo2_style_activations: True
+use_fp8_input_projections: True
+# Legacy options for MP / PP inference
+model_parallel_size: 1
+pipe_parallel_size: 1
+tie_embeddings: True
+mha_out_proj_bias: True
+hyena_out_proj_bias: True
+hyena_flip_x1x2: False
+qkv_proj_bias: False
+max_seqlen: 1048576
+max_batch_size: 1
+final_norm: True
+use_flash_attn: True
+use_flash_rmsnorm: False
+use_flash_depthwise: False
+use_flashfft: False
+use_laughing_hyena: False
+inference_mode: True
+tokenizer_type: CharLevelTokenizer
+prefill_style: fft
+mlp_activation: gelu
+print_activations: False