model_name: shc-evo2-20b


vocab_size: 512
hidden_size: 8192

num_filters: 8192
hcl_layer_idxs: [2,6,9,13,16,20,23]
hcm_layer_idxs: [1,5,8,12,15,19,22]
hcs_layer_idxs: [0,4,7,11,14,18,21]
attn_layer_idxs: [3,10,17]
hcm_filter_length: 128
hcl_filter_groups: 8192
hcm_filter_groups: 512
hcs_filter_groups: 512
hcs_filter_length: 7
num_layers: 24



short_filter_length: 3
num_attention_heads: 64
short_filter_bias: false
mlp_init_method: torch.nn.init.zeros_
mlp_output_init_method: torch.nn.init.zeros_
eps: 0.000001
state_size: 16
rotary_emb_base: 1000000
rotary_emb_scaling_factor: 128
use_interpolated_rotary_pos_emb: True
make_vocab_size_divisible_by: 8
inner_size_multiple_of: 128
inner_mlp_size: 22528
log_intermediate_values: False

proj_groups: 1

hyena_filter_groups: 1

column_split_hyena: False
column_split: True
interleave: True

evo2_style_activations: True


use_fp8_input_projections: True



model_parallel_size: 1
pipe_parallel_size: 1
tie_embeddings: True
mha_out_proj_bias: True
hyena_out_proj_bias: True
hyena_flip_x1x2: False
qkv_proj_bias: False
max_seqlen: 1048576
max_batch_size: 1
final_norm: True
use_flash_attn: True
use_flash_rmsnorm: False
use_flash_depthwise: False
use_flashfft: False
use_laughing_hyena: False
inference_mode: True
tokenizer_type: CharLevelTokenizer
prefill_style: fft
mlp_activation: gelu
print_activations: False
