# Decoder-only transformer language-model configuration.
# Dimensions are self-consistent: n_head (32) * head_size (64) = n_embed (2048),
# n_inner = 4 * n_embed, rotary_head_size = head_size / 2.
# NOTE(review): nesting reconstructed from a table-garbled copy — confirm against
# the framework's config schema.
arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        # Token embedding only: no learned positional or token-type embeddings
        # (positions are handled by the rotary embedding inside attention).
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 2048
                n_vocab: 51200
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
        # Per-layer template, instantiated n_layer (24) times.
        decoder_config:
          type: ParallelTransformerDecoderBlock
          args:
            attn_config:
              type: GPTNeoXAttention
              args:
                n_embed: 2048
                n_pos: 2048
                n_head: 32
                n_key_value_head: 32  # equals n_head -> full multi-head (no KV grouping)
                head_size: 64         # 32 heads * 64 = 2048 = n_embed
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                rope_config:
                  type: MistralRotaryEmbedding
                  args:
                    rotary_head_size: 32  # half of head_size -> partial rotary
                    n_pos: 2048
                    base: 10000
                    scaling_type: null
                    scaling_factor: null
                perform_bloom_split_head: false
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 2048
                n_inner: 8192  # 4 * n_embed
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.0
            ln_config:
              type: LayerNorm
              args:
                n_embed: 2048
                ln_eps: 1.0e-05
            n_embed: 2048
            post_norm: false        # pre-norm residual layout
            add_cross_attn: false
            share_layer_norm: true  # presumably one LN shared by attn/MLP branches — verify in block impl
        n_embed: 2048
        n_layer: 24
        n_head: 32
        # Final LayerNorm applied after the decoder stack.
        ln_config:
          type: LayerNorm
          args:
            n_embed: 2048
            ln_eps: 1.0e-05
        perform_linear_bias: false  # NOTE(review): presumably ALiBi-style bias toggle — confirm
        attn_window_size_loop_unit: null
    # Output head projecting final hidden states to vocabulary logits.
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 51200
        n_embed: 2048
        bias_lm_head: true
        perform_transform: false
        act_fn_config: null
        ln_config: null