arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 2048
                n_vocab: 32256
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: LlamaAttention
              args:
                n_embed: 2048
                n_pos: 16384
                n_head: 16
                n_key_value_head: 16
                head_size: 128
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: false
                bias_proj: false
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                rope_config:
                  type: RotaryPositionEmbedding
                  args:
                    head_size: 128
                    n_pos: 16384
                    base: 100000
                    scaling_type: linear
                    scaling_factor: 4.0
            mlp_config:
              type: LlamaMLP
              args:
                n_embed: 2048
                n_inner: 5504
                act_fn_config:
                  type: SiLUActivation
                  args: {}
            ln_config:
              type: LlamaRMSNorm
              args:
                n_embed: 2048
                ln_eps: 1.0e-06
            n_embed: 2048
            post_norm: false
            add_cross_attn: false
        n_embed: 2048
        n_layer: 24
        n_head: 16
        ln_config:
          type: LlamaRMSNorm
          args:
            n_embed: 2048
            ln_eps: 1.0e-06
        perform_linear_bias: false
        attn_window_size_loop_unit: null
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 32256
        n_embed: 2048
        perform_transform: false
        act_fn_config: null
        ln_config: null