arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 768
                n_vocab: 50257
            pos_embed_config:
              type: PositionEmbedding
              args:
                n_embed: 768
                n_pos: 1024
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.1
            concat_strategy: id_first
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: MultiHeadKeyValueAttention
              args:
                n_embed: 768
                n_pos: 1024
                n_head: 12
                head_size: 64
                p_drop_attn: 0.1
                p_drop_resid: 0.1
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                perform_linear_bias: false
                perform_bloom_split_head: false
                perform_query_scaling: false
                attn_window_size: null
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 768
                n_inner: 3072
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.1
            ln_config:
              type: LayerNorm
              args:
                n_embed: 768
                ln_eps: 1.0e-05
            n_embed: 768
            post_norm: false
            add_cross_attn: false
        n_embed: 768
        n_layer: 12
        n_head: 12
        ln_config:
          type: LayerNorm
          args:
            n_embed: 768
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 50257
        n_embed: 768
        perform_transform: false
        act_fn_config: null
        ln_config: null