```yaml
model:
  name: llama
  architecture: transformer
  vocab_size: 32000
  hidden_size: 4096
  num_attention_heads: 32
  num_hidden_layers: 32
  intermediate_size: 11008
  activation_function: swiglu
  max_position_embeddings: 2048
  initializer_range: 0.02
  layer_norm_eps: 1e-5
  pad_token_id: 0
  bos_token_id: 1
  eos_token_id: 2
  tie_word_embeddings: false
  rotary_embedding_base: 10000
  attention_dropout: 0.0
  hidden_dropout: 0.0
  use_cache: true
```
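A few of these fields constrain each other: `hidden_size` must divide evenly by `num_attention_heads` (4096 / 32 = 128 dimensions per head), and `intermediate_size` sets the width of the SwiGLU feed-forward block. Below is a minimal sketch, not from the source, of loading and sanity-checking the config with PyYAML; the filename `config.yaml` is an assumption for illustration.

```python
# Sketch: load the model config and verify derived quantities.
# Assumes the YAML above is saved as "config.yaml" (hypothetical name)
# and that PyYAML is installed.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)["model"]

# Each attention head attends over hidden_size / num_attention_heads
# dimensions: 4096 / 32 = 128 for this config.
head_dim, rem = divmod(cfg["hidden_size"], cfg["num_attention_heads"])
assert rem == 0, "hidden_size must be divisible by num_attention_heads"

# The SwiGLU feed-forward block expands to intermediate_size and back:
# 11008 / 4096 ≈ 2.69x expansion here.
ffn_ratio = cfg["intermediate_size"] / cfg["hidden_size"]

print(f"head_dim = {head_dim}")          # -> 128
print(f"ffn expansion = {ffn_ratio:.2f}") # -> 2.69
```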