model:
  name: llama
  architecture: transformer
  vocab_size: 32000
  hidden_size: 4096
  num_attention_heads: 32
  num_hidden_layers: 32
  intermediate_size: 11008
  activation_function: swiglu
  max_position_embeddings: 2048
  initializer_range: 0.02
  layer_norm_eps: 1e-5
  pad_token_id: 0
  bos_token_id: 1
  eos_token_id: 2
  tie_word_embeddings: false
  rotary_embedding_base: 10000
  attention_dropout: 0.0
  hidden_dropout: 0.0
  use_cache: true
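The sketch below shows one way to load and sanity-check this configuration in Python. It assumes the block above is saved as config.yaml and that PyYAML is installed; the file name and variable names are illustrative, not part of the config itself. It derives the per-head dimension (hidden_size / num_attention_heads = 4096 / 32 = 128), which must be an integer for the attention layers to be constructible.

# A minimal sketch, assuming the config above is stored in "config.yaml"
# (hypothetical file name) and PyYAML is available.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)["model"]

hidden_size = cfg["hidden_size"]          # 4096
num_heads = cfg["num_attention_heads"]    # 32

# hidden_size must split evenly across attention heads
assert hidden_size % num_heads == 0
head_dim = hidden_size // num_heads       # 4096 / 32 = 128

# Note: PyYAML parses the bare exponent "1e-5" as a string, so coerce it explicitly.
layer_norm_eps = float(cfg["layer_norm_eps"])

print(f"head_dim={head_dim}, layers={cfg['num_hidden_layers']}, eps={layer_norm_eps}")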