AlibiForCausalLM(
  (model): AlibiModel(
    (embeddings): Embedding(50277, 256)
    (layers): ModuleList(
      (0-1): 2 x TransformerBlock(
        (attn_norm): RMSNorm(256, eps=1e-06)
        (attn): Attention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=256, bias=False)
          (v_proj): Linear(in_features=256, out_features=256, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (mlp_norm): RMSNorm(256, eps=1e-06)
        (mlp): TransformerMLP(
          (gate_proj): Linear(in_features=256, out_features=1536, bias=False)
          (down_proj): Linear(in_features=768, out_features=256, bias=False)
          (act_fn): SiLU()
        )
      )
    )
    (norm): RMSNorm(256, eps=1e-06)
  )
  (lm_head): Linear(in_features=256, out_features=50277, bias=False)
)
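The printout shows only the module tree, so the forward logic in the sketch below is an assumption: a standard pre-norm decoder with ALiBi attention biases (suggested by the model name) and a fused gate/up projection in the MLP, which would explain why gate_proj emits 1536 features while down_proj consumes 768 (1536 = 2 × 768, split into a SiLU gate half and an up half). The vocabulary size 50277 matches the GPT-NeoX tokenizer; the head count (4 here) and everything not named in the repr are hypothetical choices for illustration.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


def alibi_slopes(num_heads: int) -> torch.Tensor:
    # Geometric slopes from the ALiBi paper: 2^(-8/n), 2^(-16/n), ... for n heads.
    start = 2.0 ** (-8.0 / num_heads)
    return torch.tensor([start ** (i + 1) for i in range(num_heads)])


class Attention(nn.Module):
    def __init__(self, hidden: int = 256, num_heads: int = 4):  # head count: assumption
        super().__init__()
        self.num_heads, self.head_dim = num_heads, hidden // num_heads
        self.q_proj = nn.Linear(hidden, hidden, bias=False)
        self.k_proj = nn.Linear(hidden, hidden, bias=False)
        self.v_proj = nn.Linear(hidden, hidden, bias=False)
        self.o_proj = nn.Linear(hidden, hidden, bias=False)
        self.register_buffer("slopes", alibi_slopes(num_heads), persistent=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, _ = x.shape
        shape = (b, t, self.num_heads, self.head_dim)
        q = self.q_proj(x).view(shape).transpose(1, 2)
        k = self.k_proj(x).view(shape).transpose(1, 2)
        v = self.v_proj(x).view(shape).transpose(1, 2)
        # ALiBi adds -slope * (query_pos - key_pos) to the scores; future keys get -inf.
        pos = torch.arange(t, device=x.device)
        bias = self.slopes.view(-1, 1, 1).to(x.dtype) * (pos[None, :] - pos[:, None])
        bias = bias.masked_fill(pos[None, :] > pos[:, None], float("-inf"))
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=bias)
        return self.o_proj(out.transpose(1, 2).reshape(b, t, -1))


class TransformerMLP(nn.Module):
    # gate_proj fuses the gate and up projections (256 -> 2 * 768), which is why
    # down_proj takes 768 inputs: the 1536 outputs are split in half below.
    def __init__(self, hidden: int = 256, intermediate: int = 768):
        super().__init__()
        self.gate_proj = nn.Linear(hidden, 2 * intermediate, bias=False)
        self.down_proj = nn.Linear(intermediate, hidden, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate, up = self.gate_proj(x).chunk(2, dim=-1)
        return self.down_proj(self.act_fn(gate) * up)


class TransformerBlock(nn.Module):
    def __init__(self, hidden: int = 256):
        super().__init__()
        self.attn_norm = nn.RMSNorm(hidden, eps=1e-6)  # nn.RMSNorm needs PyTorch >= 2.4
        self.attn = Attention(hidden)
        self.mlp_norm = nn.RMSNorm(hidden, eps=1e-6)
        self.mlp = TransformerMLP(hidden)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.attn_norm(x))  # pre-norm residuals (assumed)
        return x + self.mlp(self.mlp_norm(x))


class AlibiModel(nn.Module):
    def __init__(self, vocab: int = 50277, hidden: int = 256, n_layers: int = 2):
        super().__init__()
        self.embeddings = nn.Embedding(vocab, hidden)
        self.layers = nn.ModuleList(TransformerBlock(hidden) for _ in range(n_layers))
        self.norm = nn.RMSNorm(hidden, eps=1e-6)

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        x = self.embeddings(ids)
        for layer in self.layers:
            x = layer(x)
        return self.norm(x)


class AlibiForCausalLM(nn.Module):
    def __init__(self, vocab: int = 50277, hidden: int = 256, n_layers: int = 2):
        super().__init__()
        self.model = AlibiModel(vocab, hidden, n_layers)
        self.lm_head = nn.Linear(hidden, vocab, bias=False)

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        return self.lm_head(self.model(ids))
```

`print(AlibiForCausalLM())` reproduces a module tree like the one above (nn.RMSNorm's repr prints its shape tuple and elementwise_affine flag, so those lines differ cosmetically), and a forward pass over a (batch, seq) tensor of token ids returns (batch, seq, 50277) logits.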