Lanni-ni's picture
add remote code + model files
c65d6a9 verified
ForgettingTransformerForCausalLM(
(model): ForgettingTransformerModel(
(embeddings): Embedding(50277, 384)
(layers): ModuleList(
(0-3): 4 x ForgettingTransformerBlock(
(attn_norm): RMSNorm(384, eps=1e-06)
(attn): ForgettingAttentionLayer(
(q_proj): Linear(in_features=384, out_features=384, bias=False)
(k_proj): Linear(in_features=384, out_features=384, bias=False)
(v_proj): Linear(in_features=384, out_features=384, bias=False)
(o_proj): Linear(in_features=384, out_features=384, bias=False)
(fgate_proj): Linear(in_features=384, out_features=6, bias=True)
)
(mlp_norm): RMSNorm(384, eps=1e-06)
(mlp): ForgettingTransformerMLP(
(gate_proj): Linear(in_features=384, out_features=2048, bias=False)
(down_proj): Linear(in_features=1024, out_features=384, bias=False)
(act_fn): SiLU()
)
)
)
(norm): RMSNorm(384, eps=1e-06)
)
(lm_head): Linear(in_features=384, out_features=50277, bias=False)
)