from transformers import PretrainedConfig


class DwarfConfig(PretrainedConfig):
    """Configuration container for the "dwarf" model type.

    Every constructor argument is stored unchanged as an instance attribute
    of the same name. Four of them are additionally stored under a second
    attribute name:

    ===================  =====================
    Native attribute     Alias attribute
    ===================  =====================
    n_layers             num_hidden_layers
    d_model              hidden_size
    n_heads              num_attention_heads
    n_kv_heads           num_key_value_heads
    ===================  =====================

    The aliases are plain copies taken at construction time; this class does
    nothing to keep them in sync if either side is reassigned afterwards.

    NOTE(review): the parameter descriptions below are inferred from the
    parameter names only -- confirm against the model implementation.

    Args:
        vocab_size: Presumably the number of entries in the token vocabulary.
        d_model: Presumably the width of the hidden representation.
        n_layers: Presumably the number of transformer blocks.
        n_heads: Presumably the number of attention (query) heads.
        n_kv_heads: Presumably the number of key/value heads.
        d_ff: Presumably the feed-forward inner dimension.
        max_seq_len: Presumably the maximum supported sequence length.
        rope_theta: Presumably the base used for rotary position embeddings.
        norm_eps: Presumably the epsilon used by normalization layers.
        head_dim: Presumably the per-head dimension. It is stored as given
            and is NOT derived from ``d_model`` and ``n_heads`` (with the
            defaults, 320 / 5 == 64 happens to match).
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
    """

    model_type = "dwarf"

    def __init__(
        self,
        vocab_size: int = 8202,
        d_model: int = 320,
        n_layers: int = 12,
        n_heads: int = 5,
        n_kv_heads: int = 1,
        d_ff: int = 864,
        max_seq_len: int = 2048,
        rope_theta: float = 10000.0,
        norm_eps: float = 1e-5,
        head_dim: int = 64,
        **kwargs,
    ) -> None:
        # Native hyper-parameter names.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.d_ff = d_ff
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.norm_eps = norm_eps
        self.head_dim = head_dim

        # Second set of names holding the same values as four of the
        # attributes above (copied once, here).
        self.num_hidden_layers = n_layers
        self.hidden_size = d_model
        self.num_attention_heads = n_heads
        self.num_key_value_heads = n_kv_heads

        # The base-class initializer runs last, after all attributes above
        # have been assigned, and receives only the leftover keyword args.
        super().__init__(**kwargs)