from transformers import PretrainedConfig


class QuarkConfig(PretrainedConfig):
    """Configuration container for the "quark" model type.

    Every constructor argument is stored unchanged as an instance
    attribute of the same name.  Keyword arguments that are not listed
    below are forwarded to ``PretrainedConfig.__init__``.

    Args:
        vocab_size: Stored as ``self.vocab_size`` (default 65537).
        d_model: Stored as ``self.d_model`` (default 768).
        n_heads: Stored as ``self.n_heads`` (default 12).
        n_kv_heads: Stored as ``self.n_kv_heads`` (default 4).
            NOTE(review): presumably the key/value head count for
            grouped-query attention -- confirm against the model code.
        n_layers: Stored as ``self.n_layers`` (default 32).
        d_ff: Stored as ``self.d_ff`` (default 2048).
        head_dim: Stored as ``self.head_dim`` (default 64).
        max_seq_len: Stored as ``self.max_seq_len`` (default 2048).
        rope_theta: Stored as ``self.rope_theta`` (default 10000.0).
        rms_eps: Stored as ``self.rms_eps`` (default 1e-5).
        qkv_bias: Stored as ``self.qkv_bias`` (default True).
        dropout: Stored as ``self.dropout`` (default 0.0).
        **kwargs: Passed through to the base-class constructor.
    """

    model_type = "quark"

    def __init__(
        self,
        vocab_size=65537,
        d_model=768,
        n_heads=12,
        n_kv_heads=4,
        n_layers=32,
        d_ff=2048,
        head_dim=64,
        max_seq_len=2048,
        rope_theta=10000.0,
        rms_eps=1e-5,
        qkv_bias=True,
        dropout=0.0,
        **kwargs,
    ):
        # Hyperparameters are set before the base-class constructor runs,
        # matching the original statement order.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.n_layers = n_layers
        self.d_ff = d_ff
        self.head_dim = head_dim
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.rms_eps = rms_eps
        self.qkv_bias = qkv_bias
        self.dropout = dropout

        # Remaining keyword arguments are handled by PretrainedConfig.
        super().__init__(**kwargs)