from transformers import PretrainedConfig


class UltraBaseConfig(PretrainedConfig):
    """Hyperparameter container registered under the ``"ultrabase"`` model type.

    The constructor forwards ``bos_token_id``, ``eos_token_id``,
    ``tie_word_embeddings`` and any extra keyword arguments to
    ``PretrainedConfig.__init__`` and stores every remaining argument
    unchanged as an attribute of the same name. No validation is performed.

    Args:
        vocab_size: Stored as ``self.vocab_size``; presumably the tokenizer
            vocabulary size -- confirm against the model code.
        d_model: Stored as ``self.d_model``; presumably the hidden width.
        n_layers: Stored as ``self.n_layers``; presumably the layer count.
        n_heads: Stored as ``self.n_heads``; presumably the number of
            attention heads.
        latent_dim: Stored as ``self.latent_dim``; meaning is defined by the
            model code, which is not visible here.
        head_dim: Stored as ``self.head_dim``; presumably the per-head width.
        bypass_rate: Stored as ``self.bypass_rate``; meaning is defined by
            the model code, which is not visible here.
        num_private_experts: Stored as ``self.num_private_experts``.
        num_shared_experts: Stored as ``self.num_shared_experts``.
        d_ff: Stored as ``self.d_ff``; presumably the feed-forward width.
        bos_token_id: Forwarded to the base class.
        eos_token_id: Forwarded to the base class.
        tie_word_embeddings: Forwarded to the base class.
        **kwargs: Forwarded untouched to the base class.
    """

    model_type = "ultrabase"

    def __init__(
        self,
        vocab_size=49152,
        d_model=256,
        n_layers=16,
        n_heads=12,
        latent_dim=64,
        head_dim=16,
        bypass_rate=0.375,
        num_private_experts=6,
        num_shared_experts=1,
        d_ff=256,
        bos_token_id=0,
        eos_token_id=0,
        tie_word_embeddings=True,
        **kwargs
    ):
        # NOTE(review): the BOS and EOS ids share the same default (0) --
        # confirm this matches the tokenizer in use.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )

        # Vocabulary and overall model size.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers

        # Attention-related sizes.
        # NOTE(review): with the defaults, n_heads * head_dim = 192, which is
        # not equal to d_model = 256 -- confirm the model handles this.
        self.n_heads = n_heads
        self.latent_dim = latent_dim
        self.head_dim = head_dim

        # Remaining hyperparameters; their use is in the model code.
        self.bypass_rate = bypass_rate
        self.num_private_experts = num_private_experts
        self.num_shared_experts = num_shared_experts
        self.d_ff = d_ff