from transformers import PretrainedConfig
class UltraBaseConfig(PretrainedConfig):
    """Configuration container for the ``ultrabase`` model type.

    The constructor forwards the token ids, the embedding-tying flag and any
    extra keyword arguments to ``PretrainedConfig.__init__``, then stores the
    architecture hyper-parameters below as plain instance attributes.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        d_model: Stored as ``self.d_model``.
        n_layers: Stored as ``self.n_layers``.
        n_heads: Stored as ``self.n_heads``.
        latent_dim: Stored as ``self.latent_dim`` (presumably a compressed
            attention/latent width -- confirm against the model code).
        head_dim: Stored as ``self.head_dim``.
        bypass_rate: Stored as ``self.bypass_rate`` (semantics are defined by
            the model code, not visible here).
        num_private_experts: Stored as ``self.num_private_experts``
            (presumably per-token routed experts -- TODO confirm).
        num_shared_experts: Stored as ``self.num_shared_experts``
            (presumably always-active experts -- TODO confirm).
        d_ff: Stored as ``self.d_ff``.
        bos_token_id: Forwarded to ``PretrainedConfig``.
        eos_token_id: Forwarded to ``PretrainedConfig``.
        tie_word_embeddings: Forwarded to ``PretrainedConfig``.
        **kwargs: Any remaining options, forwarded to ``PretrainedConfig``.
    """

    model_type = "ultrabase"

    def __init__(
        self,
        vocab_size: int = 49152,
        d_model: int = 256,
        n_layers: int = 16,
        n_heads: int = 12,
        latent_dim: int = 64,
        head_dim: int = 16,
        bypass_rate: float = 0.375,
        num_private_experts: int = 6,
        num_shared_experts: int = 1,
        d_ff: int = 256,
        bos_token_id: int = 0,
        eos_token_id: int = 0,
        tie_word_embeddings: bool = True,
        **kwargs
    ) -> None:
        # The base class handles the generic options (special token ids,
        # weight tying, and everything passed through **kwargs).
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )

        # Vocabulary / overall model size.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers

        # Attention-related sizes.
        # NOTE(review): with the defaults, n_heads * head_dim == 192, which is
        # not equal to d_model == 256; no relationship is enforced here, so
        # confirm the model code does not assume one.
        self.n_heads = n_heads
        self.latent_dim = latent_dim
        self.head_dim = head_dim

        self.bypass_rate = bypass_rate

        # Expert / feed-forward sizes.
        self.num_private_experts = num_private_experts
        self.num_shared_experts = num_shared_experts
        self.d_ff = d_ff