from transformers import PretrainedConfig


class SykoConfig(PretrainedConfig):
    """Configuration class for SykoLLM; stores the model's architecture hyperparameters."""

    model_type = "sykollm"

    def __init__(
        self,
        vocab_size=32000,           # tokenizer vocabulary size
        d_model=768,                # hidden / embedding dimension
        n_layers=24,                # number of transformer blocks
        n_heads=6,                  # attention heads per layer
        num_memory_tokens=16,
        num_global_memory_tokens=32,
        intermediate_size=3072,     # feed-forward (MLP) hidden size
        chunk_size=128,
        context_size=1024,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.num_memory_tokens = num_memory_tokens
        self.num_global_memory_tokens = num_global_memory_tokens
        self.intermediate_size = intermediate_size
        self.chunk_size = chunk_size
        self.context_size = context_size
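# A minimal usage sketch, assuming SykoConfig above is importable; the checkpoint
# directory name and the overridden values are illustrative, not from the original.
from transformers import AutoConfig

config = SykoConfig(n_layers=12, context_size=2048)        # override a couple of defaults
config.save_pretrained("./syko-checkpoint")                # writes config.json to disk
reloaded = SykoConfig.from_pretrained("./syko-checkpoint") # round-trips the same values
assert reloaded.n_layers == 12

# Optionally register the class so AutoConfig can resolve model_type="sykollm".
AutoConfig.register("sykollm", SykoConfig)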