from transformers import PretrainedConfig class TinyGPTConfig(PretrainedConfig): model_type = "basemini" def __init__( self, vocab_size=32768, ctx_len=512, n_layer=4, n_head=4, n_embd=384, dropout=0.0, attention_backend="torch", torch_fallback=False, pad_token_id=None, bos_token_id=None, eos_token_id=None, sep_token_id=None, unk_token_id=None, **kwargs, ): super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, sep_token_id=sep_token_id, unk_token_id=unk_token_id, **kwargs, ) if attention_backend not in ("sage", "torch", "flash2", "flash3"): raise ValueError("attention_backend must be sage, torch, flash2 or flash3") self.vocab_size = int(vocab_size) self.ctx_len = int(ctx_len) self.max_position_embeddings = int(ctx_len) self.n_layer = int(n_layer) self.n_head = int(n_head) self.n_embd = int(n_embd) self.num_hidden_layers = int(n_layer) self.num_attention_heads = int(n_head) self.hidden_size = int(n_embd) self.dropout = float(dropout) self.attention_backend = str(attention_backend) self.available_attention_backends = ["sage", "torch", "flash2", "flash3"] self.torch_fallback = bool(torch_fallback)