import torch
import torch.nn as nn
from transformers import GenerationMixin, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput


class SlopiestConfig(PretrainedConfig):
    model_type = "slopiest"

    def __init__(
        self,
        vocab_size=50257,
        embeddings_size=384,
        head_size=6,
        layer_size=6,
        block_size=256,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embeddings_size = embeddings_size
        self.head_size = head_size
        self.layer_size = layer_size
        self.block_size = block_size
        # Aliases expected by common Hugging Face tooling.
        self.hidden_size = embeddings_size
        self.num_hidden_layers = layer_size
        self.num_attention_heads = head_size
        self.max_position_embeddings = block_size


class SlopiestForCausalLM(PreTrainedModel, GenerationMixin):
    config_class = SlopiestConfig

    def __init__(self, config):
        super().__init__(config)
        self.token_embeddings = nn.Embedding(config.vocab_size, config.embeddings_size)
        # Learned absolute positions; sequence length is capped at block_size.
        self.positional_embeddings = nn.Embedding(config.block_size, config.embeddings_size)
        # Pre-norm ("norm_first") encoder layers serve as a decoder-only
        # stack once the causal mask is applied in forward().
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.embeddings_size,
            nhead=config.head_size,
            dim_feedforward=config.embeddings_size * 4,
            batch_first=True,
            norm_first=True,
            activation="gelu",
            bias=False,
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=config.layer_size,
            norm=nn.LayerNorm(config.embeddings_size),
        )
        self.lm_head = nn.Linear(config.embeddings_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(self, input_ids, labels=None, **kwargs):
        # Padding/attention masks arriving via **kwargs are ignored; this
        # minimal model assumes unpadded (or batch-size-1) inputs.
        batch, seq_len = input_ids.shape
        tok_emb = self.token_embeddings(input_ids)
        pos = torch.arange(seq_len, device=input_ids.device)
        pos_emb = self.positional_embeddings(pos)
        x = tok_emb + pos_emb
        # Additive float mask (-inf above the diagonal) enforces causality;
        # is_causal=True is a hint that lets PyTorch pick a fast path.
        mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=input_ids.device)
        x = self.transformer(x, mask=mask, is_causal=True)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Hugging Face convention: labels == input_ids and the model
            # shifts internally, so position t predicts token t + 1.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )
        return CausalLMOutput(loss=loss, logits=logits)
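

# --- Usage sketch (illustrative, not part of the model definition) ---
# A minimal smoke test, assuming the GPT-2 tokenizer to match the default
# vocab_size of 50257. The Auto-class registration uses the public
# AutoConfig / AutoModelForCausalLM register API; the model is untrained
# here, so the sampled continuation will be random tokens.
if __name__ == "__main__":
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    # Register so AutoModelForCausalLM.from_pretrained can resolve the
    # "slopiest" model_type from a saved config.
    AutoConfig.register("slopiest", SlopiestConfig)
    AutoModelForCausalLM.register(SlopiestConfig, SlopiestForCausalLM)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = SlopiestForCausalLM(SlopiestConfig())

    inputs = tokenizer("Hello world", return_tensors="pt")
    # GenerationMixin supplies generate(); with no KV cache implemented,
    # the full forward pass is recomputed at every decoding step.
    out = model.generate(inputs.input_ids, max_new_tokens=16, do_sample=True)
    print(tokenizer.decode(out[0]))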