import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
from transformers.modeling_outputs import CausalLMOutput


class SlopiestConfig(PretrainedConfig):
    model_type = "slopiest"

    def __init__(self, vocab_size=50257, embeddings_size=384,
                 head_size=6, layer_size=6, block_size=256, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embeddings_size = embeddings_size
        self.head_size = head_size
        self.layer_size = layer_size
        self.block_size = block_size
        # Aliases under the standard HF attribute names, so generic
        # utilities (generation, model summaries) can find them.
        self.hidden_size = embeddings_size
        self.num_hidden_layers = layer_size
        self.num_attention_heads = head_size
        self.max_position_embeddings = block_size
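
# Illustrative config round-trip through the standard save_pretrained /
# from_pretrained API ("slopiest-ckpt" is a hypothetical directory name);
# custom fields like embeddings_size survive serialization:
#
#   cfg = SlopiestConfig(embeddings_size=512, head_size=8)
#   cfg.save_pretrained("slopiest-ckpt")
#   assert SlopiestConfig.from_pretrained("slopiest-ckpt").embeddings_size == 512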


class SlopiestForCausalLM(PreTrainedModel, GenerationMixin):
    config_class = SlopiestConfig

    def __init__(self, config):
        super().__init__(config)
        self.token_embeddings = nn.Embedding(config.vocab_size, config.embeddings_size)
        self.positional_embeddings = nn.Embedding(config.block_size, config.embeddings_size)
        # Decoder-only stack built from encoder layers plus a causal mask;
        # norm_first selects the pre-LN layout, and bias=False requires
        # PyTorch >= 2.1.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.embeddings_size,
            nhead=config.head_size,
            dim_feedforward=config.embeddings_size * 4,
            batch_first=True,
            norm_first=True,
            activation="gelu",
            bias=False,
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=config.layer_size,
            norm=nn.LayerNorm(config.embeddings_size),
        )
        self.lm_head = nn.Linear(config.embeddings_size, config.vocab_size, bias=False)
        # Initializes weights and runs the usual HF post-init bookkeeping.
        self.post_init()

    def forward(self, input_ids, labels=None, **kwargs):
        batch, seq_len = input_ids.shape
        tok_emb = self.token_embeddings(input_ids)
        pos = torch.arange(seq_len, device=input_ids.device)
        pos_emb = self.positional_embeddings(pos)
        x = tok_emb + pos_emb
        # Additive causal mask (-inf above the diagonal) so position t attends
        # only to positions <= t.
        mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=input_ids.device)
        x = self.transformer(x, mask=mask, is_causal=True)
        logits = self.lm_head(x)
        loss = None
        if labels is not None:
            # Standard causal-LM shift: the logits at position t predict the
            # token at position t + 1, so drop the last logit and first label.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
        return CausalLMOutput(loss=loss, logits=logits)
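

# Minimal smoke test, a sketch assuming a transformers version recent enough
# that inheriting GenerationMixin supplies a default
# prepare_inputs_for_generation (roughly v4.44+); the random input_ids stand
# in for real tokenizer output.
if __name__ == "__main__":
    config = SlopiestConfig()
    model = SlopiestForCausalLM(config)
    input_ids = torch.randint(0, config.vocab_size, (1, 16))

    # Training-style call: labels are the unshifted input_ids; the model
    # shifts them internally in forward().
    out = model(input_ids, labels=input_ids)
    print(f"loss: {out.loss.item():.3f}")

    # Greedy decoding through GenerationMixin; no KV cache is implemented,
    # so the full sequence is re-encoded at each step.
    generated = model.generate(input_ids, max_new_tokens=20, do_sample=False)
    print(generated.shape)  # torch.Size([1, 36])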