# Slopiest-49M / model.py
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig, GenerationMixin
from transformers.modeling_outputs import CausalLMOutput
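
# Decoder-only transformer language model wrapped in the Hugging Face
# PreTrainedModel / GenerationMixin API. With the default config (6 layers,
# 6 heads, 384-dim embeddings, a 256-token context, and a GPT-2-sized
# vocabulary) the model comes out at roughly 49M parameters.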


class SlopiestConfig(PretrainedConfig):
    model_type = "slopiest"

    def __init__(self, vocab_size=50257, embeddings_size=384,
                 head_size=6, layer_size=6, block_size=256, **kwargs):
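        # Note: despite the names, head_size is the *number* of attention
        # heads (not the per-head dimension) and layer_size is the number
        # of transformer layers.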
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embeddings_size = embeddings_size
        self.head_size = head_size
        self.layer_size = layer_size
        self.block_size = block_size
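        # Mirror the custom names onto the standard HF config attribute
        # names so generic tooling can introspect the architecture.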
        self.hidden_size = embeddings_size
        self.num_hidden_layers = layer_size
        self.num_attention_heads = head_size
        self.max_position_embeddings = block_size


class SlopiestForCausalLM(PreTrainedModel, GenerationMixin):
    config_class = SlopiestConfig

    def __init__(self, config):
        super().__init__(config)
        self.token_embeddings = nn.Embedding(config.vocab_size, config.embeddings_size)
        self.positional_embeddings = nn.Embedding(config.block_size, config.embeddings_size)
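        # nn.TransformerEncoder is reused here as a decoder-only stack;
        # autoregressive behavior comes from the causal mask in forward().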
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.embeddings_size,
            nhead=config.head_size,
            dim_feedforward=config.embeddings_size * 4,
            batch_first=True,
            norm_first=True,
            activation="gelu",
            bias=False,
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=config.layer_size,
            norm=nn.LayerNorm(config.embeddings_size),
        )
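        # The output projection is untied from the input embeddings; those
        # two 50257 x 384 matrices alone account for ~38M of the ~49M
        # total parameters.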
        self.lm_head = nn.Linear(config.embeddings_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(self, input_ids, labels=None, **kwargs):
        _, seq_len = input_ids.shape
        # Learned positions only cover block_size slots; fail loudly rather
        # than letting the embedding lookup raise an opaque IndexError.
        if seq_len > self.config.block_size:
            raise ValueError(f"Sequence length {seq_len} exceeds block_size {self.config.block_size}")
        tok_emb = self.token_embeddings(input_ids)
        pos = torch.arange(seq_len, device=input_ids.device)
        pos_emb = self.positional_embeddings(pos)
        x = tok_emb + pos_emb
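        # Additive float causal mask (-inf above the diagonal); is_causal=True
        # additionally hints PyTorch toward the fused causal attention path.
        # Note: any padding attention_mask from a tokenizer is ignored here,
        # since forward() swallows it in **kwargs.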
        mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=input_ids.device)
        x = self.transformer(x, mask=mask, is_causal=True)
        logits = self.lm_head(x)
        loss = None
        if labels is not None:
            # Standard causal-LM objective: position t predicts token t+1,
            # so shift logits left and labels right before the loss.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )
        return CausalLMOutput(loss=loss, logits=logits)
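

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file). Assumptions:
# the GPT-2 tokenizer is inferred from vocab_size=50257, and a transformers
# version recent enough that GenerationMixin supplies a default
# prepare_inputs_for_generation() is assumed. A freshly constructed model has
# random weights, so real use would load the trained checkpoint via
# SlopiestForCausalLM.from_pretrained(...) instead.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoTokenizer

    config = SlopiestConfig()
    model = SlopiestForCausalLM(config)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed tokenizer
    inputs = tokenizer("Once upon a time", return_tensors="pt")

    # Training-style call: labels == input_ids; forward() shifts internally.
    out = model(inputs.input_ids, labels=inputs.input_ids)
    print(f"loss: {out.loss.item():.3f}")

    # Sampling via the inherited GenerationMixin.generate().
    with torch.no_grad():
        tokens = model.generate(
            inputs.input_ids,
            max_new_tokens=20,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    print(tokenizer.decode(tokens[0], skip_special_tokens=True))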