| """ |
modeling_randygpt.py — PyTorch implementation of the randyGPT architecture.
| |
| Matches the Rust forward pass exactly: |
| - RMSNorm with no learnable parameters (x / rms(x)) |
| - Multi-head causal self-attention (no bias, scaled dot-product) |
- MLP with squared-ReLU activation and 4× expansion
| - No final layer norm before lm_head |
| - Learned token + position embeddings |
| |
| Compatible with HuggingFace transformers via PreTrainedModel/PretrainedConfig. |
| |
| Usage (standalone): |
| from modeling_randygpt import RandyGPTConfig, RandyGPTForCausalLM |
| from safetensors.torch import load_file |
| |
| cfg = RandyGPTConfig.from_pretrained("path/to/model") |
| model = RandyGPTForCausalLM(cfg) |
| model.load_state_dict(load_file("path/to/model.safetensors"), strict=True) |
| model.eval() |
| |
| Usage (HuggingFace Hub): |
| model = RandyGPTForCausalLM.from_pretrained("username/randygpt-s") |
| """ |
|
|
| import math |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from transformers import PretrainedConfig, PreTrainedModel |
| from transformers.modeling_outputs import CausalLMOutput |
|
|
|
|
| |
|
|
class RandyGPTConfig(PretrainedConfig):
    """Configuration for the randyGPT architecture.

    Args:
        vocab_size: Size of the token vocabulary.
        n_embd: Embedding / hidden dimension.
        n_head: Number of attention heads; must divide ``n_embd`` evenly.
        n_layer: Number of transformer blocks.
        block_size: Maximum sequence length (size of the position-embedding table).
        bos_token_id: Beginning-of-sequence token id, forwarded to HF base config.
        eos_token_id: End-of-sequence token id, forwarded to HF base config.

    Raises:
        ValueError: if ``n_embd`` is not divisible by ``n_head``.
    """

    model_type = "randygpt"

    def __init__(
        self,
        vocab_size: int = 1500,
        n_embd: int = 128,
        n_head: int = 4,
        n_layer: int = 8,
        block_size: int = 256,
        bos_token_id: int = 0,
        eos_token_id: int = 1,
        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        if n_embd % n_head != 0:
            # Integer division below would silently truncate head_dim and
            # desynchronize the checkpoint shapes — fail loudly instead.
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.block_size = block_size
        self.head_dim = n_embd // n_head
        self.mlp_dim = 4 * n_embd  # fixed 4x expansion, matching the Rust MLP
|
|
|
|
| |
|
|
def rmsnorm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """Parameter-free RMS normalization: x / sqrt(mean(x^2) + eps).

    No learnable scale; matches the Rust rmsnorm_fwd exactly.
    """
    mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(mean_sq + eps)
|
|
|
|
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention, no biases.

    Projects the input into per-head queries/keys/values, applies scaled
    dot-product attention under a strict causal mask (no peeking at future
    positions), and mixes the heads back with an output projection.
    """

    def __init__(self, cfg: RandyGPTConfig):
        super().__init__()
        self.n_head = cfg.n_head
        self.head_dim = cfg.head_dim
        self.n_embd = cfg.n_embd
        self.scale = 1.0 / math.sqrt(cfg.head_dim)

        # Four separate projections (no fused QKV), mirroring the checkpoint layout.
        self.wq = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wk = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wv = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
        self.wo = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch, seq_len, dim = x.shape

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        q = split_heads(self.wq(x))
        k = split_heads(self.wk(x))
        v = split_heads(self.wv(x))

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        # Additive mask: -inf strictly above the diagonal blocks future positions.
        neg_inf = torch.full((seq_len, seq_len), float('-inf'), device=x.device).triu(1)
        scores = scores + neg_inf

        weights = F.softmax(scores, dim=-1)
        context = torch.matmul(weights, v)
        context = context.transpose(1, 2).contiguous().view(batch, seq_len, dim)
        return self.wo(context)
|
|
|
|
class MLP(nn.Module):
    """Position-wise feed-forward network: Linear -> squared ReLU -> Linear.

    Uses a 4x hidden expansion (cfg.mlp_dim) and no biases.
    """

    def __init__(self, cfg: RandyGPTConfig):
        super().__init__()
        self.fc1 = nn.Linear(cfg.n_embd, cfg.mlp_dim, bias=False)
        self.fc2 = nn.Linear(cfg.mlp_dim, cfg.n_embd, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.fc1(x)
        hidden = torch.square(F.relu(hidden))  # squared-ReLU activation
        return self.fc2(hidden)
|
|
|
|
class TransformerBlock(nn.Module):
    """One pre-norm residual block: attention sublayer, then MLP sublayer.

    The parameter-free RMSNorm is applied to each sublayer's *input* only;
    the residual stream itself is never normalized.
    """

    def __init__(self, cfg: RandyGPTConfig):
        super().__init__()
        self.attn = CausalSelfAttention(cfg)
        self.mlp = MLP(cfg)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(rmsnorm(x))
        return x + self.mlp(rmsnorm(x))
|
|
|
|
| |
|
|
class RandyGPTForCausalLM(PreTrainedModel):
    """randyGPT causal language model.

    Learned token + position embeddings, a stack of pre-norm transformer
    blocks, and an (untied) linear lm_head. There is deliberately no final
    layer norm before the head, matching the Rust forward pass.
    """

    config_class = RandyGPTConfig

    def __init__(self, cfg: RandyGPTConfig):
        super().__init__(cfg)
        self.wte = nn.Embedding(cfg.vocab_size, cfg.n_embd)  # token embeddings
        self.wpe = nn.Embedding(cfg.block_size, cfg.n_embd)  # learned position embeddings
        self.layers = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layer)])
        self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)

    def forward(
        self,
        input_ids: torch.Tensor,
        labels: "torch.Tensor | None" = None,
        **kwargs,
    ) -> CausalLMOutput:
        """Run the forward pass.

        Args:
            input_ids: (B, T) int64 token ids with T <= config.block_size.
            labels: optional (B, T) targets for cross-entropy; positions set
                to -100 are ignored. Labels are used as-is (no internal
                shift) — callers are expected to pre-shift them.

        Returns:
            CausalLMOutput with ``logits`` of shape (B, T, vocab_size) and
            ``loss`` when labels were given.

        Raises:
            ValueError: if the sequence is longer than block_size.
        """
        B, T = input_ids.shape
        # A real exception instead of `assert`, so the check survives `python -O`.
        if T > self.config.block_size:
            raise ValueError(
                f"Sequence length {T} > block_size {self.config.block_size}"
            )

        pos = torch.arange(T, device=input_ids.device).unsqueeze(0)
        x = self.wte(input_ids) + self.wpe(pos)

        for block in self.layers:
            x = block(x)

        # No final norm here — intentional, see module docstring.
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(
                logits.view(-1, self.config.vocab_size),
                labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutput(loss=loss, logits=logits)

    @torch.no_grad()
    def generate_text(
        self,
        prompt_ids: torch.Tensor,
        max_new_tokens: int = 200,
        temperature: float = 0.8,
        top_p: float = 0.9,
    ) -> torch.Tensor:
        """Autoregressive top-p (nucleus) sampling.

        Args:
            prompt_ids: (B, T) prompt token ids; any batch size is supported.
            max_new_tokens: number of tokens to append.
            temperature: logit divisor; lower values are greedier.
            top_p: nucleus mass — the smallest set of top tokens whose
                cumulative probability reaches top_p is kept.

        Returns:
            (B, T + max_new_tokens) tensor including the prompt.
        """
        self.eval()
        ids = prompt_ids.clone()

        for _ in range(max_new_tokens):
            ctx = ids[:, -self.config.block_size:]
            logits = self(ctx).logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            sorted_probs, sorted_idx = torch.sort(probs, descending=True)
            cumprobs = sorted_probs.cumsum(dim=-1)
            # Zero out tokens whose *preceding* cumulative mass already exceeds
            # top_p; the highest-probability token is therefore always kept.
            sorted_probs = sorted_probs.masked_fill(
                cumprobs - sorted_probs > top_p, 0.0
            )
            # Renormalize each row independently. (The original normalized by
            # the sum over the whole batch and indexed row 0, which was only
            # correct for batch size 1.)
            sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
            choice = torch.multinomial(sorted_probs, num_samples=1)  # (B, 1)
            next_id = sorted_idx.gather(-1, choice)                  # (B, 1)
            ids = torch.cat([ids, next_id], dim=1)

        return ids
|
|