"""
modeling_randygpt.py: PyTorch implementation of the randyGPT architecture.
Matches the Rust forward pass exactly:
- RMSNorm with no learnable parameters (x / rms(x))
- Multi-head causal self-attention (no bias, scaled dot-product)
- MLP with squared-ReLU activation and 4× expansion
- No final layer norm before lm_head
- Learned token + position embeddings
Compatible with HuggingFace transformers via PreTrainedModel/PretrainedConfig.
Usage (standalone):
from modeling_randygpt import RandyGPTConfig, RandyGPTForCausalLM
from safetensors.torch import load_file
cfg = RandyGPTConfig.from_pretrained("path/to/model")
model = RandyGPTForCausalLM(cfg)
model.load_state_dict(load_file("path/to/model.safetensors"), strict=True)
model.eval()
Usage (HuggingFace Hub):
model = RandyGPTForCausalLM.from_pretrained("username/randygpt-s")
"""
import math
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput
# ── Config ────────────────────────────────────────────────────────────────────
class RandyGPTConfig(PretrainedConfig):
model_type = "randygpt"
def __init__(
self,
vocab_size: int = 1500,
n_embd: int = 128,
n_head: int = 4,
n_layer: int = 8,
block_size: int = 256,
bos_token_id: int = 0,
eos_token_id: int = 1,
**kwargs,
):
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.n_embd = n_embd
self.n_head = n_head
self.n_layer = n_layer
self.block_size = block_size
self.head_dim = n_embd // n_head
self.mlp_dim = 4 * n_embd
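        # With the defaults: head_dim = 128 // 4 = 32 and mlp_dim = 4 * 128 = 512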
# ── Modules ───────────────────────────────────────────────────────────────────
def rmsnorm(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
"""RMSNorm with no learnable scale β€” matches Rust rmsnorm_fwd exactly."""
return x * (x.pow(2).mean(-1, keepdim=True) + eps).rsqrt()
class CausalSelfAttention(nn.Module):
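    """Multi-head causal self-attention: no bias, scaled dot-product, causal mask."""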
def __init__(self, cfg: RandyGPTConfig):
super().__init__()
self.n_head = cfg.n_head
self.head_dim = cfg.head_dim
self.n_embd = cfg.n_embd
self.scale = 1.0 / math.sqrt(cfg.head_dim)
        # No bias; matches Rust linear_fwd (pure matmul)
self.wq = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
self.wk = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
self.wv = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
self.wo = nn.Linear(cfg.n_embd, cfg.n_embd, bias=False)
def forward(self, x: torch.Tensor) -> torch.Tensor:
B, T, C = x.shape
q = self.wq(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2) # [B,H,T,Dh]
k = self.wk(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
v = self.wv(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
# Causal attention scores
scores = q @ k.transpose(-2, -1) * self.scale # [B,H,T,T]
mask = torch.full((T, T), float('-inf'), device=x.device).triu(1)
scores = scores + mask
attn = F.softmax(scores, dim=-1)
out = attn @ v # [B,H,T,Dh]
out = out.transpose(1, 2).contiguous().view(B, T, C) # [B,T,D]
return self.wo(out)
class MLP(nn.Module):
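    """Feed-forward MLP with squared-ReLU activation and 4x expansion."""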
def __init__(self, cfg: RandyGPTConfig):
super().__init__()
self.fc1 = nn.Linear(cfg.n_embd, cfg.mlp_dim, bias=False)
self.fc2 = nn.Linear(cfg.mlp_dim, cfg.n_embd, bias=False)
def forward(self, x: torch.Tensor) -> torch.Tensor:
h = self.fc1(x)
        h = F.relu(h).pow(2)  # squared ReLU; matches Rust exactly
return self.fc2(h)
class TransformerBlock(nn.Module):
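    """Pre-norm block: RMSNorm -> attention -> residual, then RMSNorm -> MLP -> residual."""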
def __init__(self, cfg: RandyGPTConfig):
super().__init__()
self.attn = CausalSelfAttention(cfg)
self.mlp = MLP(cfg)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x + self.attn(rmsnorm(x)) # pre-norm attention + residual
x = x + self.mlp(rmsnorm(x)) # pre-norm MLP + residual
return x
# ── Full model ────────────────────────────────────────────────────────────────
class RandyGPTForCausalLM(PreTrainedModel):
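    """Learned token + position embeddings, n_layer pre-norm blocks, and an lm_head with no final layer norm."""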
config_class = RandyGPTConfig
def __init__(self, cfg: RandyGPTConfig):
super().__init__(cfg)
self.wte = nn.Embedding(cfg.vocab_size, cfg.n_embd)
self.wpe = nn.Embedding(cfg.block_size, cfg.n_embd)
self.layers = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layer)])
self.lm_head = nn.Linear(cfg.n_embd, cfg.vocab_size, bias=False)
        # No final layer norm; matches Rust forward pass
def forward(
self,
input_ids: torch.Tensor, # [B, T]
        labels: Optional[torch.Tensor] = None,  # [B, T] for loss
**kwargs,
) -> CausalLMOutput:
B, T = input_ids.shape
assert T <= self.config.block_size, f"Sequence length {T} > block_size {self.config.block_size}"
pos = torch.arange(T, device=input_ids.device).unsqueeze(0) # [1, T]
x = self.wte(input_ids) + self.wpe(pos) # [B, T, D]
for block in self.layers:
x = block(x)
        logits = self.lm_head(x)  # [B, T, V]; no final norm, matches Rust
loss = None
if labels is not None:
loss = F.cross_entropy(
logits.view(-1, self.config.vocab_size),
labels.view(-1),
ignore_index=-100,
)
return CausalLMOutput(loss=loss, logits=logits)
@torch.no_grad()
def generate_text(
self,
        prompt_ids: torch.Tensor,  # [1, T]; already tokenized
max_new_tokens: int = 200,
temperature: float = 0.8,
top_p: float = 0.9,
) -> torch.Tensor:
"""Simple top-p nucleus sampling. Returns full sequence including prompt."""
self.eval()
ids = prompt_ids.clone()
for _ in range(max_new_tokens):
ctx = ids[:, -self.config.block_size:]
out = self(ctx)
logits = out.logits[:, -1, :] / temperature # [1, V]
# Top-p (nucleus) sampling
probs = F.softmax(logits, dim=-1)
sorted_probs, sorted_idx = torch.sort(probs, descending=True)
cumprobs = sorted_probs.cumsum(dim=-1)
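            # Shifted cumulative sum: drop tokens once the probability mass before them
            # exceeds top_p, so the highest-probability token is never masked out.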
mask = cumprobs - sorted_probs > top_p
sorted_probs[mask] = 0.0
sorted_probs /= sorted_probs.sum()
next_id = sorted_idx[0, torch.multinomial(sorted_probs[0], 1)]
ids = torch.cat([ids, next_id.view(1, 1)], dim=1)
return ids
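# ── Smoke test ────────────────────────────────────────────────────────────────
# Minimal usage sketch: builds a model from the default config, runs one forward
# pass with labels, then samples a short continuation. The random token ids below
# only exercise shapes and sampling; real use assumes the repository's tokenizer,
# which is not included here.
if __name__ == "__main__":
    cfg = RandyGPTConfig()
    model = RandyGPTForCausalLM(cfg)
    model.eval()

    # Random ids in [0, vocab_size) stand in for tokenized text.
    dummy = torch.randint(0, cfg.vocab_size, (1, 16))
    out = model(dummy, labels=dummy)
    print("logits:", tuple(out.logits.shape), "loss:", float(out.loss))

    generated = model.generate_text(dummy[:, :4], max_new_tokens=8)
    print("generated ids:", generated[0].tolist())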