Mohamed Hassan Ashmawy committed on
Commit 5e488a4 · verified · 1 Parent(s): d46a28c

Upload model.py with huggingface_hub

Files changed (1)
  1. model.py +311 -0
model.py ADDED
@@ -0,0 +1,311 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import math
+ from dataclasses import dataclass
+ from contextlib import nullcontext
+ from typing import Literal
+
+
+ class CausalSelfAttention(nn.Module):
+     # A causal self-attention layer that supports both flash attention and standard attention.
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0  # Ensures the embedding dimension can be evenly split across attention heads.
+
+         # This linear layer projects input x into query (q), key (k), and value (v) vectors
+         # all at once (so the output is 3x the size).
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+
+         # After attention is done, this layer projects the output back to the original embedding size.
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+
+         # Dropout applied to the attention weights (probabilities).
+         self.attn_dropout = nn.Dropout(config.dropout)
+
+         # Dropout applied after the final projection.
+         self.resid_dropout = nn.Dropout(config.dropout)
+
+         # Store values for easy access later.
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+
+         # Checks whether the efficient Flash Attention API is available in torch.nn.functional.
+         self.flash = hasattr(F, "scaled_dot_product_attention")
+
+         # If Flash Attention is not available, we create a lower triangular mask to ensure causality.
+         # This mask prevents the model from attending to future tokens in the sequence.
+         if not self.flash:
+             # register_buffer ensures this tensor is saved with the model but not updated by gradients.
+             self.register_buffer(
+                 "bias",
+                 torch.tril(torch.ones(config.block_size, config.block_size)).view(
+                     1, 1, config.block_size, config.block_size
+                 ),
+             )
+
+     def forward(self, x):
+         B, T, C = x.size()
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+
+         if self.flash:
+             y = F.scaled_dot_product_attention(
+                 q,
+                 k,
+                 v,
+                 attn_mask=None,
+                 dropout_p=self.attn_dropout.p if self.training else 0.0,
+                 is_causal=True,
+             )
+         else:
+             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+             att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
+             att = F.softmax(att, dim=-1)
+             att = self.attn_dropout(att)
+             y = att @ v
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.resid_dropout(self.c_proj(y))
+         return y
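+
+ # Shape walkthrough for CausalSelfAttention.forward (illustrative values only, assuming
+ # n_embd=512 and n_head=8 as in the example config at the bottom of this file):
+ #     x: (B, T, 512) -> c_attn(x): (B, T, 1536) -> q, k, v each (B, T, 512)
+ #     -> reshaped to (B, 8, T, 64) per head -> attention output merged back to (B, T, 512)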
+
+ # --- User's Original LayerNorm ---
+ class LayerNorm(nn.Module):
+     def __init__(self, ndim, bias):
+         """
+         Initializes the LayerNorm module.
+         Args:
+             ndim (int): The number of features in the last dimension (e.g., the embedding size).
+             bias (bool): Whether to include a bias term in the normalization.
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(ndim))
+         self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+     def forward(self, x):
+         return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
+ # --- End User's Original LayerNorm ---
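+
+ # Note: F.layer_norm normalizes over the last `ndim` features with eps=1e-5 and then applies
+ # the learnable weight and (optional) bias, so passing bias=False drops the shift term entirely.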
+
+ # --- User's Original MLP ---
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.gelu = nn.GELU()
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+
+     def forward(self, x):
+         return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
+ # --- End User's Original MLP ---
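+
+ # With the example config at the bottom of this file (n_embd=512), the MLP expands each
+ # token's features to 2048 (4 * n_embd), applies GELU, and projects back down to 512.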
+
+ # --- User's Original Block ---
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln1 = LayerNorm(config.n_embd, config.bias)
+         self.attn = CausalSelfAttention(config)
+         self.ln2 = LayerNorm(config.n_embd, config.bias)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln1(x))
+         x = x + self.mlp(self.ln2(x))
+         return x
+ # --- End User's Original Block ---
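+
+ # Each Block is a pre-norm residual unit: LayerNorm is applied before attention and the MLP,
+ # and their outputs are added back onto the residual stream x, preserving an identity path.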
+
+
+ # --- User's Original GPTConfig ---
+ @dataclass
+ class GPTConfig:
+     block_size: int
+     vocab_size: int
+     n_layer: int
+     n_head: int
+     n_embd: int
+     dropout: float = 0.0
+     bias: bool = True
+ # --- End User's Original GPTConfig ---
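+
+ # Example instantiation (illustrative values only; an actual config is defined at the bottom of this file):
+ #     GPTConfig(block_size=128, vocab_size=50257, n_layer=4, n_head=4, n_embd=256)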
+
+ # --- User's Original TrainingConfig ---
+ @dataclass
+ class TrainingConfig:
+     learning_rate: float = 1e-4  # kept small for more stable training
+     max_iters: int = 20000  # changed from 25000
+     warmup_steps: int = 1000  # smoother initial training, earlier 100
+     min_lr: float = 5e-4  # minimum learning rate for the scheduler
+     eval_iters: int = 500  # increased from 100
+     batch_size: int = 32  # changed from 16, better gradient estimate
+     block_size: int = 128  # changed from 64, to capture longer-range dependencies
+     gradient_accumulation_steps: int = 32  # reduced from 50
+     device: Literal["cuda", "cpu"] = "cuda" if torch.cuda.is_available() else "cpu"
+     device_type: Literal["cuda", "cpu"] = (
+         "cuda" if "cuda" in device else "cpu"
+     )  # for later use in torch.autocast
+     dtype: Literal["bfloat16", "float16"] = (
+         "bfloat16"
+         if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+         else "float16"
+     )
+     ptdtype: torch.dtype = {
+         "float32": torch.float32,
+         "bfloat16": torch.bfloat16,
+         "float16": torch.float16,
+     }[dtype]
+     ctx: nullcontext[None] | torch.autocast = (
+         nullcontext()
+         if device_type == "cpu"
+         else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+     )
+ # --- End User's Original TrainingConfig ---
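+
+ # A minimal sketch of how these fields are typically consumed in a training step (illustrative only;
+ # `model`, `optimizer`, and `get_batch` are assumed to be defined elsewhere):
+ #     cfg = TrainingConfig()
+ #     X, Y = get_batch("train")                # (batch_size, block_size) token tensors
+ #     with cfg.ctx:                            # autocast on CUDA, no-op context on CPU
+ #         logits, loss = model(X, Y)
+ #     (loss / cfg.gradient_accumulation_steps).backward()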
+
+
+ class GPT(nn.Module):
+     """
+     The main GPT model, now with an optional QA head for Question Answering tasks.
+     The QA head predicts the start and end token indices of the answer span.
+     """
+     def __init__(self, config, is_qa_model=False):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+         self.is_qa_model = is_qa_model
+
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = nn.Embedding(config.block_size, config.n_embd),
+             drop = nn.Dropout(config.dropout),
+             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f = LayerNorm(config.n_embd, bias=config.bias),
+         ))
+
+         # Language modeling head (for pre-training)
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # QA head (for fine-tuning)
+         # This will predict start and end logits for the answer span
+         if self.is_qa_model:
+             self.qa_head = nn.Linear(config.n_embd, 2, bias=False)  # 2 outputs: start_logit, end_logit
+         else:
+             self.qa_head = None  # No QA head if not a QA model
+
+         # tie weights
+         self.transformer.wte.weight = self.lm_head.weight  # https://paperswithcode.com/method/weight-tying
+
+         # init all weights
+         self.apply(self._init_weights)
+         # apply special scaled init to the residual projections, per the GPT-2 paper
+         for pn, p in self.named_parameters():
+             if pn.endswith('c_proj.weight'):
+                 torch.nn.init.normal_(p, mean=0.0, std=0.02/((2 * config.n_layer)**0.5))
+
+         # report number of parameters
+         # n_params calculation will differ slightly if the QA head is present
+         n_params = sum(p.numel() for p in self.parameters())
+         # The non-embedding count subtracts only the positional embeddings; token embeddings are
+         # not subtracted because they are tied to the lm_head weights.
+         non_embedding_params = n_params - self.transformer.wpe.weight.numel()
+         print(f"Number of parameters: {non_embedding_params/1e6:.2f}M (excluding positional embeddings)")
+
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, input_ids, targets=None, attention_mask=None, token_type_ids=None):
+         device = input_ids.device
+         b, t = input_ids.size()
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+         pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t)
+
+         # forward the GPT model itself
+         tok_emb = self.transformer.wte(input_ids)  # token embeddings of shape (b, t, n_embd)
+         pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (t, n_embd)
+         x = self.transformer.drop(tok_emb + pos_emb)
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x)
+
+         if self.is_qa_model and self.qa_head is not None:
+             # For extractive QA we need logits for every token to predict the start/end of the
+             # answer span, so the QA head is applied to the full sequence output 'x',
+             # which has shape (batch_size, sequence_length, n_embd).
+             logits = self.qa_head(x)  # (batch_size, sequence_length, 2)
+             start_logits, end_logits = logits.split(1, dim=-1)
+             start_logits = start_logits.squeeze(-1).contiguous()  # (batch_size, sequence_length)
+             end_logits = end_logits.squeeze(-1).contiguous()  # (batch_size, sequence_length)
+
+             if targets is not None:
+                 # targets for QA are start_positions and end_positions
+                 start_positions, end_positions = targets[:, 0], targets[:, 1]
+
+                 # Restrict the logits to valid answer tokens: non-padding positions
+                 # (attention_mask == 1) that belong to the context (token_type_ids == 1).
+                 if attention_mask is not None and token_type_ids is not None:
+                     valid_tokens_mask = (attention_mask == 1) & (token_type_ids == 1)
+
+                     start_logits = start_logits.masked_fill(~valid_tokens_mask, float('-inf'))
+                     end_logits = end_logits.masked_fill(~valid_tokens_mask, float('-inf'))
+
+                 loss_fct = nn.CrossEntropyLoss(ignore_index=-100)  # Use -100 as ignore_index for consistency
+                 start_loss = loss_fct(start_logits, start_positions)
+                 end_loss = loss_fct(end_logits, end_positions)
+                 total_loss = (start_loss + end_loss) / 2
+                 return start_logits, end_logits, total_loss
+
+             return start_logits, end_logits, None  # For inference
+         else:  # Standard language model for pre-training or text generation
+             if targets is not None:
+                 # if we are given some targets (e.g. for training), calculate the loss
+                 logits = self.lm_head(x)
+                 loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)  # Use -100
+             else:
+                 # inference-time mini-optimization: only forward the lm_head on the very last position
+                 logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
+                 loss = None
+
+             return logits, loss
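+
+     # Calling convention for the QA branch (inferred from the forward pass above): `targets` is a
+     # (batch_size, 2) LongTensor of [start_position, end_position] per example, and the method
+     # returns (start_logits, end_logits, loss); with targets=None it returns
+     # (start_logits, end_logits, None) for inference.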
+
+     @torch.no_grad()
+     def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+         """
+         Generate tokens given a conditioning sequence.
+         idx: Tensor of shape (B, T)
+         """
+         if self.is_qa_model:
+             print("Warning: generate method is not intended for QA models directly.")
+             print("Please use the QA forward pass for inference and post-processing.")
+             return idx  # Or raise an error
+
+         for _ in range(max_new_tokens):
+             # crop the conditioning sequence to the last block_size tokens if it has grown too long
+             idx_cond = (
+                 idx
+                 if idx.size(1) <= self.config.block_size
+                 else idx[:, -self.config.block_size :]
+             )
+             logits, _ = self(idx_cond)
+             logits = logits[:, -1, :] / temperature
+             if top_k is not None:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits[logits < v[:, [-1]]] = -float("Inf")
+             probs = F.softmax(logits, dim=-1)
+             idx_next = torch.multinomial(probs, num_samples=1)
+             idx = torch.cat((idx, idx_next), dim=1)
+         return idx
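+
+     # Example call (illustrative; `idx` is a (B, T) LongTensor of token ids on the model's device):
+     #     out = model.generate(idx, max_new_tokens=50, temperature=0.8, top_k=40)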
+
+ # The `config` object used for pre-training is kept here in case other scripts import this module for its definition.
+ config = GPTConfig(
+     vocab_size=50257,  # use the tokenizer's vocab size
+     block_size=1024,  # or whatever context size you're training with
+     n_layer=8,
+     n_head=8,
+     n_embd=512,
+     dropout=0.1,
+     bias=True,
+ )
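+
+ # Minimal end-to-end usage sketch (illustrative only; assumes token ids are prepared elsewhere,
+ # since this file does not include tokenization or training code):
+ #     model = GPT(config)
+ #     idx = torch.zeros((1, 1), dtype=torch.long)  # a single placeholder start-token id
+ #     out = model.generate(idx, max_new_tokens=20, temperature=0.8, top_k=40)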