yat343
/

nanogpt-tutorial

Model card Files Files and versions

xet

Community

yat343 committed 14 days ago

Commit

3229f14

verified ·

1 Parent(s): 82cb4ef

Upload train.py

Browse files

Files changed (1) hide show

train.py +247 -0

train.py ADDED Viewed

	@@ -0,0 +1,247 @@

+"""
+Step-by-step training script for nano GPT.
+What this script does:
+  1. Load the preprocessed data (train / val tokens)
+  2. Build the GPT model with our config
+  3. Define a batching function that grabs random chunks of text
+  4. Set up an AdamW optimizer with cosine learning-rate schedule
+  5. Train loop: sample batch -> forward -> loss -> backward -> step
+  6. Periodically evaluate on validation set and print metrics
+  7. Save the best model checkpoint
+  8. Generate a sample from the model after training
+"""
+import os
+import math
+import time
+import torch
+# Import our model
+from model import GPT, GPTConfig
+# ---------------------------------------------------------------------------
+# 1. Hyperparameters & Config
+# ---------------------------------------------------------------------------
+# Module-level training constants; kept small and fast for a tutorial. Feel free to tweak these!
+BATCH_SIZE = 64          # number of sequences sampled per batch by get_batch()
+BLOCK_SIZE = 256         # tokens per sequence; also passed to GPTConfig(block_size=...) below
+MAX_ITERS = 5000         # total number of training-loop iterations
+LEARNING_RATE = 1e-3     # peak learning rate, reached at the end of linear warmup
+WARMUP_ITERS = 200       # iterations over which get_lr() ramps the LR up linearly
+LR_DECAY_ITERS = 5000    # iteration at which the cosine decay reaches MIN_LR (here equal to MAX_ITERS)
+MIN_LR = 1e-4            # floor of the cosine schedule; returned once the decay has finished
+EVAL_INTERVAL = 500      # run estimate_loss() every this many iterations
+EVAL_ITERS = 200         # batches averaged per split inside estimate_loss()
+GRAD_CLIP = 1.0          # max gradient norm passed to clip_grad_norm_ in the training loop
+# Device selection: use CUDA when torch reports it as available, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# ---------------------------------------------------------------------------
+# 2. Load Data
+# ---------------------------------------------------------------------------
+# Load the dictionary saved by prepare.py; data.pt is looked up in this script's own directory.
+data_path = os.path.join(os.path.dirname(__file__), "data.pt")
+data = torch.load(data_path, weights_only=False)  # NOTE: full unpickling -- only load a data.pt you trust
+train_data = data["train"]  # token sequence sampled by get_batch("train")
+val_data   = data["val"]  # token sequence sampled for every other split value
+vocab_size = data["vocab_size"]  # forwarded to GPTConfig(vocab_size=...)
+chars      = data["chars"]  # not used for training here; stored alongside the checkpoint
+stoi       = data["stoi"]  # presumably text -> token id; used to look up the "\n" start token
+itos       = data["itos"]  # presumably token id -> text; used to decode the generated sample
+print(f"Vocab size : {vocab_size}")
+print(f"Train tokens: {len(train_data):,}")
+print(f"Val tokens  : {len(val_data):,}")
+# ---------------------------------------------------------------------------
+# 3. Batch sampling
+# ---------------------------------------------------------------------------
+# For language modeling, each training example is a random contiguous chunk
+# of text. The input is a window of BLOCK_SIZE tokens and the target is the
+# same window shifted one token ahead (the next token at every position).
+def get_batch(split: str):
+    """Sample a single batch from train or val data."""
+    data_split = train_data if split == "train" else val_data
+    ix = torch.randint(len(data_split) - BLOCK_SIZE, (BATCH_SIZE,))
+    x = torch.stack([data_split[i : i + BLOCK_SIZE] for i in ix])
+    y = torch.stack([data_split[i + 1 : i + BLOCK_SIZE + 1] for i in ix])
+    x, y = x.to(device), y.to(device)
+    return x, y
+# ---------------------------------------------------------------------------
+# 4. Helper: Learning-rate schedule (cosine with linear warmup)
+# ---------------------------------------------------------------------------
+# Warmup is crucial for transformers — it prevents early spikes in loss
+# caused by large gradients when the model is still random.
+def get_lr(iteration: int) -> float:
+    if iteration < WARMUP_ITERS:
+        # Linear warmup
+        return LEARNING_RATE * (iteration + 1) / WARMUP_ITERS
+    if iteration > LR_DECAY_ITERS:
+        return MIN_LR
+    # Cosine decay after warmup
+    decay_ratio = (iteration - WARMUP_ITERS) / (LR_DECAY_ITERS - WARMUP_ITERS)
+    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
+    return MIN_LR + coeff * (LEARNING_RATE - MIN_LR)
+# ---------------------------------------------------------------------------
+# 5. Model Setup
+# ---------------------------------------------------------------------------
+# block_size is taken from the BLOCK_SIZE hyperparameter so batches and model agree.
+# For tiny Shakespeare, even a small model can learn structure; this config uses 6 layers.
+config = GPTConfig(
+    block_size=BLOCK_SIZE,
+    vocab_size=vocab_size,  # value read from data.pt
+    n_layer=6,       # deeper = more capacity to learn patterns
+    n_head=6,
+    n_embd=384,
+    dropout=0.0,  # dropout rate of zero
+)
+model = GPT(config)
+model.to(device)
+# Count parameters (total number of elements over all parameter tensors)
+param_count = sum(p.numel() for p in model.parameters())
+print(f"\nModel config: {config}")
+print(f"Total parameters: {param_count / 1e6:.2f} M")
+# ---------------------------------------------------------------------------
+# 6. Optimizer
+# ---------------------------------------------------------------------------
+# We separate parameters that should get weight decay (2D weights)
+# from those that should not (1D biases, LayerNorm scales).
+# This is standard practice and slightly improves training.
+decay_params = []
+no_decay_params = []
+for name, param in model.named_parameters():
+    if param.dim() >= 2:
+        decay_params.append(param)
+    else:
+        no_decay_params.append(param)
+optim_groups = [
+    {"params": decay_params, "weight_decay": 0.1},
+    {"params": no_decay_params, "weight_decay": 0.0},
+]
+optimizer = torch.optim.AdamW(optim_groups, lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8)
+# ---------------------------------------------------------------------------
+# 7. Evaluation helper
+# ---------------------------------------------------------------------------
+# We average the loss over multiple validation batches for a stable estimate.
+# torch.no_grad() disables gradient computation -> faster and less memory.
+@torch.no_grad()
+def estimate_loss():
+    out = {}
+    model.eval()  # set model to evaluation mode
+    for split in ["train", "val"]:
+        losses = torch.zeros(EVAL_ITERS)
+        for k in range(EVAL_ITERS):
+            xb, yb = get_batch(split)
+            _, loss = model(xb, yb)
+            losses[k] = loss.item()
+        out[split] = losses.mean()
+    model.train()  # set model back to training mode
+    return out
+# ---------------------------------------------------------------------------
+# 8. Training Loop
+# ---------------------------------------------------------------------------
+print("\n" + "=" * 60)
+print("Starting training...")
+print("=" * 60)
+best_val_loss = float("inf")
+start_time = time.time()
+for iter_num in range(MAX_ITERS):
+    # --- Learning rate scheduling ---
+    lr = get_lr(iter_num)
+    for param_group in optimizer.param_groups:
+        param_group["lr"] = lr
+    # --- Periodic evaluation ---
+    if iter_num % EVAL_INTERVAL == 0 or iter_num == MAX_ITERS - 1:
+        losses = estimate_loss()
+        elapsed = time.time() - start_time
+        print(
+            f"step {iter_num:5d} | "
+            f"train loss {losses['train']:.4f} | "
+            f"val loss {losses['val']:.4f} | "
+            f"lr {lr:.2e} | "
+            f"time {elapsed:.1f}s"
+        )
+        # Save the best checkpoint
+        if losses["val"] < best_val_loss:
+            best_val_loss = losses["val"]
+            checkpoint_path = os.path.join(os.path.dirname(__file__), "best.pt")
+            torch.save({
+                "model_state_dict": model.state_dict(),
+                "config": config,
+                "vocab_size": vocab_size,
+                "chars": chars,
+                "stoi": stoi,
+                "itos": itos,
+            }, checkpoint_path)
+            print(f"  -> Saved new best model (val_loss={best_val_loss:.4f})")
+    # --- Training step ---
+    xb, yb = get_batch("train")
+    # Forward
+    logits, loss = model(xb, yb)
+    # Backward
+    optimizer.zero_grad(set_to_none=True)
+    loss.backward()
+    # Gradient clipping (prevents exploding gradients)
+    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
+    # Optimizer step
+    optimizer.step()
+# ---------------------------------------------------------------------------
+# 9. Final evaluation
+# ---------------------------------------------------------------------------
+losses = estimate_loss()
+print(f"\nFinal -> train loss {losses['train']:.4f} | val loss {losses['val']:.4f}")
+# ---------------------------------------------------------------------------
+# 10. Generate text from the trained model
+# ---------------------------------------------------------------------------
+print("\n" + "=" * 60)
+print("Generating sample text...")
+print("=" * 60)
+model.eval()
+# Start from a newline character (index of '\n' in our vocab)
+start_token = stoi["\n"]
+context = torch.zeros((1, 1), dtype=torch.long, device=device)
+context[0, 0] = start_token
+with torch.no_grad():
+    generated = model.generate(context, max_new_tokens=500, temperature=1.0, top_k=40)
+# Rebuild decode function from saved mappings
+decode = lambda l: "".join([itos[i] for i in l])
+# Decode to text
+print("\n--- Generated text ---\n")
+print(decode(generated[0].tolist()))
+print("\n--- End ---")
+print("\nTraining complete! Best checkpoint saved to: best.pt")