geeteshcodes committed 3 days ago

Commit

7f974df

verified ·

1 Parent(s): 80e2a42

Initial commit

Browse files

Files changed (41) hide show

.gitignore +44 -0
README.md +242 -0
data/dataloader.py +223 -0
finetune/README.md +137 -0
finetune/__init__.py +1 -0
finetune/chat.py +296 -0
finetune/check_data.py +269 -0
finetune/data/meta.json +13 -0
finetune/data/tokenizer.json +0 -0
finetune/data/tokenizer_config.json +17 -0
finetune/prepare_data.py +303 -0
finetune/sft_dataset.py +103 -0
finetune/sft_train.py +563 -0
model/__init__.py +5 -0
model/attention.py +114 -0
model/block.py +84 -0
model/config.py +110 -0
model/mlp.py +83 -0
model/model.py +245 -0
model/norm.py +65 -0
model/rope.py +172 -0
model_explained.md +376 -0
plot_training.py +370 -0
requirements.txt +20 -0
run.md +34 -0
test_chatmodel.py +366 -0
test_checkpoint.py +290 -0
tokenizer/bpe.py +134 -0
tokenizer/fineweb_edu_tokenizer.json +0 -0
tokenizer/fineweb_edu_tokenizer/special_tokens_map.json +5 -0
tokenizer/fineweb_edu_tokenizer/tokenizer.json +0 -0
tokenizer/fineweb_edu_tokenizer/tokenizer_config.json +11 -0
tokenizer/normalizer.py +42 -0
tokenizer/post_processor.py +152 -0
tokenizer/pretokenizer.py +159 -0
tokenizer/tempCodeRunnerFile.py +5 -0
tokenizer/tokenize_dataset.py +389 -0
tokenizer/traintokenizer.py +207 -0
tokenizer/wrap_tokenizer.py +232 -0
tokenizer_walkthrough.md +105 -0
train.py +485 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,44 @@

+# ── Checkpoints & training runs ──────────────────────────────────────
+runs/
+# ── Python ───────────────────────────────────────────────────────────
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+.Python
+*.egg-info/
+dist/
+build/
+*.egg
+# ── Virtual environments ──────────────────────────────────────────────
+.env
+.venv
+env/
+venv/
+# ── Jupyter ───────────────────────────────────────────────────────────
+.ipynb_checkpoints/
+*.ipynb
+# ── Data / binaries ──────────────────────────────────────────────────
+*.bin
+*.pt
+*.pth
+*.safetensors
+*.npy
+*.npz
+# ── Logs ─────────────────────────────────────────────────────────────
+*.log
+*.jsonl
+# ── OS ───────────────────────────────────────────────────────────────
+.DS_Store
+Thumbs.db
+# ── IDE ───────────────────────────────────────────────────────────────
+.vscode/
+.idea/
+*.swp

README.md ADDED Viewed

	@@ -0,0 +1,242 @@

+# SLLM — Small Language Model from Scratch
+A GPT-style decoder-only transformer built and trained from scratch in PyTorch. Two model sizes are available (100M and 150M parameters), designed to fit on consumer GPUs with as little as 4 GB of VRAM (e.g. RTX 3050).
+---
+## ✨ Features
+- **Architecture**: Decoder-only transformer (GPT-style) with modern improvements
+  - RMSNorm instead of LayerNorm (faster, no bias)
+  - RoPE (Rotary Position Embeddings) — used in LLaMA, Mistral, Gemma
+  - SwiGLU feed-forward network — outperforms GELU at the same parameter count
+  - Flash Attention via `F.scaled_dot_product_attention` (O(T²) memory avoided)
+  - Weight-tied token embeddings + LM head (saves ~32M parameters)
+- **Training**
+  - bf16 mixed-precision with gradient accumulation
+  - Gradient checkpointing for low-VRAM GPUs
+  - Cosine LR schedule with linear warmup
+  - Resumable checkpointing (`--resume`, `--extra_steps`)
+  - JSONL metric logging + live training dashboard
+- **Custom BPE Tokenizer** — trained on FineWeb-Edu with byte fallback (zero OOV)
+- **Supervised Fine-Tuning (SFT)** — chat model pipeline included in `finetune/`
+---
+## 🏗️ Project Structure
+```
+sllm/
+├── model/                   # Model architecture
+│   ├── config.py            # ModelConfig dataclass (SLLM_100M, SLLM_150M presets)
+│   ├── model.py             # SLLM — full model assembly, weight init, gradient checkpointing
+│   ├── block.py             # TransformerBlock (pre-norm, residual)
+│   ├── attention.py         # Causal multi-head self-attention + RoPE
+│   ├── mlp.py               # SwiGLU feed-forward network
+│   ├── norm.py              # RMSNorm
+│   └── rope.py              # Rotary Position Embeddings
+│
+├── tokenizer/               # Custom BPE tokenizer
+│   ├── normalizer.py        # HTML stripping, unicode NFC, whitespace cleanup
+│   ├── pretokenizer.py      # Regex pre-tokenizer (code-aware, contraction-aware)
+│   ├── bpe.py               # BPE model config with byte fallback (32k vocab)
+│   ├── traintokenizer.py    # Train on FineWeb-Edu stream
+│   ├── post_processor.py    # Append <|endoftext|> to every sequence
+│   ├── wrap_tokenizer.py    # Wrap into PreTrainedTokenizerFast
+│   └── tokenize_dataset.py  # Pack tokens into flat binary .bin shards
+│
+├── data/
+│   └── dataloader.py        # Memory-mapped shard dataloader
+│
+├── finetune/                # Supervised fine-tuning (SFT) pipeline
+│   ├── prepare_data.py      # Prepare chat data
+│   ├── sft_train.py         # SFT training loop
+│   ├── sft_dataset.py       # Chat dataset
+│   └── chat.py              # Interactive chat with the fine-tuned model
+│
+├── train.py                 # Pre-training loop
+├── plot_training.py         # Training dashboard (static + live mode)
+├── requirements.txt
+├── model_explained.md       # Deep-dive into every model component
+└── tokenizer_walkthrough.md # Tokenizer design and pipeline walkthrough
+```
+---
+## 📐 Model Configs
+| Config     | d_model | Heads | Layers | Parameters |
+|------------|---------|-------|--------|------------|
+| `SLLM_100M` | 768    | 12    | 12     | ~109.5M    |
+| `SLLM_150M` | 1024   | 16    | 9      | ~148.4M    |
+Both configs use:
+- Context length: **1024 tokens**
+- Vocab size: **32,000** (custom BPE)
+- SwiGLU d_ff: computed as `round_up_256(⌊2/3 × 4 × d_model⌋)`
+---
+## ⚙️ Installation
+**Requires:** Python 3.10+, PyTorch 2.3+, CUDA-capable GPU (bf16 recommended)
+```bash
+# Create and activate a conda environment
+conda create -n pytorch python=3.11
+conda activate pytorch
+# Install dependencies
+pip install -r requirements.txt
+```
+---
+## 🚀 Training
+### Start a new run (RTX 3050 4GB recommended settings)
+```bash
+python train.py \
+  --config 150M \
+  --data_dir tokenizer/data \
+  --batch_size 2 \
+  --grad_accum 16 \
+  --grad_checkpoint \
+  --dtype bf16 \
+  --max_steps 5000 \
+  --run_dir runs/sllm_150m \
+  --log_every 10 \
+  --save_every 500 \
+  --val_every 500 \
+  --warmup_steps 200
+```
+### Resume from a checkpoint
+```bash
+python train.py \
+  --resume \
+  --run_dir runs/sllm_150m \
+  --extra_steps 5000 \
+  --data_dir tokenizer/data \
+  --batch_size 2 \
+  --grad_accum 16 \
+  --grad_checkpoint \
+  --dtype bf16
+```
+### Key training flags
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--config` | `100M` | Model size (`100M` or `150M`) |
+| `--batch_size` | `4` | Per-device micro-batch size |
+| `--grad_accum` | `8` | Gradient accumulation steps |
+| `--max_steps` | unlimited | Absolute step target |
+| `--extra_steps` | — | Run N more steps from current checkpoint |
+| `--resume` | — | Resume from latest checkpoint in `--run_dir` |
+| `--grad_checkpoint` | — | Enable gradient checkpointing (saves VRAM) |
+| `--dtype` | `bf16` | Mixed precision dtype (`fp32`, `fp16`, `bf16`) |
+| `--synthetic` | — | Use random data (for testing without real shards) |
+---
+## 📊 Training Dashboard
+Visualize training metrics in a dark-mode 6-panel dashboard:
+```bash
+# Static plot
+python plot_training.py --run_dir runs/sllm_150m
+# Live mode — refresh every 30 seconds while training
+python plot_training.py --run_dir runs/sllm_150m --live --interval 30
+# Compare two runs
+python plot_training.py --run_dir runs/run_a runs/run_b
+# Save to file
+python plot_training.py --run_dir runs/sllm_150m --save dashboard.png
+```
+**Dashboard panels:** Training Loss (raw + EMA) · Validation Loss · Learning Rate · Tokens/sec · VRAM usage · Gradient norm
+---
+## 💬 Fine-Tuning (Chat Model)
+After pre-training, you can fine-tune with supervised instruction data:
+```bash
+# 1. Prepare chat data
+python finetune/prepare_data.py
+# 2. Fine-tune
+python finetune/sft_train.py \
+  --base_ckpt runs/sllm_150m/ckpt_0011500.pt \
+  --run_dir runs/sllm_150m_chat \
+  --max_steps 2500 \
+  --batch_size 4 \
+  --grad_accum 8 \
+  --grad_checkpoint
+# 3. Chat interactively
+python finetune/chat.py --run_dir runs/sllm_150m_chat
+```
+---
+## 🔡 Tokenizer
+A custom BPE tokenizer trained on the educational subset of [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu):
+- **32,000 token vocabulary**
+- **Byte fallback** — zero out-of-vocabulary tokens (even math symbols and emojis work)
+- **Code-aware** — preserves `snake_case`, operators (`==`, `->`, `**`), and indentation
+- **Contraction-aware** — `don't`, `I've`, `they're` are split correctly
+- Packaged as a `PreTrainedTokenizerFast` (HuggingFace-compatible)
+Training data is packed into flat binary `.bin` shards (`np.uint16`, 100M tokens each) for fast memory-mapped loading.
+See [`tokenizer_walkthrough.md`](tokenizer_walkthrough.md) for a full pipeline deep-dive.
+---
+## 🧠 Architecture Deep-Dive
+See [`model_explained.md`](model_explained.md) for a plain-language walkthrough of every model component, including:
+- Why RMSNorm is faster than LayerNorm
+- How RoPE encodes relative position without extra parameters
+- Why SwiGLU outperforms GELU
+- How weight tying saves 32M parameters
+- Flash Attention and gradient checkpointing explained
+---
+## 📋 Checkpoints & Logging
+- Checkpoints are saved to `<run_dir>/ckpt_NNNNNNN.pt` every `--save_every` steps and on clean exit (Ctrl+C)
+- Metrics are appended to `<run_dir>/train_log.jsonl` (one JSON line per log step)
+- Each checkpoint stores: model weights, optimizer state, step number, loss, and config name
+- Resuming auto-detects the correct model config from the checkpoint
+---
+## 📦 Requirements
+```
+torch>=2.3.0
+datasets>=2.14.0       # HuggingFace datasets (streaming)
+tokenizers>=0.15.0     # Fast BPE tokenizer
+transformers>=4.40.0   # PreTrainedTokenizerFast
+numpy>=1.26.0
+tqdm
+matplotlib
+```
+---
+## 📄 License
+This project is released for educational purposes.

data/dataloader.py ADDED Viewed

	@@ -0,0 +1,223 @@

+"""
+data/dataloader.py
+Streaming dataloader for the pre-tokenized binary shards produced by
+tokenizer/tokenize_dataset.py.
+Each shard is a flat binary file of np.uint16 token IDs.
+100M tokens * 2 bytes = ~200MB per shard.
+Strategy:
+  1. Discover all shards matching split name (train/val).
+  2. Shuffle shard order at start of each epoch.
+  3. For each shard, load it (memmap or full) and yield non-overlapping
+     chunks of (context_length + 1) tokens.
+  4. Inputs  = chunk[:-1]  (length context_length)
+     Targets = chunk[1:]   (length context_length, the next token for each input position)
+When no data shards exist yet (tokenization not done), a SyntheticDataset
+can be used for architecture testing.
+"""
+import os
+import glob
+import random
+import numpy as np
+import torch
+from torch.utils.data import IterableDataset, DataLoader
+# ------------------------------------------------------------------ #
+#  SHARD DISCOVERY
+# ------------------------------------------------------------------ #
def find_shards(data_dir: str, split: str) -> list[str]:
    """
    Collects the shard files that belong to one dataset split.
    Args:
        data_dir : directory containing .bin shard files
        split    : split prefix to match, e.g. 'train' or 'val'
    Returns:
        Every path matching ``<data_dir>/<split>_*.bin`` in ascending
        lexicographic order; an empty list when nothing matches.
    """
    return sorted(glob.glob(os.path.join(data_dir, f"{split}_*.bin")))
+# ------------------------------------------------------------------ #
+#  ITERABLE DATASET
+# ------------------------------------------------------------------ #
class ShardedTokenDataset(IterableDataset):
    """
    IterableDataset that streams token chunks from binary shards.
    Each DataLoader worker reads a disjoint subset of the shards, so
    DataLoader(num_workers=N) parallelises without duplicating data.
    Shards are memory-mapped, so only the pages actually read are loaded.
    Usage:
        dataset = ShardedTokenDataset(data_dir, split='train', context_length=1024)
        loader  = DataLoader(dataset, batch_size=4)
        for input_ids, targets in loader:
            ...
    """

    def __init__(
        self,
        data_dir: str,
        split: str,
        context_length: int,
        shuffle_shards: bool = True,
    ):
        """
        Args:
            data_dir       : path to directory with .bin shard files
            split          : 'train' or 'val'
            context_length : sequence length (model context length)
            shuffle_shards : shuffle shard order on every pass over the data
        Raises:
            FileNotFoundError : no shard for `split` exists in `data_dir`
        """
        super().__init__()
        self.context_length  = context_length
        self.shuffle_shards  = shuffle_shards
        self.shards = find_shards(data_dir, split)
        if not self.shards:
            raise FileNotFoundError(
                f"No {split} shards found in {data_dir}.\n"
                f"Run tokenizer/tokenize_dataset.py first to generate data."
            )
        print(f"[DataLoader] Found {len(self.shards)} {split} shards in {data_dir}")

    def _shards_for_worker(self, worker_info) -> list[str]:
        """
        Returns the shards the current process should read, in read order.
        The strided split is taken from the canonical (unshuffled) list
        BEFORE shuffling. Each DataLoader worker has its own `random` state,
        so shuffling first would give every worker a different permutation
        and the strided slices would overlap and miss shards.
        Args:
            worker_info : result of torch.utils.data.get_worker_info()
                          (None when iterating in the main process)
        """
        shards = list(self.shards)
        if worker_info is not None:
            shards = shards[worker_info.id :: worker_info.num_workers]
        if self.shuffle_shards:
            random.shuffle(shards)
        return shards

    def __iter__(self):
        """
        Yields (input_ids, targets) pairs of int64 tensors, each of length
        context_length, cut from non-overlapping windows of
        context_length + 1 tokens. Trailing tokens that do not fill a whole
        window are dropped.
        """
        chunk    = self.context_length + 1  # +1 so we can shift for targets
        itemsize = np.dtype(np.uint16).itemsize
        for shard_path in self._shards_for_worker(torch.utils.data.get_worker_info()):
            # Size the map from the file length so a stray trailing byte is
            # ignored, and skip shards too small for one window (np.memmap
            # refuses to map an empty file).
            n_tokens = os.path.getsize(shard_path) // itemsize
            n_chunks = n_tokens // chunk
            if n_chunks == 0:
                continue
            tokens = np.memmap(shard_path, dtype=np.uint16, mode="r", shape=(n_tokens,))
            for i in range(n_chunks):
                start = i * chunk
                # astype() copies the window out of the read-only map into a
                # fresh, writable int64 array that torch can wrap.
                seq = torch.from_numpy(tokens[start : start + chunk].astype(np.int64))
                yield seq[:-1], seq[1:]
+# ------------------------------------------------------------------ #
+#  SYNTHETIC DATASET (for testing without real data)
+# ------------------------------------------------------------------ #
class SyntheticDataset(IterableDataset):
    """
    Stand-in dataset that emits uniformly random token sequences.
    Intended for exercising the model/training code before real shards
    have been produced.
    """

    def __init__(self, vocab_size: int, context_length: int, n_batches: int = 1000):
        """
        Args:
            vocab_size     : token ids are drawn from [0, vocab_size)
            context_length : length of each yielded input/target tensor
            n_batches      : number of (input_ids, targets) pairs yielded
                             per pass (one sequence each, despite the name)
        """
        super().__init__()
        self.n_batches      = n_batches
        self.context_length = context_length
        self.vocab_size     = vocab_size

    def __iter__(self):
        """Yields (input_ids, targets) where targets is input_ids advanced by one token."""
        window    = self.context_length + 1  # one extra token for the shifted targets
        remaining = self.n_batches
        while remaining > 0:
            tokens = torch.randint(0, self.vocab_size, (window,))
            yield tokens[:-1], tokens[1:]
            remaining -= 1
+# ------------------------------------------------------------------ #
+#  FACTORY FUNCTION
+# ------------------------------------------------------------------ #
def build_dataloader(
    data_dir: str,
    split: str,
    context_length: int,
    batch_size: int,
    num_workers: int = 2,
    use_synthetic: bool = False,
    vocab_size: int = 32_000,
) -> DataLoader:
    """
    Builds and returns a DataLoader for the given split.
    Falls back to SyntheticDataset if use_synthetic=True or no shards found.
    Args:
        data_dir       : directory with .bin shards
        split          : 'train' or 'val'
        context_length : model context length (1024)
        batch_size     : number of sequences per batch
        num_workers    : DataLoader workers (0 = main process)
        use_synthetic  : force synthetic data (for testing)
        vocab_size     : needed for synthetic fallback
    Returns:
        DataLoader yielding (input_ids, targets) each of shape (B, T)
    """
    if use_synthetic:
        dataset = SyntheticDataset(vocab_size, context_length)
        print("[DataLoader] Using synthetic data (use_synthetic=True)")
    else:
        try:
            dataset = ShardedTokenDataset(
                data_dir       = data_dir,
                split          = split,
                context_length = context_length,
                shuffle_shards = (split == "train"),
            )
        except FileNotFoundError as e:
            # Deliberate best-effort: missing shards downgrade to random data
            # (loudly) so the rest of the pipeline can still be exercised.
            print(f"[DataLoader] WARNING: {e}")
            print("[DataLoader] Falling back to synthetic data for testing.")
            dataset = SyntheticDataset(vocab_size, context_length)
    return DataLoader(
        dataset,
        batch_size  = batch_size,
        num_workers = num_workers,
        # Pinned memory only speeds up CPU->GPU copies; requesting it on a
        # CPU-only host just makes DataLoader emit a warning.
        pin_memory  = torch.cuda.is_available(),
    )
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
if __name__ == "__main__":
    import sys

    # Put the project root on sys.path so `model` resolves when this file
    # is executed directly.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, project_root)
    from model.config import SLLM_100M

    cfg = SLLM_100M
    print("Testing with synthetic data...")
    loader = build_dataloader(
        "tokenizer/data",
        "train",
        cfg.context_length,
        4,
        num_workers   = 0,
        use_synthetic = True,
        vocab_size    = cfg.vocab_size,
    )
    # Print the first four batches, then stop.
    for batch_idx, (x, y) in enumerate(loader):
        print(f"Batch {batch_idx}: input_ids={x.shape}, targets={y.shape}, dtype={x.dtype}")
        if batch_idx == 3:
            break
    print("DataLoader OK")

finetune/README.md ADDED Viewed

	@@ -0,0 +1,137 @@

+# SLLM-150M → Chat Model (SFT)
+Supervised Fine-Tuning pipeline to turn the pretrained **SLLM-150M** base model into
+an instruction-following chat model using **OpenHermes-2.5**.
+## Pipeline
+```
+Base model  (runs/sllm_150m/ckpt_0011500.pt)
+      │
+      ▼
+prepare_data.py   ─── download & tokenize OpenHermes-2.5 (80k convs)
+      │
+      ▼
+sft_train.py      ─── SFT with ChatML loss masking
+      │
+      ▼
+chat.py           ─── interactive CLI chat
+```
+## Step 1 — Install dependency
+```bash
+pip install datasets
+```
+## Step 2 — Prepare data
+Downloads 80k conversations, formats as ChatML, tokenizes, saves shards.
+Also saves the extended tokenizer (vocab 32,002) to `finetune/data/`.
+```bash
+python finetune/prepare_data.py
+```
+Options:
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--n_samples` | `80000` | Conversations to sample |
+| `--val_ratio` | `0.05` | Validation fraction |
+| `--output_dir` | `finetune/data` | Output directory |
+| `--seed` | `42` | Random seed |
+Expected output:
+```
+finetune/data/
+  tokenizer.json          ← extended tokenizer (32,002 vocab)
+  tokenizer_config.json
+  special_tokens_map.json
+  train_sft.pt            ← ~76,000 examples
+  val_sft.pt              ← ~4,000 examples
+  meta.json               ← stats
+```
+## Step 3 — Fine-tune
+```bash
+python finetune/sft_train.py \
+  --base_ckpt runs/sllm_150m/ckpt_0011500.pt \
+  --run_dir   runs/sllm_150m_chat \
+  --max_steps 2000 \
+  --batch_size 4 --grad_accum 8 \
+  --grad_checkpoint
+```
+For an RTX 3050 4 GB, these settings use ~3.5 GB VRAM and take **~5–8 minutes**.
+**Resume training:**
+```bash
+python finetune/sft_train.py \
+  --resume --run_dir runs/sllm_150m_chat \
+  --extra_steps 1000
+```
+Key options:
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--base_ckpt` | `runs/sllm_150m/ckpt_0011500.pt` | Base pretrained checkpoint |
+| `--max_lr` | `1e-5` | Peak LR (30× lower than the `3e-4` used in pretraining) |
+| `--dropout` | `0.1` | SFT dropout (0 in pretraining) |
+| `--max_steps` | `2000` | Total training steps |
+| `--grad_checkpoint` | off | Enable for lower VRAM |
+Checkpoints are saved to `runs/sllm_150m_chat/ckpt_sft_XXXXXXX.pt`.
+Training log: `runs/sllm_150m_chat/sft_log.jsonl`.
+## Step 4 — Chat
+```bash
+python finetune/chat.py
+python finetune/chat.py --run_dir runs/sllm_150m_chat --temperature 0.7
+```
+In-chat commands:
+| Command | Effect |
+|---------|--------|
+| `/reset` | Clear conversation history |
+| `/system <text>` | Change system prompt |
+| `/quit` | Exit |
+## What changes vs pretraining
+| | Pretraining (`train.py`) | SFT (`sft_train.py`) |
+|---|---|---|
+| Data | Raw text shards (`.bin`) | ChatML conversations (`.pt`) |
+| Loss | Every token | **Assistant tokens only** (`ignore_index=-100`) |
+| Learning rate | `3e-4` | **`1e-5`** |
+| Warmup | 100 steps | 30 steps |
+| Vocab | 32,000 | **32,002** (`<\|im_start\|>` + `<\|im_end\|>`) |
+| Dropout | 0.0 | **0.1** |
+| Checkpoint prefix | `ckpt_` | `ckpt_sft_` |
+## Expected loss curve
+| Stage | Expected loss |
+|-------|--------------|
+| Start (step 0) | 1.5 – 2.5 |
+| Step 500 | 1.0 – 1.5 |
+| Step 2000 | 0.8 – 1.2 |
+> **If loss starts above 4.0 or goes NaN** → reduce `--max_lr` to `5e-6`.
+## Prompt format (ChatML)
+```
+<|im_start|>system
+You are a helpful, concise assistant.<|im_end|>
+<|im_start|>user
+What is the capital of France?<|im_end|>
+<|im_start|>assistant
+The capital of France is Paris.<|im_end|>
+```
+Generation stops automatically when the model produces `<|im_end|>`.

finetune/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # finetune package

finetune/chat.py ADDED Viewed

	@@ -0,0 +1,296 @@

+"""
+finetune/chat.py
+Interactive CLI chat with the fine-tuned SLLM-150M chat model.
+Loads the latest SFT checkpoint from --run_dir, formats your input
+as a ChatML prompt, generates a response token-by-token, and stops
+at the <|im_end|> token.
+Usage:
+    python finetune/chat.py
+    python finetune/chat.py --run_dir runs/sllm_150m_chat
+    python finetune/chat.py --temperature 0.7 --top_k 40
+In-chat commands:
+    /reset          clear conversation history (start fresh)
+    /system <text>  change the system prompt
+    /quit           exit
+"""
+import os
+import sys
+import argparse
+from pathlib import Path
+import torch
+import torch.nn as nn
+from transformers import PreTrainedTokenizerFast
+SCRIPT_DIR   = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parent
+DATA_DIR     = SCRIPT_DIR / "data"
+sys.path.insert(0, str(PROJECT_ROOT))
+from model.config import SLLM_150M
+from model.model  import SLLM
+DEFAULT_SYSTEM  = "You are a helpful, concise assistant."
+DEFAULT_RUN_DIR = str(PROJECT_ROOT / "runs" / "sllm_150m_chat")
+# ------------------------------------------------------------------ #
+#  HELPERS
+# ------------------------------------------------------------------ #
def find_latest_ckpt(run_dir: str) -> str:
    """
    Returns path to the most recent ckpt_sft_*.pt in run_dir.
    "Most recent" is the highest step number parsed from the file name, so
    names with different digit widths (ckpt_sft_999.pt vs ckpt_sft_1000.pt)
    are ordered correctly. Names whose step part is not numeric rank after
    all numeric ones, ordered by name.
    Args:
        run_dir : directory that holds the SFT checkpoints
    Raises:
        FileNotFoundError : run_dir does not exist or contains no checkpoint
    """
    prefix, suffix = "ckpt_sft_", ".pt"

    def sort_key(name: str):
        stem = name[len(prefix):-len(suffix)]
        if stem.isascii() and stem.isdigit():
            return (0, int(stem), name)
        return (1, 0, name)

    try:
        entries = os.listdir(run_dir)
    except FileNotFoundError:
        # Report a missing directory with the same actionable message as an
        # empty one.
        entries = []
    ckpts = [f for f in entries if f.startswith(prefix) and f.endswith(suffix)]
    if not ckpts:
        raise FileNotFoundError(
            f"No SFT checkpoints found in '{run_dir}'.\n"
            f"Run sft_train.py first."
        )
    return os.path.join(run_dir, max(ckpts, key=sort_key))
def resize_token_embeddings(model: "SLLM", new_vocab_size: int):
    """
    Resizes the token embedding table in place and re-ties the LM head.
    Same resize logic as sft_train.py — kept local to avoid circular imports.
    Rows shared by the old and new vocabularies keep their weights. When
    growing, the added rows are set to the mean of the old embeddings. When
    shrinking, trailing rows are dropped.
    Args:
        model          : model exposing `config.vocab_size`, `config.d_model`,
                         `token_emb` and `lm_head`
        new_vocab_size : desired number of embedding rows
    Returns:
        None. `model.token_emb`, `model.lm_head.weight` and
        `model.config.vocab_size` are updated. Nothing happens when the size
        is already correct.
    """
    old_size = model.config.vocab_size
    if new_vocab_size == old_size:
        return
    old_weight = model.token_emb.weight.data
    n_keep     = min(old_size, new_vocab_size)
    new_emb = nn.Embedding(new_vocab_size, model.config.d_model).to(
        device=old_weight.device, dtype=old_weight.dtype
    )
    # Write straight into the new parameter; no_grad keeps the in-place
    # copies out of autograd.
    with torch.no_grad():
        new_emb.weight[:n_keep] = old_weight[:n_keep]
        if new_vocab_size > old_size:
            new_emb.weight[old_size:] = old_weight.mean(dim=0)
    model.token_emb = new_emb
    # Keep the output projection weight-tied to the new table.
    model.lm_head.weight = model.token_emb.weight
    model.config.vocab_size = new_vocab_size
def load_model_and_tokenizer(run_dir: str, device: torch.device):
    """
    Loads tokenizer (from data dir) and fine-tuned model (from run_dir).
    Args:
        run_dir : directory containing ckpt_sft_*.pt checkpoints
        device  : device the model is placed on
    Returns:
        (model, tokenizer, ckpt_path, step, loss) where step and loss come
        from the checkpoint ("?" / NaN when the checkpoint lacks them).
    Raises:
        FileNotFoundError : no SFT checkpoint exists in run_dir
    """
    # ---- Tokenizer ------------------------------------------------- #
    tok_path = str(DATA_DIR)
    if os.path.exists(os.path.join(tok_path, "tokenizer.json")):
        tokenizer = PreTrainedTokenizerFast.from_pretrained(tok_path)
    else:
        # Fallback: base tokenizer + manual special token add
        base_dir  = str(PROJECT_ROOT / "tokenizer" / "fineweb_edu_tokenizer")
        tokenizer = PreTrainedTokenizerFast.from_pretrained(base_dir)
        tokenizer.add_special_tokens({
            "additional_special_tokens": ["<|im_start|>", "<|im_end|>"]
        })
    # ---- Checkpoint ------------------------------------------------ #
    ckpt_path = find_latest_ckpt(run_dir)
    # SECURITY: weights_only=False unpickles arbitrary Python objects — only
    # load checkpoints you produced yourself.
    # The checkpoint is deserialised on the CPU so that anything it carries
    # besides the model weights (possibly optimizer state — TODO confirm
    # against sft_train.py) never occupies GPU memory; load_state_dict
    # copies the weights onto `device`.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    # ---- Model ----------------------------------------------------- #
    model = SLLM(SLLM_150M).to(device)
    # Match the embedding table to the checkpoint before loading weights.
    saved_vocab = ckpt.get("vocab_size", len(tokenizer))
    resize_token_embeddings(model, saved_vocab)
    model.load_state_dict(ckpt["model_state_dict"])
    model.eval()
    step = ckpt.get("step", "?")
    loss = ckpt.get("loss", float("nan"))
    del ckpt  # release the host copy of the checkpoint
    return model, tokenizer, ckpt_path, step, loss
+# ------------------------------------------------------------------ #
+#  PROMPT BUILDING
+# ------------------------------------------------------------------ #
def build_prompt(history: list[dict], system_prompt: str,
                 tokenizer: "PreTrainedTokenizerFast") -> torch.Tensor:
    """
    Formats conversation history as ChatML and tokenises it.
    Template:
        <|im_start|>system
        {system}<|im_end|>
        <|im_start|>user
        {user}<|im_end|>
        <|im_start|>assistant
        {assistant}<|im_end|>
        ...
        <|im_start|>assistant\\n   ← left open for the model to complete
    Args:
        history       : turns in order, each a dict with 'role' and 'content'
        system_prompt : text of the leading system turn
        tokenizer     : tokenizer used to encode the prompt (no special
                        tokens are added automatically)
    Returns:
        input_ids : (1, T) LongTensor
    """
    segments = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    segments.extend(
        f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        for turn in history
    )
    # Prime the model to generate as assistant
    segments.append("<|im_start|>assistant\n")
    ids = tokenizer.encode("".join(segments), add_special_tokens=False)
    return torch.tensor([ids], dtype=torch.long)
+# ------------------------------------------------------------------ #
+#  GENERATION
+# ------------------------------------------------------------------ #
@torch.no_grad()
def generate_response(
    model:          "SLLM",
    input_ids:      torch.Tensor,
    tokenizer:      "PreTrainedTokenizerFast",
    max_new_tokens: int   = 300,
    temperature:    float = 0.8,
    top_k:          int   = 50,
    device:         "torch.device | None" = None,
) -> str:
    """
    Autoregressively generates tokens until:
      - <|im_end|> is produced (clean stop), or
      - eos_token_id is produced, or
      - max_new_tokens is reached
    Args:
        model          : callable returning (logits, _) for a (1, T) id tensor
        input_ids      : (1, T) prompt token ids
        tokenizer      : used to look up the stop tokens and decode the output
        max_new_tokens : upper bound on the number of generated tokens
        temperature    : logits are divided by this (clamped to >= 1e-8)
        top_k          : keep only the k most likely tokens (0/None = disabled)
        device         : device to run on; defaults to the device of the
                         model's first parameter
    Returns the decoded response string (special tokens stripped).
    """
    if device is None:
        # Follow the model so the prompt is never left on a different device.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else input_ids.device
    im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
    eos_id    = tokenizer.eos_token_id
    ctx_len   = model.config.context_length
    inv_temp  = max(temperature, 1e-8)
    ids       = input_ids.to(device)
    generated = []
    for _ in range(max_new_tokens):
        # Crop to context window (a no-op while the sequence still fits)
        ctx = ids[:, -ctx_len:]
        logits, _ = model(ctx)                           # (1, T, V)
        logits    = logits[:, -1, :] / inv_temp
        # Top-k filtering: mask everything below the k-th largest logit
        if top_k and top_k > 0:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = float("-inf")
        probs      = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)   # (1, 1)
        tok_id     = next_token.item()
        # Stop conditions — the stop token itself is not part of the reply
        if tok_id == im_end_id or tok_id == eos_id:
            break
        generated.append(tok_id)
        ids = torch.cat([ids, next_token], dim=1)
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
+# ------------------------------------------------------------------ #
+#  MAIN
+# ------------------------------------------------------------------ #
def parse_args():
    """
    Parses the chat CLI options from the process arguments.
    Returns:
        argparse.Namespace with run_dir, temperature, top_k,
        max_new_tokens and system.
    """
    parser = argparse.ArgumentParser(description="SLLM-150M Chat")
    parser.add_argument("--run_dir", type=str, default=DEFAULT_RUN_DIR)
    # The remaining options share one shape: (flag, type, default, help).
    option_table = (
        ("--temperature",    float, 0.8,            "Sampling temperature (lower = more focused)"),
        ("--top_k",          int,   50,             "Top-k sampling (0 = disabled)"),
        ("--max_new_tokens", int,   300,            "Max tokens per assistant response"),
        ("--system",         str,   DEFAULT_SYSTEM, "System prompt"),
    )
    for flag, kind, default, help_text in option_table:
        parser.add_argument(flag, type=kind, default=default, help=help_text)
    return parser.parse_args()
+def main():
+    args   = parse_args()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print("\n" + "=" * 60)
+    print("  SLLM-150M  Chat")
+    print("=" * 60)
+    print(f"  Device : {device}")
+    if device.type == "cuda":
+        print(f"  GPU    : {torch.cuda.get_device_name(0)}")
+    # ---- Load ------------------------------------------------------ #
+    print("\nLoading model...")
+    model, tokenizer, ckpt_path, step, loss = load_model_and_tokenizer(args.run_dir, device)
+    print(f"  Checkpoint : {ckpt_path}")
+    print(f"  Step       : {step}   Loss: {loss:.4f}")
+    print(f"  Vocab size : {len(tokenizer):,}")
+    # ---- Chat loop ------------------------------------------------- #
+    system_prompt = args.system
+    history: list[dict] = []
+    print(f"\n  System : {system_prompt}")
+    print("  Commands: /reset  |  /system <new prompt>  |  /quit")
+    print("─" * 60 + "\n")
+    while True:
+        try:
+            user_input = input("You: ").strip()
+        except (EOFError, KeyboardInterrupt):
+            print("\nBye!")
+            break
+        if not user_input:
+            continue
+        # ---- Commands ---------------------------------------------- #
+        if user_input.lower() in ("/quit", "/exit", "quit", "exit"):
+            print("Bye!")
+            break
+        if user_input.lower() == "/reset":
+            history = []
+            print("  [Conversation cleared]\n")
+            continue
+        if user_input.lower().startswith("/system "):
+            new_sys = user_input[8:].strip()
+            if new_sys:
+                system_prompt = new_sys
+                history       = []
+                print(f"  [System prompt updated. Conversation cleared.]\n")
+            continue
+        # ---- Build prompt ------------------------------------------ #
+        history.append({"role": "user", "content": user_input})
+        input_ids = build_prompt(history, system_prompt, tokenizer)
+        # Trim history if prompt is getting close to context limit
+        while input_ids.shape[1] > model.config.context_length - args.max_new_tokens - 10:
+            if len(history) > 2:
+                history = history[2:]   # drop oldest user+assistant pair
+                input_ids = build_prompt(history, system_prompt, tokenizer)
+            else:
+                break   # can't trim further — just truncate in generation
+        # ---- Generate ---------------------------------------------- #
+        print("SLLM: ", end="", flush=True)
+        response = generate_response(
+            model, input_ids, tokenizer,
+            max_new_tokens = args.max_new_tokens,
+            temperature    = args.temperature,
+            top_k          = args.top_k,
+            device         = device,
+        )
+        print(response + "\n")
+        history.append({"role": "assistant", "content": response})
+# Call main() only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

finetune/check_data.py ADDED Viewed

	@@ -0,0 +1,269 @@

+"""
+finetune/check_data.py
+Smoke-test: loads 5 rows from OpenHermes-2.5, runs them through the
+same format_and_tokenize() logic used by prepare_data.py, and prints
+a full visual audit so you can confirm everything lines up.
+Checks:
+  1. Raw conversation structure from the dataset
+  2. ChatML text that gets fed to the tokenizer
+  3. Token IDs and decoded tokens (side-by-side)
+  4. Label mask — ✓ (labeled) vs ✗ (masked -100) for every token
+  5. Label ratio (should be ~30-60% assistant tokens)
+Run from project root:
+    python finetune/check_data.py
+    python finetune/check_data.py --row 3    # inspect a specific row index
+"""
+import sys
+import argparse
+from pathlib import Path
+# ------------------------------------------------------------------ #
+#  Paths
+# ------------------------------------------------------------------ #
+# Directory containing this script (finetune/) and its parent (project root).
+SCRIPT_DIR   = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parent
+# Base tokenizer directory passed to PreTrainedTokenizerFast.from_pretrained().
+TOKENIZER_DIR = PROJECT_ROOT / "tokenizer" / "fineweb_edu_tokenizer"
+# Put the project root first on sys.path so project packages resolve when
+# this script is launched from the command line.
+sys.path.insert(0, str(PROJECT_ROOT))
+from transformers import PreTrainedTokenizerFast
+from datasets import load_dataset
+# ChatML turn delimiters that load_tokenizer() adds if they are missing.
+SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>"]
+# Sequences are cut to this many tokens in format_and_tokenize().
+MAX_LENGTH     = 1024
+# Maps the role names found in the dataset ("human", "gpt", ...) to the
+# ChatML role names written into the prompt; unknown roles pass through as-is.
+ROLE_MAP = {
+    "system":    "system",
+    "human":     "user",
+    "gpt":       "assistant",
+    "user":      "user",
+    "assistant": "assistant",
+}
+# ------------------------------------------------------------------ #
+#  Replicated from prepare_data.py  (no import to keep this self-contained)
+# ------------------------------------------------------------------ #
+def load_tokenizer() -> PreTrainedTokenizerFast:
+    tok = PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+    new = [t for t in SPECIAL_TOKENS if t not in tok.get_vocab()]
+    if new:
+        tok.add_special_tokens({"additional_special_tokens": new})
+    return tok
+def format_and_tokenize(conversations, tokenizer):
+    """Identical logic to prepare_data.py — returns (input_ids, labels) or None."""
+    input_ids, labels = [], []
+    for turn in conversations:
+        role_raw = turn.get("from", turn.get("role", "")).strip().lower()
+        content  = turn.get("value", turn.get("content", "")).strip()
+        role     = ROLE_MAP.get(role_raw, role_raw)
+        if not content or not role:
+            continue
+        header_text = f"<|im_start|>{role}\n"
+        header_ids  = tokenizer.encode(header_text, add_special_tokens=False)
+        body_text = f"{content}<|im_end|>\n"
+        body_ids  = tokenizer.encode(body_text, add_special_tokens=False)
+        turn_input = header_ids + body_ids
+        if role == "assistant":
+            turn_labels = [-100] * len(header_ids) + body_ids
+        else:
+            turn_labels = [-100] * len(turn_input)
+        input_ids.extend(turn_input)
+        labels.extend(turn_labels)
+    if not any(l != -100 for l in labels):
+        return None
+    input_ids = input_ids[:MAX_LENGTH]
+    labels    = labels[:MAX_LENGTH]
+    if len(input_ids) < 8:
+        return None
+    return input_ids, labels
+# ------------------------------------------------------------------ #
+#  Pretty-print helpers
+# ------------------------------------------------------------------ #
+def print_section(title: str):
+    print(f"\n{'─'*60}")
+    print(f"  {title}")
+    print(f"{'─'*60}")
+def print_token_table(input_ids, labels, tokenizer, max_rows: int = 80):
+    """
+    Prints a table:  idx | token_str | label  (✓ or ✗)
+    Green ✓ = labeled (assistant) — model learns this
+    Red   ✗ = masked -100         — model ignores this
+    """
+    GREEN = "\033[92m"
+    RED   = "\033[91m"
+    RESET = "\033[0m"
+    print(f"\n  {'IDX':>5}  {'TOKEN':<22}  {'ID':>6}  {'LABEL':>8}  {'LEARN?'}")
+    print(f"  {'─'*5}  {'─'*22}  {'─'*6}  {'─'*8}  {'─'*6}")
+    shown = 0
+    for i, (tok_id, lbl) in enumerate(zip(input_ids, labels)):
+        tok_str = repr(tokenizer.decode([tok_id]))[:22]
+        if lbl == -100:
+            learn_str = f"{RED}✗ masked{RESET}"
+            lbl_str   = "    -100"
+        else:
+            learn_str = f"{GREEN}✓ learn {RESET}"
+            lbl_str   = f"{lbl:>8}"
+        print(f"  {i:>5}  {tok_str:<22}  {tok_id:>6}  {lbl_str}  {learn_str}")
+        shown += 1
+        if shown >= max_rows:
+            remaining = len(input_ids) - max_rows
+            print(f"  ... ({remaining} more tokens not shown)")
+            break
+    # Summary
+    n_labeled = sum(1 for l in labels if l != -100)
+    n_total   = len(labels)
+    print(f"\n  Total tokens : {n_total}")
+    print(f"  Labeled      : {n_labeled}  ({n_labeled/n_total:.1%})  ← assistant tokens")
+    print(f"  Masked       : {n_total - n_labeled}  ({(n_total-n_labeled)/n_total:.1%})  ← user/system tokens")
+# ------------------------------------------------------------------ #
+#  MAIN
+# ------------------------------------------------------------------ #
+def parse_args():
+    p = argparse.ArgumentParser(description="Check one OpenHermes row through the SFT pipeline")
+    p.add_argument("--row", type=int, default=0,
+                   help="Which row to inspect in detail (0-indexed, from the first 20 fetched)")
+    p.add_argument("--n_fetch", type=int, default=20,
+                   help="How many rows to fetch from HuggingFace (default: 20)")
+    return p.parse_args()
+def main():
+    args = parse_args()
+    print("\n" + "=" * 60)
+    print("  SFT Pipeline — Data Alignment Check")
+    print("=" * 60)
+    # ---- 1. Tokenizer ---------------------------------------------- #
+    print_section("1. Tokenizer")
+    tokenizer = load_tokenizer()
+    im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
+    im_end_id   = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    print(f"  Vocab size    : {len(tokenizer):,}")
+    print(f"  <|im_start|>  : token ID {im_start_id}")
+    print(f"  <|im_end|>    : token ID {im_end_id}")
+    assert im_start_id != tokenizer.unk_token_id, "ERROR: <|im_start|> not in vocab!"
+    assert im_end_id   != tokenizer.unk_token_id, "ERROR: <|im_end|> not in vocab!"
+    print("  ✓ Special tokens present in vocab")
+    # ---- 2. Load one row ------------------------------------------- #
+    print_section(f"2. Loading row {args.row} from OpenHermes-2.5")
+    print(f"  Loading first {args.n_fetch} rows from local cache (Arrow format)...")
+    ds    = load_dataset("teknium/OpenHermes-2.5", split="train")
+    row   = ds[args.row]
+    convs = row.get("conversations", [])
+    print(f"  Row index     : {args.row}")
+    print(f"  Turns in conv : {len(convs)}")
+    # ---- 3. Raw conversation --------------------------------------- #
+    print_section("3. Raw conversation (from dataset)")
+    for i, turn in enumerate(convs):
+        role    = turn.get("from", "?")
+        content = turn.get("value", "").strip()
+        preview = content[:120].replace("\n", "↵")
+        print(f"  [{i}] from={role!r:12s}  |  {preview!r}")
+    # ---- 4. ChatML formatted text ---------------------------------- #
+    print_section("4. ChatML text (what tokenizer sees)")
+    chatml = ""
+    for turn in convs:
+        role_raw = turn.get("from", "").strip().lower()
+        content  = turn.get("value", "").strip()
+        role     = ROLE_MAP.get(role_raw, role_raw)
+        if content and role:
+            chatml += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+    print(chatml[:800])
+    if len(chatml) > 800:
+        print(f"  ... ({len(chatml) - 800} more chars)")
+    # ---- 5. Run through format_and_tokenize ----------------------- #
+    print_section("5. format_and_tokenize() output")
+    result = format_and_tokenize(convs, tokenizer)
+    if result is None:
+        print("  ✗ RETURNED None — no assistant turn or too short.")
+        print("  Try a different --row index.")
+        return
+    input_ids, labels = result
+    print(f"  input_ids length : {len(input_ids)}")
+    print(f"  labels length    : {len(labels)}")
+    assert len(input_ids) == len(labels), "MISMATCH: input_ids and labels have different lengths!"
+    print("  ✓ Lengths match")
+    # ---- 6. Verify label alignment --------------------------------- #
+    print_section("6. Label alignment sanity checks")
+    # Every im_start should be masked
+    im_start_positions = [i for i, t in enumerate(input_ids) if t == im_start_id]
+    im_end_positions   = [i for i, t in enumerate(input_ids) if t == im_end_id]
+    print(f"  <|im_start|> positions : {im_start_positions}")
+    print(f"  <|im_end|>   positions : {im_end_positions}")
+    im_start_masked = all(labels[i] == -100 for i in im_start_positions)
+    print(f"  All <|im_start|> tokens are masked (-100) : {'✓' if im_start_masked else '✗ FAIL'}")
+    # Decode the labeled span to confirm it's the assistant content
+    labeled_ids = [t for t, l in zip(input_ids, labels) if l != -100]
+    labeled_text = tokenizer.decode(labeled_ids, skip_special_tokens=False)
+    print(f"\n  Labeled (assistant) text preview:")
+    print(f"  {labeled_text[:300].replace(chr(10), '↵')!r}")
+    # Check that labeled text doesn't contain user/system markers
+    if "user\n" in labeled_text or "system\n" in labeled_text:
+        print("  ✗ WARNING: user/system content found in labeled tokens!")
+    else:
+        print("  ✓ Labeled tokens contain only assistant content")
+    # ---- 7. Token-by-token table ----------------------------------- #
+    print_section("7. Token-by-token table (first 80 tokens)")
+    print_token_table(input_ids, labels, tokenizer, max_rows=80)
+    # ---- 8. Decode round-trip ------------------------------------- #
+    print_section("8. Full decode round-trip (skip_special_tokens=False)")
+    decoded = tokenizer.decode(input_ids, skip_special_tokens=False)
+    print(decoded[:600])
+    print("\n" + "=" * 60)
+    print("  CHECK COMPLETE — pipeline looks aligned ✓")
+    print("=" * 60)
+    print(f"\nWhen ready, run the full data prep:")
+    print(f"  python finetune/prepare_data.py")
+# Run the audit only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

finetune/data/meta.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "dataset": "teknium/OpenHermes-2.5",
+  "n_sampled": 80000,
+  "n_train": 76000,
+  "n_val": 4000,
+  "vocab_size": 32002,
+  "special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "max_length": 1024,
+  "seed": 42
+}

finetune/data/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

finetune/data/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "tokenizer_class": "TokenizersBackend",
+  "truncation_side": "right",
+  "unk_token": null
+}

finetune/prepare_data.py ADDED Viewed

	@@ -0,0 +1,303 @@

+"""
+finetune/prepare_data.py
+Downloads teknium/OpenHermes-2.5 from HuggingFace, formats conversations
+as ChatML, tokenizes with our custom tokenizer + 2 new special tokens,
+and saves train_sft.pt / val_sft.pt to finetune/data/.
+Also saves the tokenizer (with special tokens baked in) to finetune/data/
+so sft_train.py and chat.py can load it without re-adding tokens.
+Usage:
+    python finetune/prepare_data.py
+    python finetune/prepare_data.py --n_samples 50000
+Dataset structure (OpenHermes-2.5):
+    Each row has a "conversations" key:
+    [
+        {"from": "system",  "value": "..."},   # optional
+        {"from": "human",   "value": "..."},
+        {"from": "gpt",     "value": "..."},
+        ...                                     # may have more turns
+    ]
+"""
+import os
+import sys
+import json
+import random
+import argparse
+from pathlib import Path
+import torch
+from transformers import PreTrainedTokenizerFast
+from datasets import load_dataset
+from tqdm import tqdm
+# ------------------------------------------------------------------ #
+#  Paths  (relative to project root, not this script)
+# ------------------------------------------------------------------ #
+# Directory containing this script (finetune/) and its parent (project root).
+SCRIPT_DIR   = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parent
+# Put the project root first on sys.path so project packages resolve when
+# this script is launched from the command line.
+sys.path.insert(0, str(PROJECT_ROOT))
+# Base tokenizer directory passed to PreTrainedTokenizerFast.from_pretrained().
+TOKENIZER_DIR = PROJECT_ROOT / "tokenizer" / "fineweb_edu_tokenizer"
+# The two new tokens that define ChatML structure
+SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>"]
+MAX_LENGTH = 1024   # model context_length — truncate anything longer
+# Map OpenHermes role names → ChatML role names
+# (roles not listed here are passed through unchanged by format_and_tokenize)
+ROLE_MAP = {
+    "system":    "system",
+    "human":     "user",
+    "gpt":       "assistant",
+    "user":      "user",
+    "assistant": "assistant",
+}
+# ------------------------------------------------------------------ #
+#  TOKENIZER
+# ------------------------------------------------------------------ #
+def load_and_extend_tokenizer() -> PreTrainedTokenizerFast:
+    """
+    Loads our pretrained BPE tokenizer and adds the two ChatML tokens.
+    Returns the extended tokenizer (vocab 32,000 → 32,002).
+    """
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+    new_tokens = [t for t in SPECIAL_TOKENS if t not in tokenizer.get_vocab()]
+    if new_tokens:
+        added = tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
+        print(f"  Added {added} special token(s): {new_tokens}")
+    else:
+        print("  Special tokens already present — skipping add.")
+    print(f"  Final vocab size: {len(tokenizer):,}")
+    return tokenizer
+# ------------------------------------------------------------------ #
+#  FORMAT + TOKENIZE ONE CONVERSATION
+# ------------------------------------------------------------------ #
+def format_and_tokenize(
+    conversations: list[dict],
+    tokenizer:     PreTrainedTokenizerFast,
+) -> tuple[list[int], list[int]] | None:
+    """
+    Converts a list of chat turns into (input_ids, labels).
+    ChatML format per turn:
+        <|im_start|>{role}\\n{content}<|im_end|>\\n
+    Labels:
+        - User / system turns  → all -100  (not learned)
+        - Assistant turns      → header (-100) + content (actual token ids)
+          i.e. we learn the response but not the "<|im_start|>assistant\\n" prefix
+    Returns None for:
+        - Conversations with no assistant turns (nothing to learn)
+        - Conversations that tokenize to fewer than 8 tokens
+    """
+    input_ids: list[int] = []
+    labels:    list[int] = []
+    for turn in conversations:
+        role_raw = turn.get("from", turn.get("role", "")).strip().lower()
+        content  = turn.get("value", turn.get("content", "")).strip()
+        role     = ROLE_MAP.get(role_raw, role_raw)
+        if not content or not role:
+            continue
+        # ---- header: <|im_start|>role\n  — never labeled ----------- #
+        header_text = f"<|im_start|>{role}\n"
+        header_ids  = tokenizer.encode(header_text, add_special_tokens=False)
+        # ---- body: content<|im_end|>\n ------------------------------ #
+        body_text = f"{content}<|im_end|>\n"
+        body_ids  = tokenizer.encode(body_text, add_special_tokens=False)
+        turn_input  = header_ids + body_ids
+        if role == "assistant":
+            # Teach the model the body (response + im_end), not the header
+            turn_labels = [-100] * len(header_ids) + body_ids
+        else:
+            # User / system: no learning signal
+            turn_labels = [-100] * len(turn_input)
+        input_ids.extend(turn_input)
+        labels.extend(turn_labels)
+    # Must have at least one labeled token to be a valid training example
+    if not any(l != -100 for l in labels):
+        return None
+    # Truncate to context window
+    input_ids = input_ids[:MAX_LENGTH]
+    labels    = labels[:MAX_LENGTH]
+    # Skip micro-sequences (likely malformed)
+    if len(input_ids) < 8:
+        return None
+    return input_ids, labels
+# ------------------------------------------------------------------ #
+#  ARG PARSING
+# ------------------------------------------------------------------ #
+def parse_args():
+    p = argparse.ArgumentParser(description="Prepare SFT data from OpenHermes-2.5")
+    p.add_argument("--n_samples",   type=int,   default=80_000,
+                   help="Number of conversations to sample (default: 80000)")
+    p.add_argument("--val_ratio",   type=float, default=0.05,
+                   help="Fraction held out for validation (default: 0.05)")
+    p.add_argument("--output_dir",  type=str,   default=str(SCRIPT_DIR / "data"),
+                   help="Where to save train_sft.pt, val_sft.pt, and tokenizer")
+    p.add_argument("--seed",        type=int,   default=42)
+    return p.parse_args()
+# ------------------------------------------------------------------ #
+#  MAIN
+# ------------------------------------------------------------------ #
+def main():
+    """
+    End-to-end SFT data preparation.
+    Steps:
+        1. Load the base tokenizer, add the ChatML tokens, save it to --output_dir.
+        2. Load teknium/OpenHermes-2.5 and draw a seeded random subset.
+        3. Run every conversation through format_and_tokenize(), skipping
+           those that return None.
+        4. Shuffle, split off the validation share, and write train_sft.pt,
+           val_sft.pt and meta.json into --output_dir.
+    Raises:
+        RuntimeError: if no conversation survives tokenization.
+    """
+    args = parse_args()
+    # Seeds both the subset sampling and the train/val shuffle below.
+    random.seed(args.seed)
+    os.makedirs(args.output_dir, exist_ok=True)
+    print("\n" + "=" * 60)
+    print("  SLLM-150M SFT — Data Preparation")
+    print("=" * 60)
+    # ---------------------------------------------------------------- #
+    # 1. Tokenizer
+    # ---------------------------------------------------------------- #
+    print("\n[1/4] Loading tokenizer + adding ChatML special tokens...")
+    tokenizer = load_and_extend_tokenizer()
+    # Save the extended tokenizer to data dir so training/chat can load it
+    tokenizer.save_pretrained(args.output_dir)
+    print(f"  Extended tokenizer saved → {args.output_dir}/")
+    # ---------------------------------------------------------------- #
+    # 2. Dataset download
+    # ---------------------------------------------------------------- #
+    print(f"\n[2/4] Loading teknium/OpenHermes-2.5 from HuggingFace...")
+    ds = load_dataset("teknium/OpenHermes-2.5")
+    full = ds["train"]   # only split in this dataset
+    print(f"  Full dataset size: {len(full):,} examples")
+    # Sample a subset
+    n = min(args.n_samples, len(full))
+    indices = random.sample(range(len(full)), n)
+    subset  = full.select(indices)
+    print(f"  Sampled: {n:,} examples (seed={args.seed})")
+    # ---------------------------------------------------------------- #
+    # 3. Tokenize
+    # ---------------------------------------------------------------- #
+    print(f"\n[3/4] Formatting and tokenizing conversations...")
+    all_input_ids: list[torch.Tensor] = []
+    all_labels:    list[torch.Tensor] = []
+    skipped = 0
+    for example in tqdm(subset, desc="Tokenizing", unit="conv"):
+        conversations = example.get("conversations", [])
+        result = format_and_tokenize(conversations, tokenizer)
+        if result is None:
+            skipped += 1
+            continue
+        ids, lbls = result
+        all_input_ids.append(torch.tensor(ids,  dtype=torch.long))
+        all_labels.append(   torch.tensor(lbls, dtype=torch.long))
+    total = len(all_input_ids)
+    print(f"\n  Kept   : {total:,}")
+    print(f"  Skipped: {skipped:,}  (no assistant turn or too short)")
+    if total == 0:
+        raise RuntimeError("No valid examples produced — check dataset structure.")
+    # Print a sample so we can visually verify
+    print("\n  ── Sample (first conversation, first 400 chars) ──")
+    sample_decoded = tokenizer.decode(all_input_ids[0].tolist(), skip_special_tokens=False)
+    print("  " + sample_decoded[:400].replace("\n", "\n  "))
+    print()
+    # ---------------------------------------------------------------- #
+    # 4. Split + save
+    # ---------------------------------------------------------------- #
+    print(f"[4/4] Splitting and saving...")
+    perm    = list(range(total))
+    random.shuffle(perm)
+    # At least one validation example is always held out.
+    val_n   = max(1, int(total * args.val_ratio))
+    train_n = total - val_n
+    train_ids = [all_input_ids[i] for i in perm[:train_n]]
+    train_lbl = [all_labels[i]    for i in perm[:train_n]]
+    val_ids   = [all_input_ids[i] for i in perm[train_n:]]
+    val_lbl   = [all_labels[i]    for i in perm[train_n:]]
+    train_path = os.path.join(args.output_dir, "train_sft.pt")
+    val_path   = os.path.join(args.output_dir, "val_sft.pt")
+    # Each shard is a dict of two parallel lists of variable-length LongTensors.
+    torch.save({"input_ids": train_ids, "labels": train_lbl}, train_path)
+    torch.save({"input_ids": val_ids,   "labels": val_lbl},   val_path)
+    # Stats
+    lengths       = [len(x) for x in all_input_ids]
+    label_ratios  = [(t != -100).float().mean().item() for t in all_labels]
+    avg_len       = sum(lengths) / len(lengths)
+    avg_lbl_ratio = sum(label_ratios) / len(label_ratios)
+    print(f"\n  train_sft.pt  : {train_n:,} examples")
+    print(f"  val_sft.pt    : {val_n:,}   examples")
+    print(f"\n  Avg seq length         : {avg_len:.0f} tokens  (max={max(lengths)})")
+    print(f"  Avg assistant ratio    : {avg_lbl_ratio:.1%}  of tokens are labeled")
+    # Save metadata for reference
+    meta = {
+        "dataset":        "teknium/OpenHermes-2.5",
+        "n_sampled":      n,
+        "n_train":        train_n,
+        "n_val":          val_n,
+        "vocab_size":     len(tokenizer),
+        "special_tokens": SPECIAL_TOKENS,
+        "max_length":     MAX_LENGTH,
+        "seed":           args.seed,
+    }
+    with open(os.path.join(args.output_dir, "meta.json"), "w") as f:
+        json.dump(meta, f, indent=2)
+    print(f"\n  meta.json saved → {args.output_dir}/meta.json")
+    print("\n" + "=" * 60)
+    print("  Data preparation complete!")
+    print("=" * 60)
+    print(f"""
+Next step:
+    python finetune/sft_train.py \\
+        --base_ckpt runs/sllm_150m/ckpt_0011500.pt \\
+        --run_dir   runs/sllm_150m_chat \\
+        --max_steps 2000 \\
+        --batch_size 4 --grad_accum 8 \\
+        --grad_checkpoint
+""")
+# Run data preparation only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

finetune/sft_dataset.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""
+finetune/sft_dataset.py
+SFT Dataset — loads pre-tokenized ChatML sequences from .pt shards
+produced by prepare_data.py.
+Each item returns (input_ids, labels) where labels has -100 for all
+non-assistant tokens so CrossEntropy only trains on assistant responses.
+"""
+from functools import partial
+import torch
+from torch.utils.data import Dataset, DataLoader
+class SFTDataset(Dataset):
+    """
+    Dataset for Supervised Fine-Tuning.
+    Loads a .pt shard containing:
+        {
+          "input_ids": list of LongTensors  (variable length),
+          "labels":    list of LongTensors  (same shapes, -100 for masked)
+        }
+    Each __getitem__ returns:
+        input_ids : (seq_len,) LongTensor
+        labels    : (seq_len,) LongTensor  — -100 for user/system tokens
+    """
+    def __init__(self, data_path: str, context_length: int = 1024):
+        data = torch.load(data_path, weights_only=False)
+        self.input_ids      = data["input_ids"]
+        self.labels         = data["labels"]
+        self.context_length = context_length
+        assert len(self.input_ids) == len(self.labels), "input_ids / labels length mismatch"
+        print(f"[SFTDataset] Loaded {len(self.input_ids):,} examples from {data_path}")
+    def __len__(self) -> int:
+        return len(self.input_ids)
+    def __getitem__(self, idx):
+        ids = self.input_ids[idx]
+        lbl = self.labels[idx]
+        # Hard-truncate to model context length
+        if len(ids) > self.context_length:
+            ids = ids[: self.context_length]
+            lbl = lbl[: self.context_length]
+        return ids, lbl
+# ------------------------------------------------------------------ #
+#  COLLATE
+# ------------------------------------------------------------------ #
+def sft_collate_fn(batch, pad_token_id: int):
+    """
+    Pads a batch of variable-length sequences to the same length.
+        input_ids → padded with pad_token_id
+        labels    → padded with -100  (ignored by CrossEntropy)
+    """
+    input_ids_list, labels_list = zip(*batch)
+    max_len = max(x.size(0) for x in input_ids_list)
+    input_ids_padded = torch.full((len(batch), max_len), pad_token_id, dtype=torch.long)
+    labels_padded    = torch.full((len(batch), max_len), -100,         dtype=torch.long)
+    for i, (ids, lbl) in enumerate(zip(input_ids_list, labels_list)):
+        n = ids.size(0)
+        input_ids_padded[i, :n] = ids
+        labels_padded[i, :n]    = lbl
+    return input_ids_padded, labels_padded
+# ------------------------------------------------------------------ #
+#  FACTORY
+# ------------------------------------------------------------------ #
+def build_sft_dataloader(
+    data_path:      str,
+    batch_size:     int,
+    pad_token_id:   int,
+    context_length: int  = 1024,
+    num_workers:    int  = 0,
+    shuffle:        bool = True,
+) -> DataLoader:
+    dataset    = SFTDataset(data_path, context_length=context_length)
+    collate_fn = partial(sft_collate_fn, pad_token_id=pad_token_id)
+    return DataLoader(
+        dataset,
+        batch_size  = batch_size,
+        shuffle     = shuffle,
+        num_workers = num_workers,
+        collate_fn  = collate_fn,
+        pin_memory  = True,
+    )

finetune/sft_train.py ADDED Viewed

	@@ -0,0 +1,563 @@

+"""
+finetune/sft_train.py
+Full Supervised Fine-Tuning (SFT) of SLLM-150M → Chat Model.
+Starts from the pretrained base checkpoint, resizes the token embedding
+for 2 new ChatML special tokens, then trains with masked CrossEntropy
+so only assistant response tokens contribute to the loss.
+Usage (first run):
+    python finetune/sft_train.py \\
+        --base_ckpt runs/sllm_150m/ckpt_0011500.pt \\
+        --run_dir   runs/sllm_150m_chat \\
+        --max_steps 2000 \\
+        --batch_size 4 --grad_accum 8 \\
+        --grad_checkpoint
+Resume:
+    python finetune/sft_train.py \\
+        --resume --run_dir runs/sllm_150m_chat \\
+        --extra_steps 1000
+"""
+import os
+import sys
+import json
+import math
+import time
+import signal
+import argparse
+from pathlib import Path
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.amp import autocast, GradScaler
+from transformers import PreTrainedTokenizerFast
+from tqdm import tqdm
+# ------------------------------------------------------------------ #
+#  Resolve project root so model/ is importable
+# ------------------------------------------------------------------ #
+SCRIPT_DIR   = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parent
+DATA_DIR     = SCRIPT_DIR / "data"
+sys.path.insert(0, str(PROJECT_ROOT))
+sys.path.insert(0, str(SCRIPT_DIR))   # so we can import sft_dataset
+from model.config    import SLLM_150M
+from model.model     import SLLM
+from sft_dataset     import build_sft_dataloader
+# ------------------------------------------------------------------ #
+#  ARG PARSING
+# ------------------------------------------------------------------ #
+def parse_args():
+    p = argparse.ArgumentParser(description="SLLM-150M SFT Training")
+    # Checkpoints
+    p.add_argument("--base_ckpt",   type=str,
+                   default=str(PROJECT_ROOT / "runs" / "sllm_150m" / "ckpt_0011500.pt"),
+                   help="Path to pretrained base checkpoint (.pt)")
+    p.add_argument("--run_dir",     type=str, default="runs/sllm_150m_chat",
+                   help="Output directory for SFT checkpoints and logs")
+    p.add_argument("--resume",      action="store_true",
+                   help="Resume from latest SFT checkpoint in --run_dir")
+    p.add_argument("--max_steps",   type=int, default=2000,
+                   help="Absolute step target for this run")
+    p.add_argument("--extra_steps", type=int, default=None,
+                   help="Run N more steps from current checkpoint (relative)")
+    # Data
+    p.add_argument("--data_dir",    type=str, default=str(DATA_DIR),
+                   help="Directory with train_sft.pt, val_sft.pt, and tokenizer files")
+    p.add_argument("--num_workers", type=int, default=0)
+    # Optimisation — note: much lower LR than pretraining
+    p.add_argument("--batch_size",    type=int,   default=4)
+    p.add_argument("--grad_accum",    type=int,   default=8)
+    p.add_argument("--max_lr",        type=float, default=1e-5,
+                   help="Peak LR (10x lower than pretraining)")
+    p.add_argument("--min_lr",        type=float, default=1e-6)
+    p.add_argument("--warmup_steps",  type=int,   default=30)
+    p.add_argument("--weight_decay",  type=float, default=0.1)
+    p.add_argument("--grad_clip",     type=float, default=1.0)
+    p.add_argument("--dropout",       type=float, default=0.1,
+                   help="Dropout rate during SFT (0.0 in pretraining)")
+    # Memory
+    p.add_argument("--grad_checkpoint", action="store_true",
+                   help="Enable gradient checkpointing (saves VRAM)")
+    p.add_argument("--dtype",           type=str, default="bf16",
+                   choices=["fp32", "fp16", "bf16"])
+    # Logging
+    p.add_argument("--log_every",   type=int, default=10)
+    p.add_argument("--save_every",  type=int, default=500)
+    p.add_argument("--val_every",   type=int, default=250)
+    p.add_argument("--val_steps",   type=int, default=20)
+    return p.parse_args()
+# ------------------------------------------------------------------ #
+#  VOCAB RESIZE
+# ------------------------------------------------------------------ #
+def resize_token_embeddings(model: SLLM, new_vocab_size: int):
+    """
+    Grows model.token_emb from old_vocab_size → new_vocab_size.
+    New rows are initialised to the mean of existing embeddings so
+    training starts from a stable point rather than random noise.
+    lm_head weight-tying is re-applied automatically.
+    """
+    old_size = model.config.vocab_size
+    if new_vocab_size == old_size:
+        return
+    if new_vocab_size < old_size:
+        raise ValueError(f"Cannot shrink vocab ({old_size} → {new_vocab_size})")
+    d_model    = model.config.d_model
+    device     = model.token_emb.weight.device
+    dtype      = model.token_emb.weight.dtype
+    old_weight = model.token_emb.weight.data.clone()   # (old_size, d)
+    mean_vec   = old_weight.mean(dim=0)                # (d,)
+    new_weight = torch.zeros(new_vocab_size, d_model, dtype=dtype, device=device)
+    new_weight[:old_size] = old_weight
+    # Broadcast mean_vec into new rows
+    new_weight[old_size:] = mean_vec.unsqueeze(0).expand(new_vocab_size - old_size, -1)
+    # Replace the embedding module in-place
+    new_emb = nn.Embedding(new_vocab_size, d_model).to(device=device, dtype=dtype)
+    new_emb.weight.data = new_weight
+    model.token_emb = new_emb
+    # Re-tie the LM head to the (now larger) embedding
+    model.lm_head.weight = model.token_emb.weight
+    # Keep config consistent
+    model.config.vocab_size = new_vocab_size
+    n_new = new_vocab_size - old_size
+    print(f"  Vocab resized: {old_size:,} → {new_vocab_size:,}  (+{n_new} tokens, init=mean)")
+# ------------------------------------------------------------------ #
+#  DROPOUT
+# ------------------------------------------------------------------ #
+def set_dropout(model: SLLM, rate: float):
+    """Applies dropout rate to every nn.Dropout in the model."""
+    count = 0
+    for m in model.modules():
+        if isinstance(m, nn.Dropout):
+            m.p = rate
+            count += 1
+    if count:
+        print(f"  Dropout set to {rate} on {count} layer(s)")
+# ------------------------------------------------------------------ #
+#  LR SCHEDULE  (cosine with linear warmup, same shape as train.py)
+# ------------------------------------------------------------------ #
+def get_lr(step: int, warmup_steps: int, total_steps: int,
+           max_lr: float, min_lr: float) -> float:
+    if step < warmup_steps:
+        return max_lr * (step + 1) / warmup_steps
+    decay_steps = total_steps if total_steps else 5_000
+    if step >= decay_steps:
+        return min_lr
+    progress = (step - warmup_steps) / max(1, decay_steps - warmup_steps)
+    coeff    = 0.5 * (1.0 + math.cos(math.pi * progress))
+    return min_lr + coeff * (max_lr - min_lr)
+# ------------------------------------------------------------------ #
+#  OPTIMIZER  (mirrors train.py — AdamW selective decay)
+# ------------------------------------------------------------------ #
+def build_optimizer(model: SLLM, lr: float, weight_decay: float):
+    decay, no_decay = [], []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+        if param.dim() >= 2:
+            decay.append(param)
+        else:
+            no_decay.append(param)
+    groups = [
+        {"params": decay,    "weight_decay": weight_decay},
+        {"params": no_decay, "weight_decay": 0.0},
+    ]
+    n_d  = sum(p.numel() for p in decay)
+    n_nd = sum(p.numel() for p in no_decay)
+    print(f"  Optimizer: {n_d/1e6:.1f}M decay | {n_nd/1e6:.1f}M no-decay | lr={lr:.2e}")
+    # Note: no fused=True here — new embedding rows need correct grad flow
+    return torch.optim.AdamW(groups, lr=lr, betas=(0.9, 0.95), eps=1e-8)
+# ------------------------------------------------------------------ #
+#  CHECKPOINT SAVE / LOAD
+# ------------------------------------------------------------------ #
+def save_checkpoint(path: str, model: SLLM, optimizer, step: int,
+                    loss: float, vocab_size: int):
+    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+    torch.save({
+        "step":                 step,
+        "model_state_dict":     model.state_dict(),
+        "optimizer_state_dict": optimizer.state_dict(),
+        "loss":                 loss,
+        "vocab_size":           vocab_size,
+    }, path)
+    print(f"\n  [CKPT] Saved: {path}  (step={step}, loss={loss:.4f})")
+def load_sft_checkpoint(run_dir: str, model: SLLM, optimizer, device):
+    """Loads the latest ckpt_sft_*.pt from run_dir. Returns (step, vocab_size)."""
+    ckpts = sorted([
+        f for f in os.listdir(run_dir)
+        if f.startswith("ckpt_sft_") and f.endswith(".pt")
+    ])
+    if not ckpts:
+        raise FileNotFoundError(f"No SFT checkpoints found in {run_dir}")
+    path = os.path.join(run_dir, ckpts[-1])
+    ckpt = torch.load(path, map_location=device, weights_only=False)
+    model.load_state_dict(ckpt["model_state_dict"])
+    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
+    step       = ckpt["step"]
+    vocab_size = ckpt.get("vocab_size", model.config.vocab_size)
+    loss       = ckpt.get("loss", float("nan"))
+    print(f"  [CKPT] Resumed from: {path}  (step={step}, loss={loss:.4f})")
+    return step, vocab_size
+# ------------------------------------------------------------------ #
+#  VALIDATION  (uses ignore_index=-100 like training)
+# ------------------------------------------------------------------ #
+@torch.no_grad()
+def estimate_val_loss(model: SLLM, val_loader, val_steps: int,
+                      device, dtype_ctx) -> float:
+    model.eval()
+    losses = []
+    for i, (x, y) in enumerate(val_loader):
+        if i >= val_steps:
+            break
+        x, y = x.to(device), y.to(device)
+        with dtype_ctx:
+            logits, _ = model(x)
+            # Shift logits and labels by 1 to predict the next token
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = y[..., 1:].contiguous()
+            loss = F.cross_entropy(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1),
+                ignore_index=-100,
+            )
+        losses.append(loss.item())
+    model.train()
+    return sum(losses) / len(losses) if losses else float("nan")
+# ------------------------------------------------------------------ #
+#  METRIC LOGGER
+# ------------------------------------------------------------------ #
+class MetricLogger:
+    def __init__(self, log_path: str):
+        self.log_path = log_path
+        os.makedirs(os.path.dirname(os.path.abspath(log_path)), exist_ok=True)
+        print(f"  [LOG] Logging to: {log_path}")
+    def log(self, **kwargs):
+        with open(self.log_path, "a") as f:
+            f.write(json.dumps(kwargs) + "\n")
+# ------------------------------------------------------------------ #
+#  MAIN TRAINING LOOP
+# ------------------------------------------------------------------ #
+def train():
+    args   = parse_args()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"\n{'='*60}")
+    print(f"  SLLM-150M  →  Chat Model  (SFT)")
+    print(f"{'='*60}")
+    print(f"\nDevice  : {device}")
+    if device.type == "cuda":
+        print(f"GPU     : {torch.cuda.get_device_name(0)}")
+        print(f"VRAM    : {torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")
+    # ---- dtype ----------------------------------------------------- #
+    if args.dtype == "bf16" and device.type == "cuda" and torch.cuda.is_bf16_supported():
+        dtype_torch, dtype_name = torch.bfloat16, "bf16"
+    elif args.dtype == "fp16" and device.type == "cuda":
+        dtype_torch, dtype_name = torch.float16, "fp16"
+    else:
+        dtype_torch, dtype_name = torch.float32, "fp32"
+    print(f"dtype   : {dtype_name}")
+    use_amp   = dtype_torch in (torch.float16, torch.bfloat16)
+    dtype_ctx = (autocast(device_type=device.type, dtype=dtype_torch)
+                 if use_amp else torch.no_grad().__class__())
+    scaler    = GradScaler(enabled=(dtype_torch == torch.float16))
+    # ---- Tokenizer ------------------------------------------------- #
+    print("\n[1/5] Loading tokenizer...")
+    tok_path = args.data_dir
+    if os.path.exists(os.path.join(tok_path, "tokenizer.json")):
+        # Prefer the saved tokenizer from prepare_data.py (has special tokens)
+        tokenizer = PreTrainedTokenizerFast.from_pretrained(tok_path)
+        print(f"  Loaded from data dir: {tok_path}")
+    else:
+        # Fallback: load base tokenizer and add special tokens manually
+        base_tok_dir = str(PROJECT_ROOT / "tokenizer" / "fineweb_edu_tokenizer")
+        tokenizer    = PreTrainedTokenizerFast.from_pretrained(base_tok_dir)
+        tokenizer.add_special_tokens({"additional_special_tokens":
+                                      ["<|im_start|>", "<|im_end|>"]})
+        print(f"  Loaded base tokenizer + added special tokens")
+    new_vocab_size = len(tokenizer)
+    pad_id         = tokenizer.pad_token_id if tokenizer.pad_token_id is not None \
+                     else tokenizer.eos_token_id
+    print(f"  Vocab size : {new_vocab_size:,}")
+    print(f"  Pad token  : {pad_id}")
+    # ---- Model ----------------------------------------------------- #
+    print("\n[2/5] Loading model...")
+    cfg   = SLLM_150M
+    model = SLLM(cfg).to(device)
+    if not args.resume:
+        # Load pretrained base weights (step 11,500)
+        print(f"  Loading base checkpoint: {args.base_ckpt}")
+        base_ckpt = torch.load(args.base_ckpt, map_location=device, weights_only=False)
+        model.load_state_dict(base_ckpt["model_state_dict"])
+        base_step = base_ckpt.get("step", "?")
+        base_loss = base_ckpt.get("loss", float("nan"))
+        print(f"  Base model  step={base_step}  loss={base_loss:.4f}")
+        del base_ckpt
+    # Grow embedding for the 2 new special tokens
+    resize_token_embeddings(model, new_vocab_size)
+    # Apply SFT dropout (was 0.0 in pretraining)
+    set_dropout(model, args.dropout)
+    if args.grad_checkpoint:
+        model.enable_gradient_checkpointing()
+        print("  Gradient checkpointing: ON")
+    print(f"  Model params: {model.count_params()/1e6:.1f}M")
+    # ---- Optimizer ------------------------------------------------- #
+    print("\n[3/5] Building optimizer...")
+    optimizer = build_optimizer(model, lr=args.max_lr, weight_decay=args.weight_decay)
+    # ---- Resume from SFT checkpoint -------------------------------- #
+    start_step = 0
+    if args.resume:
+        try:
+            start_step, _ = load_sft_checkpoint(args.run_dir, model, optimizer, device)
+        except FileNotFoundError as e:
+            print(f"  [WARN] {e} — starting SFT from base checkpoint.")
+    # Resolve --extra_steps → --max_steps
+    if args.extra_steps is not None:
+        args.max_steps = start_step + args.extra_steps
+        print(f"  --extra_steps {args.extra_steps} → max_steps={args.max_steps}")
+    if args.max_steps is not None and start_step >= args.max_steps:
+        print(f"\n  [WARN] Already at step {start_step} >= max_steps {args.max_steps}.")
+        print(f"  Use --extra_steps N to run N more steps.")
+        return
+    # ---- Data ------------------------------------------------------ #
+    print("\n[4/5] Loading SFT dataset...")
+    train_path = os.path.join(args.data_dir, "train_sft.pt")
+    val_path   = os.path.join(args.data_dir, "val_sft.pt")
+    train_loader = build_sft_dataloader(
+        data_path=train_path, batch_size=args.batch_size,
+        pad_token_id=pad_id, context_length=cfg.context_length,
+        num_workers=args.num_workers, shuffle=True,
+    )
+    val_loader = build_sft_dataloader(
+        data_path=val_path, batch_size=args.batch_size,
+        pad_token_id=pad_id, context_length=cfg.context_length,
+        num_workers=0, shuffle=False,
+    )
+    # ---- Run dir + logger ------------------------------------------ #
+    os.makedirs(args.run_dir, exist_ok=True)
+    log_path = os.path.join(args.run_dir, "sft_log.jsonl")
+    logger   = MetricLogger(log_path)
+    # ---- Training info --------------------------------------------- #
+    eff_batch = args.batch_size * args.grad_accum
+    print(f"\n[5/5] Training config:")
+    print(f"  batch_size     : {args.batch_size}  (grad_accum={args.grad_accum} → eff={eff_batch})")
+    print(f"  max_steps      : {args.max_steps}")
+    print(f"  start_step     : {start_step}")
+    print(f"  steps to run   : {(args.max_steps - start_step) if args.max_steps else '∞'}")
+    print(f"  max_lr / min_lr: {args.max_lr:.2e} / {args.min_lr:.2e}")
+    print(f"  warmup_steps   : {args.warmup_steps}")
+    print(f"  save_every     : {args.save_every}")
+    print(f"  val_every      : {args.val_every}")
+    # ---- Ctrl+C handler -------------------------------------------- #
+    stop_flag = {"stop": False}
+    def _signal_handler(sig, frame):
+        print("\n  [SIGNAL] Ctrl+C — will save and exit after this step.")
+        stop_flag["stop"] = True
+    signal.signal(signal.SIGINT, _signal_handler)
+    # ================================================================ #
+    #  TRAINING LOOP
+    # ================================================================ #
+    model.train()
+    step         = start_step
+    running_loss = 0.0
+    t_start      = time.time()
+    t_step_start = time.time()
+    data_iter    = iter(train_loader)
+    print(f"\n{'='*60}")
+    print(f"  SFT STARTED  (step {step} → {args.max_steps})")
+    print(f"{'='*60}\n")
+    pbar = tqdm(
+        initial=step, total=args.max_steps,
+        desc="SFT", unit="step", dynamic_ncols=True,
+    )
+    while True:
+        # ---- Stop conditions --------------------------------------- #
+        if stop_flag["stop"]:
+            break
+        if args.max_steps is not None and step >= args.max_steps:
+            print(f"\n  [DONE] Reached max_steps={args.max_steps}")
+            break
+        optimizer.zero_grad(set_to_none=True)
+        accum_loss = 0.0
+        # ---- Gradient accumulation micro-steps --------------------- #
+        for _ in range(args.grad_accum):
+            try:
+                x, y = next(data_iter)
+            except StopIteration:
+                data_iter = iter(train_loader)
+                x, y = next(data_iter)
+            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+            with autocast(device_type=device.type, dtype=dtype_torch, enabled=use_amp):
+                logits, _ = model(x)      # (B, T, V)  — don't use built-in loss
+                # Shift logits and labels by 1 to predict the next token
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = y[..., 1:].contiguous()
+                # Use ignore_index=-100 so only assistant tokens drive the loss
+                loss = F.cross_entropy(
+                    shift_logits.view(-1, shift_logits.size(-1)),
+                    shift_labels.view(-1),
+                    ignore_index=-100,
+                ) / args.grad_accum       # scale for accumulation
+            scaler.scale(loss).backward()
+            accum_loss += loss.item()
+        # ---- Grad clip --------------------------------------------- #
+        if args.grad_clip > 0:
+            scaler.unscale_(optimizer)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+        else:
+            grad_norm = float("nan")
+        # ---- LR ---------------------------------------------------- #
+        lr = get_lr(step, args.warmup_steps, args.max_steps, args.max_lr, args.min_lr)
+        for pg in optimizer.param_groups:
+            pg["lr"] = lr
+        # ---- Optimizer step ---------------------------------------- #
+        scaler.step(optimizer)
+        scaler.update()
+        step        += 1
+        running_loss = accum_loss
+        t_now        = time.time()
+        elapsed_step = t_now - t_step_start
+        t_step_start = t_now
+        pbar.update(1)
+        pbar.set_postfix({"loss": f"{running_loss:.4f}", "lr": f"{lr:.1e}"})
+        # ---- Logging ----------------------------------------------- #
+        if step % args.log_every == 0:
+            entry = {
+                "step":      step,
+                "loss":      round(running_loss, 6),
+                "lr":        lr,
+                "grad_norm": round(float(grad_norm), 4)
+                             if not math.isnan(float(grad_norm)) else None,
+                "elapsed_s": round(t_now - t_start, 1),
+            }
+            if device.type == "cuda":
+                entry["vram_gb"] = round(torch.cuda.memory_allocated() / 1e9, 3)
+            logger.log(**entry)
+        # ---- Validation -------------------------------------------- #
+        if step % args.val_every == 0:
+            v_ctx = autocast(device_type=device.type, dtype=dtype_torch, enabled=use_amp)
+            val_loss = estimate_val_loss(model, val_loader, args.val_steps, device, v_ctx)
+            tqdm.write(
+                f"  [STEP {step:5d}]  train={running_loss:.4f}  "
+                f"val={val_loss:.4f}  lr={lr:.1e}"
+            )
+            logger.log(step=step, val_loss=round(val_loss, 6))
+        # ---- Checkpoint -------------------------------------------- #
+        if step % args.save_every == 0:
+            ckpt_path = os.path.join(args.run_dir, f"ckpt_sft_{step:07d}.pt")
+            save_checkpoint(ckpt_path, model, optimizer, step, running_loss, new_vocab_size)
+    # ================================================================ #
+    #  FINAL SAVE
+    # ================================================================ #
+    pbar.close()
+    steps_done = step - start_step
+    if steps_done > 0:
+        ckpt_path = os.path.join(args.run_dir, f"ckpt_sft_{step:07d}.pt")
+        save_checkpoint(ckpt_path, model, optimizer, step, running_loss, new_vocab_size)
+    else:
+        print("\n  [SKIP] No steps taken — skipping checkpoint save.")
+    total_time = time.time() - t_start
+    print(f"\n{'='*60}")
+    print(f"  SFT COMPLETE")
+    print(f"{'='*60}")
+    print(f"  Steps done : {steps_done}")
+    print(f"  Final loss : {running_loss:.4f}")
+    print(f"  Total time : {total_time/60:.1f} min")
+    print(f"  Run dir    : {args.run_dir}")
+    print(f"\nStart chatting:")
+    print(f"  python finetune/chat.py --run_dir {args.run_dir}")
+# Script entry point: run SFT only when executed directly, not on import
+if __name__ == "__main__":
+    train()

model/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# model/__init__.py
+from model.config import ModelConfig, SLLM_100M, SLLM_150M
+from model.model  import SLLM
+__all__ = ["ModelConfig", "SLLM_100M", "SLLM_150M", "SLLM"]

model/attention.py ADDED Viewed

	@@ -0,0 +1,114 @@

+"""
+model/attention.py
+Causal Multi-Head Self-Attention with RoPE.
+Architecture:
+    Input x  (B, T, d_model)
+      -> Linear projections Q, K, V  (no bias)
+      -> Reshape to (B, n_heads, T, head_dim)
+      -> Apply RoPE to Q and K
+      -> Scaled dot-product attention with causal mask
+      -> Reshape back to (B, T, d_model)
+      -> Output projection O  (no bias)
+Uses torch.nn.functional.scaled_dot_product_attention (Flash Attention
+when available via PyTorch 2.0+) for memory-efficient attention.
+The causal mask is handled by is_causal=True — no need to materialize
+an explicit O(T^2) mask tensor.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from model.config import ModelConfig
+from model.rope import RoPECache, apply_rope
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.n_heads   = config.n_heads
+        self.head_dim  = config.head_dim
+        self.d_model   = config.d_model
+        self.dropout   = config.dropout
+        # Q, K, V projections fused into one matrix for efficiency
+        # Output: (B, T, 3 * d_model), then split
+        self.qkv_proj = nn.Linear(config.d_model, 3 * config.d_model, bias=config.bias)
+        # Output projection
+        self.o_proj   = nn.Linear(config.d_model, config.d_model, bias=config.bias)
+        # Attention dropout (applied inside sdpa)
+        self.attn_dropout = config.dropout
+        # RoPE cache — lives as a buffer (moves to GPU automatically)
+        self.rope = RoPECache(
+            head_dim    = config.head_dim,
+            max_seq_len = config.context_length,
+            theta       = config.rope_theta,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x : (B, T, d_model)
+        Returns:
+            out : (B, T, d_model)
+        """
+        B, T, C = x.shape                              # C = d_model
+        # ---- QKV projection ---------------------------------------- #
+        qkv = self.qkv_proj(x)                        # (B, T, 3*C)
+        q, k, v = qkv.split(self.d_model, dim=-1)     # each: (B, T, C)
+        # ---- Reshape to (B, n_heads, T, head_dim) ------------------ #
+        q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+        # ---- Apply RoPE to Q and K --------------------------------- #
+        cos, sin = self.rope.get(T)                    # (T, head_dim)
+        q, k     = apply_rope(q, k, cos, sin)
+        # ---- Scaled dot-product attention (Flash Attention) -------- #
+        # is_causal=True handles the causal mask internally — no mask alloc.
+        # dropout_p only applies during training.
+        attn_out = F.scaled_dot_product_attention(
+            q, k, v,
+            attn_mask   = None,
+            dropout_p   = self.attn_dropout if self.training else 0.0,
+            is_causal   = True,
+        )                                              # (B, n_heads, T, head_dim)
+        # ---- Merge heads ------------------------------------------- #
+        # contiguous() needed before view after transpose
+        attn_out = attn_out.transpose(1, 2).contiguous().view(B, T, C)
+        # ---- Output projection ------------------------------------- #
+        return self.o_proj(attn_out)                   # (B, T, d_model)
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    from model.config import SLLM_100M
+    cfg  = SLLM_100M
+    attn = CausalSelfAttention(cfg)
+    print(f"Attention params : {sum(p.numel() for p in attn.parameters())/1e6:.2f}M")
+    B, T = 2, 64
+    x   = torch.randn(B, T, cfg.d_model)
+    out = attn(x)
+    print(f"Input  shape : {x.shape}")
+    print(f"Output shape : {out.shape}")
+    assert out.shape == (B, T, cfg.d_model), "Shape mismatch!"
+    print("PASS")

model/block.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""
+model/block.py
+Single Transformer Block (pre-norm LLaMA-style).
+Pre-Norm vs Post-Norm:
+    Post-norm (original Transformer):  x = Norm(x + Attention(x))   <- less stable
+    Pre-norm  (GPT-2, LLaMA):          x = x + Attention(Norm(x))   <- more stable
+    We use PRE-NORM with RMSNorm for training stability at scale.
+Block structure:
+    x  ->  RMSNorm  ->  CausalSelfAttention  ->  (+residual)
+       ->  RMSNorm  ->  SwiGLU MLP            ->  (+residual)
+       ->  output
+Note: Residual connections bypass both norm and sublayer, which allows
+gradients to flow directly to earlier layers during backprop.
+"""
+import torch
+import torch.nn as nn
+from model.config    import ModelConfig
+from model.norm      import RMSNorm
+from model.attention import CausalSelfAttention
+from model.mlp       import SwiGLU
+class TransformerBlock(nn.Module):
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        # Pre-attention norm
+        self.norm_attn = RMSNorm(config.d_model)
+        # Causal self-attention with RoPE
+        self.attn      = CausalSelfAttention(config)
+        # Pre-FFN norm
+        self.norm_mlp  = RMSNorm(config.d_model)
+        # SwiGLU feed-forward
+        self.mlp       = SwiGLU(config)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x : (B, T, d_model)
+        Returns:
+            x : (B, T, d_model)
+        """
+        # Attention sub-layer with residual
+        x = x + self.attn(self.norm_attn(x))
+        # FFN sub-layer with residual
+        x = x + self.mlp(self.norm_mlp(x))
+        return x
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    from model.config import SLLM_100M
+    cfg   = SLLM_100M
+    block = TransformerBlock(cfg)
+    n = sum(p.numel() for p in block.parameters())
+    print(f"Block params : {n/1e6:.3f}M")
+    B, T = 2, 64
+    x   = torch.randn(B, T, cfg.d_model)
+    out = block(x)
+    print(f"Input  shape : {x.shape}")
+    print(f"Output shape : {out.shape}")
+    assert out.shape == x.shape, "Shape mismatch!"
+    print("PASS")

model/config.py ADDED Viewed

	@@ -0,0 +1,110 @@

+"""
+model/config.py
+ModelConfig dataclass + preset configs for SLLM-100M and SLLM-150M.
+All hyperparameters live here so every other module imports from one place.
+"""
+from dataclasses import dataclass, field
+def _swiglu_d_ff(d_model: int) -> int:
+    """
+    SwiGLU hidden dimension.
+    LLaMA formula: round_up_256( int(2/3 * 4 * d_model) )
+    """
+    raw = int(2 / 3 * 4 * d_model)
+    return ((raw + 255) // 256) * 256          # round up to nearest 256
+@dataclass
+class ModelConfig:
+    # ---- Vocabulary ------------------------------------------------- #
+    vocab_size: int     = 32_000               # must match trained tokenizer
+    # ---- Sequence --------------------------------------------------- #
+    context_length: int = 1024                 # max tokens per sequence
+    # ---- Transformer dimensions ------------------------------------- #
+    d_model: int        = 768                  # embedding / hidden dim
+    n_heads: int        = 12                   # number of attention heads
+    n_layers: int       = 12                   # number of transformer blocks
+    # ---- FFN -------------------------------------------------------- #
+    # SwiGLU d_ff is auto-computed from d_model if not set explicitly
+    d_ff: int           = 0                    # 0 = auto
+    # ---- Regularization --------------------------------------------- #
+    dropout: float      = 0.0                  # 0.0 for pre-training
+    # ---- Misc ------------------------------------------------------- #
+    bias: bool          = False                # no bias (cleaner, matches LLaMA)
+    rope_theta: float   = 10_000.0             # RoPE base frequency
+    def __post_init__(self):
+        # Auto-compute d_ff if not set
+        if self.d_ff == 0:
+            self.d_ff = _swiglu_d_ff(self.d_model)
+        # Sanity checks
+        assert self.d_model % self.n_heads == 0, (
+            f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
+        )
+    @property
+    def head_dim(self) -> int:
+        return self.d_model // self.n_heads
+    def count_params(self) -> int:
+        """Returns total trainable parameter count (with tied embeddings)."""
+        embed      = self.vocab_size * self.d_model
+        attn       = 4 * self.d_model * self.d_model   # Q, K, V, O
+        mlp        = 3 * self.d_model * self.d_ff      # gate, up, down
+        norms      = 2 * self.d_model                  # pre-attn + pre-mlp
+        per_block  = attn + mlp + norms
+        final_norm = self.d_model
+        return embed + self.n_layers * per_block + final_norm
+    def __repr__(self) -> str:
+        n = self.count_params()
+        return (
+            f"ModelConfig("
+            f"d={self.d_model}, h={self.n_heads}, l={self.n_layers}, "
+            f"ff={self.d_ff}, ctx={self.context_length}, "
+            f"params={n/1e6:.1f}M)"
+        )
+# ------------------------------------------------------------------ #
+#  PRESET CONFIGS
+# ------------------------------------------------------------------ #
+# NOTE: these presets are shared module-level instances. SLLM keeps the
+# config object it is given (self.config = config), so in-place edits such
+# as `model.config.vocab_size = ...` also change the preset itself.
+# ~109.5M parameters according to ModelConfig.count_params()
+SLLM_100M = ModelConfig(
+    vocab_size      = 32_000,
+    context_length  = 1024,
+    d_model         = 768,
+    n_heads         = 12,
+    n_layers        = 12,
+    # d_ff auto = 2048
+)
+# ~148.4M parameters according to ModelConfig.count_params()
+SLLM_150M = ModelConfig(
+    vocab_size      = 32_000,
+    context_length  = 1024,
+    d_model         = 1024,
+    n_heads         = 16,
+    n_layers        = 9,
+    # d_ff auto = 2816
+)
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    for cfg in [SLLM_100M, SLLM_150M]:
+        print(cfg)
+        print(f"  head_dim : {cfg.head_dim}")
+        print(f"  d_ff     : {cfg.d_ff}")
+        print()

model/mlp.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""
+model/mlp.py
+SwiGLU Feed-Forward Network — used in LLaMA, PaLM, Mistral, etc.
+Standard FFN (GPT-2):
+    out = dropout(W2 * GELU(W1 * x))
+SwiGLU FFN (LLaMA):
+    gate   = W_gate * x           # linear gate
+    up     = W_up   * x           # linear up-proj
+    hidden = SiLU(gate) * up      # element-wise gating (learned)
+    out    = W_down * hidden      # down-proj back to d_model
+SiLU (Sigmoid Linear Unit):
+    SiLU(x) = x * sigmoid(x)
+Why SwiGLU is better:
+    - The gating mechanism (SiLU(gate) * up) gives the model a learned
+      way to activate or suppress each hidden dimension independently.
+    - Empirically outperforms GELU/ReLU FFNs at the same parameter count.
+    - d_ff is set to int(2/3 * 4 * d_model) rounded up to the next multiple of 256.
+      This compensates for having 3 matrices instead of 2, keeping
+      total parameter count comparable to a standard 4x FFN.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from model.config import ModelConfig
+class SwiGLU(nn.Module):
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        d_model = config.d_model
+        d_ff    = config.d_ff
+        # Three weight matrices — no bias
+        self.gate = nn.Linear(d_model, d_ff, bias=config.bias)   # gate projection
+        self.up   = nn.Linear(d_model, d_ff, bias=config.bias)   # up projection
+        self.down = nn.Linear(d_ff, d_model, bias=config.bias)   # down projection
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x : (B, T, d_model)
+        Returns:
+            out : (B, T, d_model)
+        """
+        # SiLU = x * sigmoid(x)  (also called swish)
+        # Element-wise gating: SiLU(gate) acts as a learned activation mask on up
+        return self.down(F.silu(self.gate(x)) * self.up(x))
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    from model.config import SLLM_100M
+    cfg = SLLM_100M
+    mlp = SwiGLU(cfg)
+    n_params = sum(p.numel() for p in mlp.parameters())
+    print(f"SwiGLU d_model={cfg.d_model}  d_ff={cfg.d_ff}")
+    print(f"  gate : {cfg.d_model} x {cfg.d_ff} = {cfg.d_model * cfg.d_ff:,}")
+    print(f"  up   : {cfg.d_model} x {cfg.d_ff} = {cfg.d_model * cfg.d_ff:,}")
+    print(f"  down : {cfg.d_ff} x {cfg.d_model} = {cfg.d_ff * cfg.d_model:,}")
+    print(f"  total MLP params : {n_params/1e6:.3f}M")
+    B, T = 2, 64
+    x   = torch.randn(B, T, cfg.d_model)
+    out = mlp(x)
+    print(f"Input  shape : {x.shape}")
+    print(f"Output shape : {out.shape}")
+    assert out.shape == x.shape, "Shape mismatch!"
+    print("PASS")

model/model.py ADDED Viewed

	@@ -0,0 +1,245 @@

+"""
+model/model.py
+SLLM — Small Language Model (decoder-only Transformer).
+Full architecture:
+    tokens  (B, T)
+      -> Embedding       (vocab_size -> d_model)
+      -> N x TransformerBlock   (attention + FFN)
+      -> Final RMSNorm
+      -> LM Head (Linear d_model -> vocab_size)   <- weight-TIED to embedding
+Weight tying:
+    The embedding matrix and the LM head output matrix share the same weights.
+    - Halves memory for the embedding/output layers.
+    - A standard practice since GPT-2 (Press & Wolf, 2016).
+Weight initialization:
+    - Embeddings: std=0.02  (GPT-2 convention)
+    - Linear layers: std=0.02
+    - Output projections (attn.o_proj, mlp.down): std = 0.02/sqrt(2*n_layers)
+      - Scaled down per GPT-2/NanoGPT: at initialization, the residual
+        stream grows as sqrt(n_layers), so we scale residual contributions down.
+Forward:
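+    Returns (logits, loss): logits are (B, T, vocab_size); loss is the
+    CrossEntropy loss when targets are given, else None. Training loops may
+    ignore it and compute their own loss externally for flexibility.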
+"""
+import math
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from model.config import ModelConfig
+from model.norm   import RMSNorm
+from model.block  import TransformerBlock
+class SLLM(nn.Module):
+    def __init__(self, config: ModelConfig):
+        super().__init__()
+        self.config = config
+        # ---- Token embedding --------------------------------------- #
+        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
+        # ---- Transformer blocks ------------------------------------ #
+        self.blocks = nn.ModuleList([
+            TransformerBlock(config) for _ in range(config.n_layers)
+        ])
+        # ---- Final norm -------------------------------------------- #
+        self.norm = RMSNorm(config.d_model)
+        # ---- LM Head ----------------------------------------------- #
+        # Linear: d_model -> vocab_size, no bias
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        # ---- Weight tying ------------------------------------------ #
+        # Share embedding weights with lm_head
+        self.lm_head.weight = self.token_emb.weight
+        # ---- Gradient checkpointing flag --------------------------- #
+        # Enabled via enable_gradient_checkpointing() to save VRAM
+        self._gradient_checkpointing = False
+        # ---- Initialize weights ------------------------------------ #
+        self.apply(self._init_weights)
+    def _init_weights(self, module: nn.Module):
+        """
+        Custom weight initialization.
+        - Normal(0, 0.02) for Linear and Embedding
+        - Scaled residual projections: std *= 1/sqrt(2 * n_layers)
+        """
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        # Scale down residual projections (attn output + mlp down)
+        # Accessed by name: o_proj and down
+        if isinstance(module, nn.Linear):
+            if getattr(module, '_is_residual', False):
+                scale = 0.02 / math.sqrt(2 * self.config.n_layers)
+                nn.init.normal_(module.weight, mean=0.0, std=scale)
+    def _mark_residual_projections(self):
+        """
+        Mark output projections so _init_weights can scale them.
+        Called after __init__ to tag the specific layers.
+        """
+        for block in self.blocks:
+            block.attn.o_proj._is_residual = True
+            block.mlp.down._is_residual    = True
+        self.apply(self._init_weights)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        targets: torch.Tensor = None,
+    ):
+        """
+        Args:
+            input_ids : (B, T)  — integer token IDs
+            targets   : (B, T)  — optional, for loss computation
+        Returns:
+            logits : (B, T, vocab_size)
+            loss   : scalar CrossEntropy loss if targets given, else None
+        """
+        B, T = input_ids.shape
+        assert T <= self.config.context_length, (
+            f"Sequence length {T} exceeds context_length {self.config.context_length}"
+        )
+        # ---- Embedding --------------------------------------------- #
+        x = self.token_emb(input_ids)          # (B, T, d_model)
+        # ---- Transformer blocks ------------------------------------ #
+        for block in self.blocks:
+            if self._gradient_checkpointing and self.training:
+                # Recompute activations during backward to save VRAM
+                # use_reentrant=False is the modern recommended API
+                x = checkpoint(block, x, use_reentrant=False)
+            else:
+                x = block(x)
+        # ---- Final norm -------------------------------------------- #
+        x = self.norm(x)                       # (B, T, d_model)
+        # ---- LM Head ----------------------------------------------- #
+        logits = self.lm_head(x)               # (B, T, vocab_size)
+        # ---- Loss -------------------------------------------------- #
+        loss = None
+        if targets is not None:
+            # Flatten for cross-entropy: (B*T, vocab_size) vs (B*T,)
+            loss = nn.functional.cross_entropy(
+                logits.view(-1, logits.size(-1)),
+                targets.view(-1),
+            )
+        return logits, loss
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        max_new_tokens: int,
+        temperature: float = 1.0,
+        top_k: int = None,
+    ) -> torch.Tensor:
+        """
+        Autoregressive text generation (greedy or top-k sampling).
+        Args:
+            input_ids      : (B, T) prompt tokens
+            max_new_tokens : number of tokens to generate
+            temperature    : softmax temperature (1.0 = neutral, <1 = sharper)
+            top_k          : if set, sample from top-k tokens only
+        Returns:
+            (B, T + max_new_tokens) token IDs
+        """
+        self.eval()
+        for _ in range(max_new_tokens):
+            # Crop context if longer than max
+            ctx = input_ids
+            if ctx.shape[1] > self.config.context_length:
+                ctx = ctx[:, -self.config.context_length:]
+            # Forward pass — only need last logit
+            logits, _ = self(ctx)
+            logits = logits[:, -1, :] / temperature      # (B, vocab_size)
+            # Optional top-k filtering
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = float('-inf')
+            # Sample from distribution
+            probs     = torch.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)
+            input_ids = torch.cat([input_ids, next_token], dim=1)
+        return input_ids
+    def enable_gradient_checkpointing(self):
+        """
+        Enables gradient checkpointing to reduce VRAM usage.
+        Recomputes activations during the backward pass instead of
+        storing them — trades ~30% more compute for ~40% less memory.
+        Essential for fitting 100M+ models on 4GB VRAM.
+        """
+        self._gradient_checkpointing = True
+    def count_params(self, non_embedding: bool = False) -> int:
+        """
+        Returns parameter count.
+        Args:
+            non_embedding: if True, exclude embedding parameters
+                           (common in LLM reporting since embeddings scale
+                           with vocab size and not model capacity)
+        """
+        total = sum(p.numel() for p in self.parameters())
+        if non_embedding:
+            total -= self.token_emb.weight.numel()
+        return total
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    from model.config import SLLM_100M, SLLM_150M
+    for name, cfg in [("SLLM-100M", SLLM_100M), ("SLLM-150M", SLLM_150M)]:
+        model = SLLM(cfg)
+        total = model.count_params()
+        non_emb = model.count_params(non_embedding=True)
+        print(f"{name}")
+        print(f"  total params           : {total/1e6:.1f}M")
+        print(f"  non-embedding params   : {non_emb/1e6:.1f}M")
+        print(f"  embedding params       : {(total-non_emb)/1e6:.1f}M")
+        # Forward pass check
+        B, T = 2, 64
+        ids     = torch.randint(0, cfg.vocab_size, (B, T))
+        targets = torch.randint(0, cfg.vocab_size, (B, T))
+        logits, loss = model(ids, targets)
+        print(f"  logits shape : {logits.shape}")
+        print(f"  loss         : {loss.item():.4f}")
+        print()

model/norm.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""
+model/norm.py
+RMSNorm — Root Mean Square Layer Normalization.
+Used in LLaMA-style transformers instead of standard LayerNorm.
+Key difference from LayerNorm:
+  - No mean subtraction (centering)
+  - No bias term
+  - Only re-scales with a single learned gain vector (weight)
+  - ~40% faster in practice (no mean computation)
+Formula:
+    RMSNorm(x) = x / RMS(x) * weight
+    where RMS(x) = sqrt( mean(x^2) + eps )
+"""
+import torch
+import torch.nn as nn
+class RMSNorm(nn.Module):
+    def __init__(self, d_model: int, eps: float = 1e-6):
+        """
+        Args:
+            d_model : hidden dimension (size of last axis of input)
+            eps     : small constant for numerical stability
+        """
+        super().__init__()
+        self.eps    = eps
+        self.weight = nn.Parameter(torch.ones(d_model))   # learnable gain
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (..., d_model)
+        # compute RMS along last dimension, keepdim for broadcasting
+        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # cast to float32 for stable norm, then back to input dtype
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    B, T, D = 2, 16, 768
+    x    = torch.randn(B, T, D)
+    norm = RMSNorm(D)
+    out = norm(x)
+    print(f"Input  shape : {x.shape}")
+    print(f"Output shape : {out.shape}")
+    print(f"Output dtype : {out.dtype}")
+    # Verify: each vector should be approximately unit RMS after norm (before weight)
+    rms_before = x.pow(2).mean(dim=-1).sqrt()
+    rms_after  = out.pow(2).mean(dim=-1).sqrt()
+    print(f"RMS before norm : {rms_before.mean():.3f}")
+    print(f"RMS after  norm : {rms_after.mean():.3f}  (weight=1 so should be ~1.0)")
+    print("PASS" if torch.allclose(rms_after, torch.ones_like(rms_after), atol=1e-4) else "FAIL")

model/rope.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+model/rope.py
+Rotary Position Embedding (RoPE) — Su et al. 2021 (RoFormer).
+Used in LLaMA, Mistral, Gemma, etc.
+Core idea:
+    Instead of adding position embeddings to token vectors, we ROTATE
+    the query and key vectors in attention using position-dependent angles.
+    - Relative positions are encoded implicitly via dot-product invariance.
+    - Works for any sequence length (extrapolates beyond training length).
+    - Only applied to Q and K, NOT V.
+Implementation:
+    1. Precompute cos/sin tables for all positions up to max_seq_len.
+       Shape: (max_seq_len, head_dim)
+    2. At forward time, slice cos/sin to the current seq_len and
+       apply rotation to Q and K.
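+Rotation formula (pairs of dims):
+    Given a vector x with dims [x0, x1, ..., x_{d-1}] and h = d/2:
+    The original paper pairs consecutive dims (x0,x1), (x2,x3), ...;
+    this file uses the "half-split" layout and pairs (x_i, x_{i+h}) instead.
+    Rotate each pair by angle theta_i * position:
+        out_i     = x_i*cos - x_{i+h}*sin
+        out_{i+h} = x_i*sin + x_{i+h}*cos
+    Implementation using rotate_half:
+        rotated = concat([-x_second_half, x_first_half])  # swapped halves
+        out = x * cos + rotated * sin
+    (Same rotation as the consecutive-pair form, up to a fixed permutation of dims.)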
+"""
+import torch
+import torch.nn as nn
+from typing import Tuple
+def precompute_rope_freqs(
+    head_dim: int,
+    max_seq_len: int,
+    theta: float = 10_000.0,
+    device: torch.device = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Precompute RoPE cosine and sine tables.
+    Args:
+        head_dim    : dimension of each attention head (must be even)
+        max_seq_len : max sequence length to precompute
+        theta       : RoPE base frequency (default 10_000, use 500_000 for long context)
+        device      : torch device
+    Returns:
+        cos : (max_seq_len, head_dim)
+        sin : (max_seq_len, head_dim)
+    """
+    assert head_dim % 2 == 0, f"head_dim must be even, got {head_dim}"
+    # Inverse frequencies: shape (head_dim // 2,)
+    # inv_freq[i] = 1 / theta^(2i / head_dim)
+    i        = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
+    inv_freq = 1.0 / (theta ** (i / head_dim))
+    # Position indices: shape (max_seq_len,)
+    positions = torch.arange(max_seq_len, dtype=torch.float32, device=device)
+    # Outer product: (max_seq_len, head_dim // 2)
+    freqs = torch.outer(positions, inv_freq)
+    # Duplicate along last dim to match head_dim:
+    # (max_seq_len, head_dim // 2) -> (max_seq_len, head_dim)
+    # cos/sin applied to [x0,x1,x2,x3,...] as [theta0,theta0, theta1,theta1, ...]
+    freqs = torch.cat([freqs, freqs], dim=-1)
+    return freqs.cos(), freqs.sin()
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """
+    Rotates pairs of dimensions in the last axis.
+    Splits last dim in half, negates the second half, then swaps:
+        [x0..xN/2, xN/2..xN]  ->  [-xN/2..xN, x0..xN/2]
+    Args:
+        x: (..., head_dim)
+    Returns:
+        rotated: (..., head_dim)
+    """
+    half = x.shape[-1] // 2
+    x1 = x[..., :half]     # first half
+    x2 = x[..., half:]     # second half
+    return torch.cat([-x2, x1], dim=-1)
+def apply_rope(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply RoPE rotation to query and key tensors.
+    Args:
+        q   : (B, n_heads, T, head_dim)
+        k   : (B, n_heads, T, head_dim)
+        cos : (T, head_dim)  - precomputed from precompute_rope_freqs
+        sin : (T, head_dim)  - precomputed from precompute_rope_freqs
+    Returns:
+        q_rot, k_rot : same shapes as inputs
+    """
+    # Broadcast cos/sin from (T, head_dim) to (1, 1, T, head_dim)
+    cos = cos.unsqueeze(0).unsqueeze(0)
+    sin = sin.unsqueeze(0).unsqueeze(0)
+    q_rot = (q * cos) + (rotate_half(q) * sin)
+    k_rot = (k * cos) + (rotate_half(k) * sin)
+    return q_rot, k_rot
+class RoPECache(nn.Module):
+    """
+    Module that holds the RoPE cos/sin cache as a buffer.
+    Not a learnable module — just stores precomputed freqs and moves them
+    to the right device automatically via register_buffer.
+    """
+    def __init__(self, head_dim: int, max_seq_len: int, theta: float = 10_000.0):
+        super().__init__()
+        cos, sin = precompute_rope_freqs(head_dim, max_seq_len, theta)
+        # register_buffer: not a parameter, but moves with .to(device)
+        self.register_buffer("cos", cos, persistent=True)
+        self.register_buffer("sin", sin, persistent=True)
+    def get(self, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Slice cos/sin to current sequence length."""
+        return self.cos[:seq_len], self.sin[:seq_len]
+# ------------------------------------------------------------------ #
+#  QUICK CHECK
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    B, n_heads, T, head_dim = 2, 12, 16, 64
+    cos, sin = precompute_rope_freqs(head_dim, max_seq_len=1024)
+    cos_T    = cos[:T]
+    sin_T    = sin[:T]
+    q = torch.randn(B, n_heads, T, head_dim)
+    k = torch.randn(B, n_heads, T, head_dim)
+    q_rot, k_rot = apply_rope(q, k, cos_T, sin_T)
+    print(f"q shape     : {q.shape}")
+    print(f"q_rot shape : {q_rot.shape}")
+    print(f"k_rot shape : {k_rot.shape}")
+    # Verify: rotation should preserve norm (|x| = |Rx|)
+    q_norm     = q.norm(dim=-1)
+    q_rot_norm = q_rot.norm(dim=-1)
+    print(f"Norm preserved (q): {torch.allclose(q_norm, q_rot_norm, atol=1e-5)}")
+    # Test RoPECache
+    cache = RoPECache(head_dim=64, max_seq_len=1024)
+    c, s  = cache.get(T)
+    print(f"Cache cos shape: {c.shape}")
+    print("PASS")

model_explained.md ADDED Viewed

	@@ -0,0 +1,376 @@

+# Model Folder — Plain Language Explanation
+The `model/` folder builds a **GPT-style decoder-only transformer** from scratch,
+piece by piece. Each file is one component. Here's how they stack:
+```
+tokens (integers)
+    │
+    ▼
+┌─────────────┐
+│  Embedding  │  config.py defines the shape of everything
+└──────┬──────┘
+       │
+       ▼  ×N layers
+┌──────────────────────────────────────┐
+│         TransformerBlock             │  block.py
+│                                      │
+│   ┌──────────┐    ┌──────────────┐   │
+│   │ RMSNorm  │    │  RMSNorm     │   │  norm.py
+│   └────┬─────┘    └──────┬───────┘   │
+│        │                 │           │
+│   ┌────▼─────┐    ┌──────▼───────┐   │
+│   │Attention │    │  SwiGLU MLP  │   │  attention.py / mlp.py
+│   │  + RoPE  │    │              │   │  rope.py
+│   └────┬─────┘    └──────┬───────┘   │
+│        │  (+residual)    │ (+residual)│
+└────────┼─────────────────┼───────────┘
+         │                 │
+         └────────┬────────┘
+                  │
+                  ▼
+           ┌──────────┐
+           │ RMSNorm  │  final norm
+           └────┬─────┘
+                │
+           ┌────▼─────┐
+           │  LM Head │  Linear → vocab_size logits
+           └──────────┘
+```
+---
+## 1. `config.py` — The Blueprint
+**What it does:** Stores all the numbers that define the model size.
+Nothing computes anything here — it's just a settings object.
+```python
+@dataclass
+class ModelConfig:
+    vocab_size     = 32_000   # how many tokens exist
+    context_length = 1024     # max sequence length
+    d_model        = 1024     # width of every vector throughout the model
+    n_heads        = 16       # how many attention heads
+    n_layers       = 9        # how many transformer blocks stacked
+    d_ff           = 2816     # width of the MLP hidden layer (auto-computed)
+```
+**Why these numbers?**
+- `d_model` is the "resolution" of the model — bigger = more expressive but more memory
+- `n_heads` splits each attention layer into parallel sub-attentions
+- `head_dim = d_model / n_heads = 64` — each head sees 64-dim slices
+- `d_ff` for SwiGLU = `round_up_256( int(2/3 × 4 × d_model) )` (rounded up to a multiple of 256) — the 2/3 factor compensates for having 3 matrices instead of 2
+**Presets defined here:**
+```
+SLLM_100M: d=768,  h=12, l=12  →  109.5M params
+SLLM_150M: d=1024, h=16, l=9   →  148.4M params
+```
+---
+## 2. `norm.py` — RMSNorm
+**What it does:** Normalizes vectors so they don't explode or vanish during training.
+Used before every attention and MLP layer.
+**Standard LayerNorm (GPT-2):**
+```
+1. Compute mean of x
+2. Subtract mean  (centering)
+3. Divide by std
+4. Scale by learned weight
+5. Add learned bias
+```
+**RMSNorm (LLaMA / our model):**
+```
+1. Compute RMS = sqrt( mean(x²) )   ← no mean subtraction!
+2. Divide by RMS
+3. Scale by learned weight           ← no bias!
+```
+**Why simpler is better:**
+- No mean subtraction → ~40% faster
+- No bias → fewer parameters
+- Works just as well in practice
+- LLaMA, Mistral, Gemma all use it
+```python
+# What it computes:
+output = (x / sqrt(mean(x²) + 1e-6)) * weight
+#          ↑ normalize           ↑ rescale with learned gain
+```
+The `weight` starts at all-ones (no change at init) and is learned during training.
+---
+## 3. `rope.py` — Rotary Position Embedding (RoPE)
+**The problem it solves:** Transformers have no built-in sense of position.
+Without position encoding, `"cat sat on mat"` and `"mat on sat cat"` look identical.
+**How older models solved it (GPT-2):**
+Added a learned vector per absolute position to each token embedding: `token[i] += position_embedding[i]`
+Problem: there is no embedding for positions beyond the training length, so it can't generalize past it.
+**What RoPE does instead:**
+Instead of adding position info to token vectors, it **rotates** the Query and Key
+vectors in attention by an angle that depends on their position.
+```
+Token at position 3 → rotate Q and K by angle θ₃
+Token at position 7 → rotate Q and K by angle θ₇
+```
+When you compute attention score `Q·K`, the rotation cancels out in a way that
+encodes *relative distance* between tokens, not absolute positions.
+**Why this is better:**
+- No extra parameters (pure math, no learned table)
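+- Can be evaluated at positions beyond the training length (no table to run out of), although quality usually degrades there unless a RoPE-scaling technique is used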
+- Used in LLaMA, Mistral, GPT-4 (likely), Gemma
+**How the code works:**
+```python
+# Step 1: precompute a table of cos/sin values for every position
+cos, sin = precompute_rope_freqs(head_dim=64, max_seq_len=1024)
+# cos/sin shape: (1024, 64)
+# Step 2: at forward time, rotate Q and K
+q_rotated = q * cos + rotate_half(q) * sin
+k_rotated = k * cos + rotate_half(k) * sin
+# rotate_half(x): splits x in half, negates second half, swaps
+# [a, b, c, d] → [-c, -d, a, b]
+```
+V (values) are **not** rotated — only Q and K get position encoding.
+---
+## 4. `attention.py` — Causal Self-Attention
+**What it does:** Lets every token look at all *previous* tokens and decide
+which ones are relevant to predict the next token.
+**The full flow:**
+```
+Input x: (Batch, Tokens, d_model)
+         e.g. (2, 1024, 1024)
+    │
+    ▼
+QKV projection: one big Linear(d_model → 3×d_model)
+    │
+    ├─── Q: (2, 1024, 1024)  — "what am I looking for?"
+    ├─── K: (2, 1024, 1024)  — "what do I contain?"
+    └─── V: (2, 1024, 1024)  — "what do I send if attended to?"
+    │
+    ▼
+Reshape to heads: (2, 16_heads, 1024, 64_head_dim)
+    │
+    ▼
+Apply RoPE to Q and K  ← position encoding happens here
+    │
+    ▼
+Scaled Dot-Product Attention:
+    scores = Q @ K^T / sqrt(64)    # how strongly each token attends to every other token
+    mask   = causal mask            # can only look LEFT (past), not right (future)
+    weights = softmax(scores + mask)
+    out    = weights @ V            # weighted sum of values
+    │
+    ▼
+Reshape back: (2, 1024, 1024)
+    │
+    ▼
+Output projection: Linear(d_model → d_model)
+```
+**Causal mask** — this is what makes it a *language model* (predicts next token):
+```
+Position:  0  1  2  3
+Token 0:  [✓  ✗  ✗  ✗]   can only see itself
+Token 1:  [✓  ✓  ✗  ✗]   can see 0,1
+Token 2:  [✓  ✓  ✓  ✗]   can see 0,1,2
+Token 3:  [✓  ✓  ✓  ✓]   can see all
+```
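+**Flash Attention:** We use `F.scaled_dot_product_attention(..., is_causal=True)`,
+PyTorch 2.0's fused attention entry point. When a fused kernel (FlashAttention or
+memory-efficient attention) is available for the device and dtype, it never materializes
+the full O(T²) attention matrix in memory — much faster and far less VRAM. Otherwise
+PyTorch falls back to the standard math implementation.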
+---
+## 5. `mlp.py` — SwiGLU Feed-Forward Network
+**What it does:** After attention (which mixes *between* tokens), the MLP
+transforms each token *independently* — it's where most of the model's
+"knowledge" is stored.
+**Standard MLP (GPT-2):**
+```python
+out = W2 @ GELU(W1 @ x)   # 2 matrices
+```
+**SwiGLU (LLaMA / our model):**
+```python
+gate   = W_gate @ x         # linear
+up     = W_up   @ x         # linear
+hidden = SiLU(gate) * up    # element-wise gate  ← the key difference
+out    = W_down @ hidden     # 3 matrices total
+```
+**What is SiLU?**
+```
+SiLU(x) = x × sigmoid(x)
+```
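+It's a smooth alternative to ReLU — instead of clipping negative inputs to exactly zero, it dips slightly below zero for small negative inputs and approaches zero for large negative ones.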
+**Why gating matters:**
+- `SiLU(gate)` acts as a learned on/off switch for each hidden dimension
+- The model learns to activate only the neurons relevant to each input
+- Empirically outperforms GELU at the same parameter count
+- Used in LLaMA, PaLM, Mistral
+**The d_ff formula:**
+```
+d_ff = round_up_256( int(2/3 × 4 × d_model) )
+For 150M: round_up_256( int(2/3 × 4 × 1024) ) = round_up_256(2730) = 2816
+```
+The `2/3` factor compensates for having 3 matrices instead of 2 — it keeps the
+total parameter count roughly equal to a standard 4× FFN (slightly more after rounding up).
+---
+## 6. `block.py` — TransformerBlock
+**What it does:** Wraps attention + MLP into one reusable block.
+The model is just N copies of this block stacked.
+```python
+def forward(x):
+    # Attention sub-layer
+    x = x + attention( rmsnorm(x) )   # pre-norm + residual
+    # MLP sub-layer
+    x = x + mlp( rmsnorm(x) )         # pre-norm + residual
+    return x
+```
+**Two key ideas:**
+**1. Pre-norm (normalize BEFORE the sublayer):**
+```
+Pre-norm (GPT-2, LLaMA):                  x → norm → attention → + original x
+Post-norm (original Transformer, GPT-1):  x → attention → + original x → norm
+```
+Pre-norm is more stable at large scale — gradients flow more cleanly.
+**2. Residual connections (`x + sublayer(x)`):**
+The output of each sublayer is *added* back to the input, not replacing it.
+This means:
+- Gradients can skip directly to earlier layers during backprop
+- The model learns *corrections* to the input, not transformations from scratch
+- Allows stacking many layers without vanishing gradients
+---
+## 7. `model.py` — SLLM (The Full Model)
+**What it does:** Assembles everything into the complete language model.
+```
+tokens: (B, T)  ← integer IDs like [423, 1829, 55, ...]
+    │
+    ▼
+token_emb: Embedding(32000 → 1024)
+    │   converts each integer to a 1024-dim vector
+    ▼
+blocks[0]: TransformerBlock   ─┐
+blocks[1]: TransformerBlock    │  9 blocks for 150M
+...                            │
+blocks[8]: TransformerBlock   ─┘
+    │
+    ▼
+norm: RMSNorm(1024)   ← final stabilization
+    │
+    ▼
+lm_head: Linear(1024 → 32000)
+    │   produces a score for each possible next token
+    ▼
+logits: (B, T, 32000)   ← unnormalized scores
+```
+**Weight tying:**
+The `token_emb` matrix and `lm_head` matrix **share the same weights**.
+```python
+self.lm_head.weight = self.token_emb.weight
+```
+- Same matrix used for: embedding lookup (input) AND output projection
+- Saves ~32.8M parameters (32000 × 1024)
+- Works because: if token X has a similar embedding to the current hidden state,
+  it should also score highly as the next token prediction
+**Loss computation:**
+```python
+# Cross-entropy: at each position, predict the NEXT token
+# Input:  [The, cat, sat, on]   → predicts [cat, sat, on, mat]
+# targets = input shifted by 1
+loss = cross_entropy(logits.view(-1, 32000), targets.view(-1))
+```
+**Gradient checkpointing** (`enable_gradient_checkpointing()`):
+Normally PyTorch saves all intermediate activations during forward pass to use
+in backprop. For 9 layers with batch_size=2 and seq_len=1024, that's ~1.5GB.
+With gradient checkpointing:
+- Only each block's input is kept during the forward pass; the activations inside the block are **NOT saved**
+- During backward pass, they are **recomputed on-the-fly**
+- Result: ~40% less VRAM, ~30% slower training
+- Essential for fitting 150M on a 4GB GPU
+**Weight initialization:**
+```python
+# All Linear and Embedding weights: Normal(mean=0, std=0.02)
+# Residual projections (o_proj, mlp.down): scaled down by 1/sqrt(2 × n_layers)
+```
+The residual scaling prevents the residual stream from growing too large
+at initialization when many layers add to it.
+---
+## How it all fits together — One forward pass
+```
+"The cat sat" → tokenizer → [423, 1829, 55]
+token_emb:  [423]→[0.1,-0.3,...] (1024 floats)
+            [1829]→[0.8, 0.2,...] (1024 floats)
+            [55]  →[-0.1,0.4,...] (1024 floats)
+Block 0:
+  norm → Q,K,V projections → RoPE rotation → Flash Attention → output proj → + residual
+  norm → gate,up projections → SiLU(gate)*up → down proj → + residual
+Block 1..8: same
+Final norm → LM head → 32000 scores per position
+softmax → probabilities → sample next token
+```
+**Total parameters (150M):**
+```
+Embedding:   32000 × 1024          =  32.8M
+Per block:   attn(4.2M) + mlp(8.6M) + norms(~0M)  =  12.85M
+9 blocks:    9 × 12.85M            = 115.6M
+Final norm:  1024                  = ~0M
+LM head:     TIED to embedding     =   0M  (reuses same weights)
+─────────────────────────────────────────
+TOTAL:       148.4M params
+```

plot_training.py ADDED Viewed

	@@ -0,0 +1,370 @@

+"""
+plot_training.py — Training Visualization Dashboard
+Reads train_log.jsonl and renders a clean, dark-mode training dashboard.
+Usage:
+  # Static plot of completed/current run
+  python plot_training.py --run_dir runs/run_001
+  # Live mode: refresh every 5 seconds while training runs
+  python plot_training.py --run_dir runs/run_001 --live
+  # Compare multiple runs
+  python plot_training.py --run_dir runs/run_001 runs/run_002
+Dashboard panels:
+  1. Training Loss          (raw + EMA smoothed)
+  2. Validation Loss        (if available)
+  3. Learning Rate schedule
+  4. Tokens / second        (throughput)
+  5. VRAM usage             (if logged)
+  6. Gradient norm          (if logged)
+"""
+import os
+import sys
+import json
+import time
+import argparse
+from pathlib import Path
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import matplotlib.ticker as ticker
+import numpy as np
+# ------------------------------------------------------------------ #
+#  STYLE
+# ------------------------------------------------------------------ #
+# Palette constants (hex RGB strings) shared by the rcParams below and by
+# the plotting code in this file.
+# Backgrounds, grid lines and text:
+DARK_BG      = "#0d1117"
+PANEL_BG     = "#161b22"
+GRID_COLOR   = "#21262d"
+TEXT_COLOR   = "#c9d1d9"
+MUTED_COLOR  = "#6e7681"
+# Accent colors for plotted series:
+ACCENT_BLUE  = "#58a6ff"
+ACCENT_GREEN = "#3fb950"
+ACCENT_ORANGE= "#d29922"
+ACCENT_RED   = "#f85149"
+ACCENT_PURPLE= "#bc8cff"
+ACCENT_TEAL  = "#39d353"
+# Applied to matplotlib's global rcParams at import time, so every figure
+# created after this module is imported picks up the dark theme.
+matplotlib.rcParams.update({
+    "figure.facecolor":  DARK_BG,
+    "axes.facecolor":    PANEL_BG,
+    "axes.edgecolor":    GRID_COLOR,
+    "axes.labelcolor":   TEXT_COLOR,
+    "axes.titlecolor":   TEXT_COLOR,
+    "xtick.color":       MUTED_COLOR,
+    "ytick.color":       MUTED_COLOR,
+    "grid.color":        GRID_COLOR,
+    "grid.linestyle":    "--",
+    "grid.linewidth":    0.5,
+    "grid.alpha":        0.7,
+    "legend.facecolor":  PANEL_BG,
+    "legend.edgecolor":  GRID_COLOR,
+    "legend.labelcolor": TEXT_COLOR,
+    "text.color":        TEXT_COLOR,
+    "font.family":       "DejaVu Sans",
+    "font.size":         10,
+    "axes.titlesize":    11,
+    "axes.labelsize":    10,
+})
+# ------------------------------------------------------------------ #
+#  DATA LOADING
+# ------------------------------------------------------------------ #
+def load_log(log_path: str) -> dict:
+    """
+    Loads train_log.jsonl and returns separate arrays for each metric.
+    Returns dict of metric_name -> list of values, aligned by step.
+    """
+    train_steps = []
+    train_loss  = []
+    val_steps   = []
+    val_loss    = []
+    lr_steps    = []
+    lr_vals     = []
+    tok_steps   = []
+    tok_vals    = []
+    vram_steps  = []
+    vram_vals   = []
+    grad_steps  = []
+    grad_vals   = []
+    if not os.path.exists(log_path):
+        return None
+    with open(log_path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            step = entry.get("step")
+            if step is None:
+                continue
+            if "loss" in entry:
+                train_steps.append(step)
+                train_loss.append(entry["loss"])
+            if "val_loss" in entry:
+                val_steps.append(step)
+                val_loss.append(entry["val_loss"])
+            if "lr" in entry:
+                lr_steps.append(step)
+                lr_vals.append(entry["lr"])
+            if "tok_per_sec" in entry:
+                tok_steps.append(step)
+                tok_vals.append(entry["tok_per_sec"])
+            if "vram_gb" in entry:
+                vram_steps.append(step)
+                vram_vals.append(entry["vram_gb"])
+            if "grad_norm" in entry and entry["grad_norm"] is not None:
+                grad_steps.append(step)
+                grad_vals.append(entry["grad_norm"])
+    return {
+        "train": (train_steps, train_loss),
+        "val":   (val_steps,   val_loss),
+        "lr":    (lr_steps,    lr_vals),
+        "tok":   (tok_steps,   tok_vals),
+        "vram":  (vram_steps,  vram_vals),
+        "grad":  (grad_steps,  grad_vals),
+    }
+def ema_smooth(values: list, alpha: float = 0.9) -> list:
+    """Exponential moving average smoothing."""
+    if not values:
+        return values
+    smoothed = [values[0]]
+    for v in values[1:]:
+        smoothed.append(alpha * smoothed[-1] + (1 - alpha) * v)
+    return smoothed
+# ------------------------------------------------------------------ #
+#  PLOTTING
+# ------------------------------------------------------------------ #
def make_dashboard(data_dict: dict, run_names: list, save_path: str = None,
                   vram_limit_gb: float = 4.0):
    """
    Render the six-panel training dashboard for one or more runs.

    Panels (3x2 grid): training loss, validation loss, learning rate,
    throughput, VRAM usage and gradient norm. The validation, VRAM and
    gradient-norm panels show a placeholder message when no run supplies
    data for them.

    Args:
        data_dict     : dict of run_name -> metrics dict. Each metrics dict
                        maps "train", "val", "lr", "tok", "vram" and "grad"
                        to a (steps, values) pair of lists. A None value is
                        skipped (no data for that run).
        run_names     : list of run display names. Kept for interface
                        compatibility; curve labels come from the keys of
                        data_dict.
        save_path     : if set, saves figure to this path instead of showing
        vram_limit_gb : height in GB of the dotted reference line on the
                        VRAM panel (default 4.0); pass None to omit it.
    """
    fig = plt.figure(figsize=(16, 10), facecolor=DARK_BG)
    fig.suptitle(
        "SLLM  Training Dashboard",
        fontsize=16,
        fontweight="bold",
        color=TEXT_COLOR,
        y=0.98,
    )
    # 3x2 grid of panels
    gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.45, wspace=0.3,
                           left=0.06, right=0.97, top=0.93, bottom=0.06)
    ax_loss = fig.add_subplot(gs[0, 0])
    ax_val  = fig.add_subplot(gs[0, 1])
    ax_lr   = fig.add_subplot(gs[1, 0])
    ax_tok  = fig.add_subplot(gs[1, 1])
    ax_vram = fig.add_subplot(gs[2, 0])
    ax_grad = fig.add_subplot(gs[2, 1])
    colors = [ACCENT_BLUE, ACCENT_GREEN, ACCENT_ORANGE, ACCENT_PURPLE]
    has_val = has_vram = has_grad = False

    def _annotate_last(ax, x, y, color):
        """Write the final value of a curve just right of its last point."""
        ax.annotate(
            f"{y:.4f}",
            xy=(x, y),
            xytext=(5, 0), textcoords="offset points",
            color=color, fontsize=8, va="center",
        )

    # Colors cycle by position in data_dict, so a skipped (None) run still
    # consumes its color slot and the remaining runs keep stable colors.
    for idx, (run_name, data) in enumerate(data_dict.items()):
        if data is None:
            continue
        color = colors[idx % len(colors)]
        # --- Train loss ------------------------------------------ #
        steps, loss = data["train"]
        if steps:
            smoothed = ema_smooth(loss, alpha=0.92)
            # Faint raw curve underneath, bold smoothed curve on top.
            ax_loss.plot(steps, loss,     color=color, alpha=0.25, linewidth=0.8)
            ax_loss.plot(steps, smoothed, color=color, alpha=1.0,  linewidth=1.8,
                         label=run_name)
            _annotate_last(ax_loss, steps[-1], smoothed[-1], color)
        # --- Val loss -------------------------------------------- #
        vsteps, vloss = data["val"]
        if vsteps:
            has_val = True
            ax_val.plot(vsteps, vloss, color=color, linewidth=2, marker="o",
                        markersize=4, label=run_name)
            _annotate_last(ax_val, vsteps[-1], vloss[-1], color)
        # --- LR -------------------------------------------------- #
        lsteps, lvals = data["lr"]
        if lsteps:
            ax_lr.plot(lsteps, lvals, color=color, linewidth=1.5, label=run_name)
        # --- Throughput ------------------------------------------ #
        tsteps, tvals = data["tok"]
        if tsteps:
            avg_tok = np.mean(tvals)
            ax_tok.plot(tsteps, tvals, color=color, alpha=0.6, linewidth=1.0)
            ax_tok.axhline(avg_tok, color=color, linewidth=1.5, linestyle="--",
                           label=f"{run_name} (avg {avg_tok:.0f})")
        # --- VRAM ------------------------------------------------- #
        vsteps2, vvals = data["vram"]
        if vsteps2:
            has_vram = True
            ax_vram.plot(vsteps2, vvals, color=color, linewidth=1.5, label=run_name)
        # --- Grad norm ------------------------------------------- #
        gsteps, gvals = data["grad"]
        if gsteps:
            has_grad = True
            smoothed_g = ema_smooth(gvals, alpha=0.85)
            ax_grad.plot(gsteps, gvals,      color=color, alpha=0.2, linewidth=0.8)
            ax_grad.plot(gsteps, smoothed_g, color=color, linewidth=1.5, label=run_name)

    # --- Style panels -------------------------------------------- #
    def _style(ax, title, xlabel, ylabel, legend=True):
        """Apply title, axis labels, grid and (if any curve is labelled) a legend."""
        ax.set_title(title, fontweight="bold", pad=8)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.grid(True)
        ax.tick_params(which="both", length=3)
        if legend and ax.get_legend_handles_labels()[0]:
            ax.legend(fontsize=8, loc="upper right")

    def _placeholder(ax, title, message):
        """Fill an empty panel with a centred message and its title."""
        ax.text(0.5, 0.5, message, ha="center", va="center",
                transform=ax.transAxes, color=MUTED_COLOR, fontsize=11)
        ax.set_title(title, fontweight="bold", pad=8)

    _style(ax_loss, "Training Loss (EMA smoothed)", "Step", "Loss")
    _style(ax_lr,   "Learning Rate Schedule",       "Step", "LR")
    _style(ax_tok,  "Throughput",                   "Step", "Tokens / sec")
    if has_val:
        _style(ax_val, "Validation Loss", "Step", "Val Loss")
    else:
        _placeholder(ax_val, "Validation Loss", "No validation data yet")
    if has_vram:
        _style(ax_vram, "VRAM Usage", "Step", "GB")
        if vram_limit_gb is not None:
            # ":g" renders the default 4.0 as "4", i.e. "4 GB limit".
            ax_vram.axhline(vram_limit_gb, color=ACCENT_RED, linewidth=1,
                            linestyle=":", alpha=0.6,
                            label=f"{vram_limit_gb:g} GB limit")
        # Rebuild the legend so it includes the reference line added above.
        ax_vram.legend(fontsize=8)
    else:
        _placeholder(ax_vram, "VRAM Usage", "No VRAM data\n(requires CUDA)")
    if has_grad:
        _style(ax_grad, "Gradient Norm (EMA smoothed)", "Step", "Norm")
    else:
        _placeholder(ax_grad, "Gradient Norm", "No gradient norm data")
    # LR scientific notation
    ax_lr.yaxis.set_major_formatter(ticker.ScalarFormatter(useMathText=True))
    ax_lr.ticklabel_format(style="sci", axis="y", scilimits=(0, 0))
    if save_path:
        plt.savefig(save_path, dpi=150, bbox_inches="tight", facecolor=DARK_BG)
        print(f"[PLOT] Saved to {save_path}")
    else:
        plt.show()
+# ------------------------------------------------------------------ #
+#  CLI
+# ------------------------------------------------------------------ #
def parse_args():
    """
    Parse the dashboard's command-line options from ``sys.argv``.

    Returns an ``argparse.Namespace`` with ``run_dir`` (list of paths),
    ``live`` (bool), ``interval`` (int seconds) and ``save`` (path or None).
    """
    parser = argparse.ArgumentParser(description="SLLM Training Dashboard")
    # One (flag, add_argument keyword arguments) entry per option.
    option_table = (
        ("--run_dir", {
            "nargs": "+",
            "default": ["runs/run_001"],
            "help": "One or more run directories to plot",
        }),
        ("--live", {
            "action": "store_true",
            "help": "Refresh plot every --interval seconds (live mode)",
        }),
        ("--interval", {
            "type": int,
            "default": 10,
            "help": "Refresh interval in seconds for --live mode",
        }),
        ("--save", {
            "type": str,
            "default": None,
            "help": "Save plot to this path instead of showing interactively",
        }),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main():
    """
    CLI entry point: load each run's ``train_log.jsonl`` and draw the dashboard.

    Without ``--live`` the dashboard is rendered once. With ``--live`` the
    logs are re-read and the figure redrawn every ``--interval`` seconds
    until Ctrl+C.
    """
    args = parse_args()
    run_dirs  = args.run_dir
    run_names = [Path(d).name for d in run_dirs]

    def _reload_and_plot():
        """Re-read every log and redraw; do nothing if no train steps exist yet."""
        data_dict = {}
        for name, run_dir in zip(run_names, run_dirs):
            log_path = os.path.join(run_dir, "train_log.jsonl")
            data = load_log(log_path)
            if data is None:
                print(f"[WARN] No log found at: {log_path}")
            data_dict[name] = data
        # Number of logged train steps per run that actually loaded.
        steps_info = {n: len(d["train"][0]) for n, d in data_dict.items() if d}
        if sum(steps_info.values()) == 0:
            print("[PLOT] No data logged yet. Waiting...")
            return
        print(f"[PLOT] Plotting {steps_info} train steps")
        # Drop the previous figure so live mode does not accumulate windows.
        plt.close("all")
        make_dashboard(data_dict, run_names, save_path=args.save)

    if args.live:
        print(f"[LIVE] Refreshing every {args.interval}s  (Ctrl+C to stop)")
        if sys.platform == "win32":
            matplotlib.use("TkAgg")
        # make_dashboard() ends in plt.show(), which blocks until the window
        # is closed unless interactive mode is on. Enable it so the refresh
        # loop keeps running while the window stays open.
        plt.ion()
        try:
            while True:
                _reload_and_plot()
                # Runs the GUI event loop (or just sleeps if nothing is shown).
                plt.pause(args.interval)
        except KeyboardInterrupt:
            print("\n[LIVE] Stopped.")
    else:
        _reload_and_plot()


if __name__ == "__main__":
    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+# requirements.txt — SLLM project
+# Install into the 'pytorch' conda env:
+#   conda run -n pytorch pip install -r requirements.txt
+# Core ML
+torch>=2.3.0
+torchvision
+# Data
+datasets>=2.14.0        # HuggingFace datasets (streaming)
+tokenizers>=0.15.0      # fast BPE tokenizer
+transformers>=4.40.0    # PreTrainedTokenizerFast
+# Utilities
+numpy>=1.26.0
+tqdm
+matplotlib              # training plots
+rich                    # pretty terminal output (optional)
+# Dev

run.md ADDED Viewed

	@@ -0,0 +1,34 @@

+Start training (first run):
+python train.py ^
+  --config 150M ^
+  --data_dir tokenizer/data ^
+  --batch_size 2 ^
+  --grad_accum 16 ^
+  --grad_checkpoint ^
+  --dtype bf16 ^
+  --max_steps 5000 ^
+  --run_dir runs/sllm_150m ^
+  --log_every 10 ^
+  --save_every 500 ^
+  --val_every 500 ^
+  --val_steps 20 ^
+  --warmup_steps 200
+Resume from where you stopped:
+python train.py --resume --data_dir tokenizer/data --batch_size 2 --grad_accum 16 --grad_checkpoint --dtype bf16 --extra_steps 5000 --run_dir runs/sllm_150m --log_every 10 --save_every 500 --val_every 500 --val_steps 20 --warmup_steps 200
+Plot while training (in a second terminal):
+conda activate pytorch
+cd c:\geetesh\aimldl\projects\sllm
+python plot_training.py --run_dir runs/sllm_150m --live --interval 30
+python finetune/prepare_data.py
+python finetune/sft_train.py --base_ckpt runs/sllm_150m/ckpt_0011500.pt --run_dir runs/sllm_150m_chat --max_steps 2500 --batch_size 4 --grad_accum 8 --grad_checkpoint
+python finetune/chat.py --run_dir runs/sllm_150m_chat

test_chatmodel.py ADDED Viewed

	@@ -0,0 +1,366 @@

+"""
+test_chatmodel.py — Interactive CLI chat and evaluation for the fine-tuned SLLM chat model.
+Usage:
+    python test_chatmodel.py --run_dir runs/sllm_150m_chat
+    python test_chatmodel.py --run_dir runs/sllm_150m_chat --mode sample
+In interactive mode:
+    Type your message and press Enter.
+    Special commands:
+        /reset          Clear conversation history
+        /system <text>  Change the system prompt
+        /quit           Exit the chat
+"""
+import os
+import sys
+import argparse
+from pathlib import Path
+import torch
+import torch.nn as nn
+from torch.amp import autocast
+from transformers import PreTrainedTokenizerFast
+# Add project root to path
+PROJECT_ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(PROJECT_ROOT))
+from model.config import SLLM_150M
+from model.model  import SLLM
+DEFAULT_SYSTEM  = "You are a helpful, concise assistant."
+DEFAULT_RUN_DIR = str(PROJECT_ROOT / "runs" / "sllm_150m_chat")
+# ------------------------------------------------------------------ #
+#  HELPERS
+# ------------------------------------------------------------------ #
def find_latest_ckpt(run_dir: str) -> str:
    """
    Return the path of the newest checkpoint file in ``run_dir``.

    Candidates are files named ``ckpt_*.pt`` (which includes ``ckpt_sft_*.pt``).
    "Newest" means last in lexicographic filename order, so ``ckpt_sft_*``
    names rank after plain numeric ``ckpt_<digits>`` names.

    Raises:
        FileNotFoundError: if ``run_dir`` is not a directory or holds no
            matching checkpoint.
    """
    if not os.path.isdir(run_dir):
        raise FileNotFoundError(f"Run directory '{run_dir}' does not exist.")
    # "ckpt_sft_" names also start with "ckpt_", so one prefix test suffices.
    candidates = [
        entry for entry in os.listdir(run_dir)
        if entry.startswith("ckpt_") and entry.endswith(".pt")
    ]
    if len(candidates) == 0:
        raise FileNotFoundError(
            f"No checkpoints found in '{run_dir}'.\n"
            f"Please ensure you have trained the model or point to the correct folder."
        )
    newest = max(candidates)
    return os.path.join(run_dir, newest)
def resize_token_embeddings(model: SLLM, new_vocab_size: int):
    """
    Resize the model's token embedding matrix to ``new_vocab_size`` rows.

    Growing keeps every existing row and initialises the added rows to the
    mean of the existing embeddings. Shrinking keeps the first
    ``new_vocab_size`` rows. Nothing happens when the size already matches
    ``model.config.vocab_size``.

    Side effects: replaces ``model.token_emb``, re-ties ``model.lm_head.weight``
    to the new embedding weight, and updates ``model.config.vocab_size``.
    """
    old_size = model.config.vocab_size
    if new_vocab_size == old_size:
        return
    d_model    = model.config.d_model
    old_weight = model.token_emb.weight.data
    device     = old_weight.device
    dtype      = old_weight.dtype
    # Rows that survive the resize (all of them when growing).
    kept = min(old_size, new_vocab_size)
    new_weight = torch.zeros(new_vocab_size, d_model, dtype=dtype, device=device)
    new_weight[:kept] = old_weight[:kept]
    if new_vocab_size > old_size:
        # Give added tokens the mean embedding (broadcast over the new rows)
        # rather than zeros or random values.
        new_weight[old_size:] = old_weight.mean(dim=0)
    new_emb = nn.Embedding(new_vocab_size, d_model).to(device=device, dtype=dtype)
    new_emb.weight.data = new_weight
    model.token_emb = new_emb
    # Keep the output projection tied to the (new) embedding matrix.
    model.lm_head.weight = model.token_emb.weight
    model.config.vocab_size = new_vocab_size
    print(f"  [INFO] Resized model vocab embedding from {old_size:,} to {new_vocab_size:,}")
def load_model_and_tokenizer(run_dir: str, device: torch.device):
    """
    Load the chat tokenizer and the newest checkpoint for ``run_dir``.

    Tokenizer: prefers ``finetune/data`` (when it contains ``tokenizer.json``);
    otherwise loads ``tokenizer/fineweb_edu_tokenizer`` and adds the two
    ChatML special tokens. Checkpoint: newest in ``run_dir``, falling back to
    ``runs/sllm_150m`` when ``run_dir`` has none.

    Returns:
        (model, tokenizer, ckpt_path, step, loss) with the model in eval mode.

    Raises:
        FileNotFoundError: if no tokenizer directory or no checkpoint exists.
    """
    # ---- Tokenizer ------------------------------------------------- #
    extended_dir = PROJECT_ROOT / "finetune" / "data"
    fallback_dir = PROJECT_ROOT / "tokenizer" / "fineweb_edu_tokenizer"
    use_extended = os.path.exists(extended_dir / "tokenizer.json")
    if not use_extended and not os.path.exists(fallback_dir):
        raise FileNotFoundError("Could not find a tokenizer directory.")
    tok_path  = str(extended_dir if use_extended else fallback_dir)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(tok_path)
    if use_extended:
        print(f"  Tokenizer: Loaded extended tokenizer from '{tok_path}'")
    else:
        tokenizer.add_special_tokens({
            "additional_special_tokens": ["<|im_start|>", "<|im_end|>"]
        })
        print(f"  Tokenizer: Loaded base tokenizer from '{tok_path}' and added ChatML tokens")
    # ---- Checkpoint ------------------------------------------------ #
    try:
        ckpt_path = find_latest_ckpt(run_dir)
    except FileNotFoundError:
        # SFT directory missing or empty: use the pretraining run instead.
        print(f"  [WARN] No checkpoint found in '{run_dir}'. Trying pretraining base run...")
        ckpt_path = find_latest_ckpt(str(PROJECT_ROOT / "runs" / "sllm_150m"))
    print(f"  Loading checkpoint: {ckpt_path}")
    # NOTE: weights_only=False unpickles arbitrary objects; only load
    # checkpoints from a trusted source.
    state = torch.load(ckpt_path, map_location=device, weights_only=False)
    # ---- Model ----------------------------------------------------- #
    model = SLLM(SLLM_150M).to(device)
    # Match the embedding size to the checkpoint before loading its weights.
    resize_token_embeddings(model, state.get("vocab_size", len(tokenizer)))
    model.load_state_dict(state["model_state_dict"])
    model.eval()
    return (
        model,
        tokenizer,
        ckpt_path,
        state.get("step", "?"),
        state.get("loss", float("nan")),
    )
+# ------------------------------------------------------------------ #
+#  PROMPT BUILDING
+# ------------------------------------------------------------------ #
def build_prompt(history: list[dict], system_prompt: str,
                 tokenizer: PreTrainedTokenizerFast) -> torch.Tensor:
    """
    Render a conversation as ChatML text and tokenize it.

    Layout: a system block, one block per ``history`` turn (each turn is a
    dict with "role" and "content"), then an opened assistant block that the
    model is expected to complete.

    Returns a ``(1, n_tokens)`` long tensor; no special tokens are added by
    the tokenizer beyond those written in the text.
    """
    blocks = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    blocks.extend(
        f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        for turn in history
    )
    # Leave the assistant block open so generation continues from here.
    blocks.append("<|im_start|>assistant\n")
    token_ids = tokenizer.encode("".join(blocks), add_special_tokens=False)
    return torch.tensor([token_ids], dtype=torch.long)
+# ------------------------------------------------------------------ #
+#  GENERATION
+# ------------------------------------------------------------------ #
+@torch.no_grad()
+def generate_response(
+    model:          SLLM,
+    input_ids:      torch.Tensor,
+    tokenizer:      PreTrainedTokenizerFast,
+    max_new_tokens: int   = 200,
+    temperature:    float = 0.7,
+    top_k:          int   = 40,
+    top_p:          float = 0.9,
+    device:         torch.device = None,
+    dtype_torch:    torch.dtype = torch.float32,
+    use_amp:        bool = False,
+) -> str:
+    """Generates a response from the model using top-k/top-p sampling."""
+    im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    eos_id    = tokenizer.eos_token_id
+    ids       = input_ids.to(device)
+    generated = []
+    for _ in range(max_new_tokens):
+        # Crop context to model window
+        ctx = ids if ids.shape[1] <= model.config.context_length \
+                  else ids[:, -model.config.context_length:]
+        with autocast(device_type=device.type, dtype=dtype_torch, enabled=use_amp):
+            logits, _ = model(ctx)                       # (1, T, V)
+        # Pull last token logits
+        logits = logits[:, -1, :]
+        if temperature == 0.0:
+            # Greedy
+            next_token = logits.argmax(dim=-1, keepdim=True)
+        else:
+            logits = logits / max(temperature, 1e-8)
+            # Top-k filtering
+            if top_k and top_k > 0:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = float("-inf")
+            # Top-p (nucleus) filtering
+            if top_p < 1.0:
+                sorted_logits, sorted_idx = torch.sort(logits, descending=True)
+                cumprobs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+                sorted_logits[cumprobs - torch.softmax(sorted_logits, dim=-1) > top_p] = float("-inf")
+                logits = torch.zeros_like(logits).scatter_(1, sorted_idx, sorted_logits)
+            probs      = torch.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)   # (1, 1)
+        tok_id = next_token.item()
+        # Stop if end of message or end of stream token is generated
+        if tok_id == im_end_id or tok_id == eos_id:
+            break
+        generated.append(tok_id)
+        ids = torch.cat([ids, next_token], dim=1)
+    return tokenizer.decode(generated, skip_special_tokens=True).strip()
+# ------------------------------------------------------------------ #
+#  MODES
+# ------------------------------------------------------------------ #
def run_interactive(model, tokenizer, device, dtype_torch, use_amp, args):
    """
    Run the terminal chat loop until the user quits.

    Keeps a running conversation history, supports the ``/reset``,
    ``/system <prompt>`` and ``/quit`` (also ``/exit``, ``quit``, ``exit``)
    commands, and drops the oldest turns when the prompt would not leave
    room for ``args.max_new_tokens`` in the model's context window.
    """
    system_prompt = args.system
    history = []
    rule = "=" * 60
    print("\n" + rule)
    print("  CHAT MODE (Interactive)")
    print(rule)
    print(f"  System prompt : {system_prompt}")
    print("  Commands      : /reset to clear memory | /system <prompt> | /quit to exit")
    print("─" * 60 + "\n")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nBye!")
            break
        if not user_input:
            continue
        # Commands are matched case-insensitively.
        command = user_input.lower()
        if command in ("/quit", "/exit", "quit", "exit"):
            print("Bye!")
            break
        if command == "/reset":
            history = []
            print("  [Conversation history reset]\n")
            continue
        if command.startswith("/system "):
            # Everything after "/system " (8 characters) is the new prompt.
            new_sys = user_input[8:].strip()
            if new_sys:
                system_prompt = new_sys
                history = []
                print("  [System prompt updated. History cleared.]\n")
            continue
        # Regular message: record it and build the ChatML prompt.
        history.append({"role": "user", "content": user_input})
        input_ids = build_prompt(history, system_prompt, tokenizer)
        # Largest prompt that still leaves room for the reply (+10 slack).
        budget = model.config.context_length - args.max_new_tokens - 10
        while input_ids.shape[1] > budget and len(history) > 2:
            # Drop the oldest user + assistant pair and rebuild.
            history = history[2:]
            input_ids = build_prompt(history, system_prompt, tokenizer)
        print("SLLM: ", end="", flush=True)
        response = generate_response(
            model, input_ids, tokenizer,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            device=device,
            dtype_torch=dtype_torch,
            use_amp=use_amp,
        )
        print(response + "\n")
        history.append({"role": "assistant", "content": response})
def run_sample(model, tokenizer, device, dtype_torch, use_amp, args):
    """
    Print the model's reply to a fixed list of single-turn prompts.

    Each prompt is sent as a fresh conversation using ``args.system`` as the
    system prompt and the sampling settings found on ``args``.
    """
    sample_prompts = [
        "Hello! Who are you?",
        "What is the capital of France?",
        "Write a quick, 3-line poem about a small robot learning to speak.",
        "Explain gravity in one simple sentence.",
    ]
    # Sampling settings shared by every prompt.
    generation_options = {
        "max_new_tokens": args.max_new_tokens,
        "temperature": args.temperature,
        "top_k": args.top_k,
        "top_p": args.top_p,
        "device": device,
        "dtype_torch": dtype_torch,
        "use_amp": use_amp,
    }
    heavy_rule = "=" * 60
    light_rule = "─" * 60
    print("\n" + heavy_rule)
    print("  SAMPLE EVALUATION MODE")
    print(heavy_rule)
    print(f"  System prompt: {args.system}")
    print(light_rule)
    for prompt in sample_prompts:
        print(f"\n[PROMPT] : {prompt}")
        single_turn = [{"role": "user", "content": prompt}]
        input_ids = build_prompt(single_turn, args.system, tokenizer)
        print("[SLLM]   : ", end="", flush=True)
        print(generate_response(model, input_ids, tokenizer, **generation_options))
    print("\n" + light_rule + "\n")
+# ------------------------------------------------------------------ #
+#  MAIN
+# ------------------------------------------------------------------ #
def main():
    """
    CLI entry point: parse options, load model + tokenizer, run the chosen mode.

    Any failure while loading is reported as ``[ERROR] ...`` and the function
    returns without starting a mode.
    """
    p = argparse.ArgumentParser(description="SLLM Chat Checker")
    p.add_argument("--run_dir",        type=str,   default=DEFAULT_RUN_DIR)
    p.add_argument("--mode",           type=str,   default="interactive", choices=["interactive", "sample"])
    p.add_argument("--temperature",    type=float, default=0.7)
    p.add_argument("--top_k",          type=int,   default=40)
    p.add_argument("--top_p",          type=float, default=0.9)
    p.add_argument("--max_new_tokens", type=int,   default=200)
    p.add_argument("--system",         type=str,   default=DEFAULT_SYSTEM)
    p.add_argument("--dtype",          type=str,   default="bf16", choices=["fp32", "fp16", "bf16"])
    args = p.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nDevice : {device}")
    if device.type == "cuda":
        print(f"GPU    : {torch.cuda.get_device_name(0)}")
    # Precision setup: reduced precision is only used on CUDA; anything else
    # (CPU, or bf16 on a GPU without bf16 support) runs in fp32.
    use_amp     = False
    dtype_torch = torch.float32
    effective   = "fp32"
    if args.dtype == "bf16" and device.type == "cuda" and torch.cuda.is_bf16_supported():
        dtype_torch, use_amp, effective = torch.bfloat16, True, "bf16"
    elif args.dtype == "fp16" and device.type == "cuda":
        dtype_torch, use_amp, effective = torch.float16, True, "fp16"
    # Report what will actually be used, not just what was requested.
    if effective == args.dtype:
        print(f"dtype  : {args.dtype}")
    else:
        print(f"dtype  : {effective} (requested {args.dtype} is unavailable on this device)")
    # Load Model and Tokenizer
    try:
        model, tokenizer, ckpt_path, step, loss = load_model_and_tokenizer(args.run_dir, device)
        print(f"  Step       : {step}")
        # NaN is the only value that is not equal to itself; it marks a
        # checkpoint that stored no loss.
        if loss == loss:
            print(f"  Loss       : {loss:.4f}")
    except Exception as e:
        # Top-level boundary: report any load failure and stop cleanly.
        print(f"\n[ERROR] Failed to load chat model: {e}")
        return
    if args.mode == "interactive":
        run_interactive(model, tokenizer, device, dtype_torch, use_amp, args)
    elif args.mode == "sample":
        run_sample(model, tokenizer, device, dtype_torch, use_amp, args)


if __name__ == "__main__":
    main()

test_checkpoint.py ADDED Viewed

	@@ -0,0 +1,290 @@

+"""
+test_checkpoint.py — Load a checkpoint and run inference / inspect it.
+QUICK START: Edit the variables in the CONFIG section below, then run:
+    python test_checkpoint.py
+Modes:
+  INTERACTIVE  — Chat loop: type prompts, model responds.
+  SAMPLE       — Auto-generate N samples from fixed prompts and exit.
+  INSPECT      — Just print checkpoint info (no generation).
+"""
+import os
+import sys
+import torch
+from torch.amp import autocast
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from model.config import SLLM_100M, SLLM_150M, ModelConfig
+from model.model  import SLLM
+# ================================================================== #
+#  ✏️  EDIT THESE VARIABLES
+# ================================================================== #
+# --- Checkpoint to load -------------------------------------------
+# Point to any .pt file inside a runs/ subfolder.
+# Examples:
+#   RUN_DIR   = "runs/sllm_150m"        # loads latest .pt in this folder
+#   CKPT_FILE = None                    # set to a specific filename to override
+#   CKPT_FILE = "ckpt_0002000.pt"       # or pick a specific step
+RUN_DIR   = "runs/sllm_150m"
+CKPT_FILE = None          # None = auto-pick latest checkpoint in RUN_DIR
+# --- Model config --------------------------------------------------
+# Must match what you trained with: "100M" or "150M"
+CONFIG = "150M"
+# --- Generation settings ------------------------------------------
+MAX_NEW_TOKENS = 100       # tokens to generate per prompt
+TEMPERATURE    = 0.8       # 0.0 = greedy, 1.0 = random, 0.8 = balanced
+TOP_K          = 50        # keep only top-k logits (0 = disabled)
+TOP_P          = 0.95      # nucleus sampling threshold (1.0 = disabled)
+# --- Mode ---------------------------------------------------------
+# "interactive" : chat loop in the terminal
+# "sample"      : run SAMPLE_PROMPTS list and exit
+# "inspect"     : just print checkpoint metadata, no generation
+MODE = "sample"
+# --- Prompts for SAMPLE mode --------------------------------------
+SAMPLE_PROMPTS = [
+    "Once upon a time",
+    "The meaning of life is",
+    "In the year 2050,",
+]
+# --- dtype --------------------------------------------------------
+# "bf16" (recommended on RTX cards), "fp16", or "fp32"
+DTYPE = "bf16"
+# ================================================================== #
+#  INTERNALS (no need to edit below)
+# ================================================================== #
def resolve_checkpoint(run_dir: str, ckpt_file) -> str:
    """
    Resolve which checkpoint file to load.

    With ``ckpt_file`` given, returns ``run_dir/ckpt_file``. With
    ``ckpt_file=None``, returns the ``ckpt_*.pt`` file in ``run_dir`` that
    sorts last by filename.

    Raises:
        FileNotFoundError: if the named file, the run directory, or any
            matching checkpoint is missing.
    """
    if ckpt_file is None:
        # Auto-pick: last name in lexicographic order.
        if not os.path.isdir(run_dir):
            raise FileNotFoundError(f"Run directory not found: {run_dir}")
        names = [
            entry for entry in os.listdir(run_dir)
            if entry.startswith("ckpt_") and entry.endswith(".pt")
        ]
        if len(names) == 0:
            raise FileNotFoundError(f"No checkpoints found in: {run_dir}")
        return os.path.join(run_dir, max(names))
    explicit = os.path.join(run_dir, ckpt_file)
    if os.path.isfile(explicit):
        return explicit
    raise FileNotFoundError(f"Checkpoint not found: {explicit}")
def load_model(ckpt_path: str, config_name: str, device, dtype_torch):
    """
    Build the model and load its weights from ``ckpt_path``.

    The config name stored in the checkpoint ("config_name"), when present,
    takes precedence over ``config_name``; a warning is printed when the two
    differ. The checkpoint is read first so the model is built only once,
    with the config that will actually be used.

    Args:
        ckpt_path   : path to the ``.pt`` checkpoint.
        config_name : "100M" or "150M"; used when the checkpoint names none.
        device      : device to map the checkpoint to and place the model on.
        dtype_torch : unused here; kept for interface compatibility.

    Returns:
        (model, cfg, step, loss) with the model in eval mode. ``step``
        defaults to "?" and ``loss`` to NaN when the checkpoint lacks them.

    Raises:
        KeyError: if the effective config name is not a known config.
    """
    cfg_map = {"100M": SLLM_100M, "150M": SLLM_150M}
    # NOTE: weights_only=False unpickles arbitrary objects; only load
    # checkpoints from a trusted source.
    ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
    # Prefer config_name stored in checkpoint (override CLI if available)
    ckpt_cfg_name = ckpt.get("config_name", config_name)
    if ckpt_cfg_name != config_name:
        print(f"  [WARN] Checkpoint config_name='{ckpt_cfg_name}' "
              f"differs from CONFIG='{config_name}'. "
              f"Using checkpoint's config: '{ckpt_cfg_name}'")
    if ckpt_cfg_name not in cfg_map:
        raise KeyError(
            f"Unknown model config '{ckpt_cfg_name}'; "
            f"expected one of {sorted(cfg_map)}"
        )
    cfg = cfg_map[ckpt_cfg_name]
    print(f"\n  Config  : {cfg}")
    model = SLLM(cfg).to(device)
    model.load_state_dict(ckpt["model_state_dict"])
    model.eval()
    step = ckpt.get("step", "?")
    loss = ckpt.get("loss", float("nan"))
    return model, cfg, step, loss
+@torch.no_grad()
+def generate(model, prompt_ids: list[int], cfg: ModelConfig, device,
+             dtype_torch, use_amp: bool,
+             max_new_tokens: int, temperature: float,
+             top_k: int, top_p: float) -> list[int]:
+    """Token-by-token autoregressive generation."""
+    ids = torch.tensor([prompt_ids], dtype=torch.long, device=device)
+    ctx_len = cfg.context_length
+    for _ in range(max_new_tokens):
+        # Crop to context window
+        ids_crop = ids[:, -ctx_len:]
+        with autocast(device_type=device.type, dtype=dtype_torch, enabled=use_amp):
+            logits, _ = model(ids_crop)
+        # Logits for the last position
+        logits = logits[:, -1, :]  # (1, vocab)
+        if temperature == 0.0:
+            # Greedy
+            next_id = logits.argmax(dim=-1, keepdim=True)
+        else:
+            logits = logits / temperature
+            # Top-K filtering
+            if top_k > 0:
+                vals, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < vals[:, [-1]]] = float("-inf")
+            # Top-P (nucleus) filtering
+            if top_p < 1.0:
+                sorted_logits, sorted_idx = torch.sort(logits, descending=True)
+                cumprobs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+                # Remove tokens with cumulative prob > top_p
+                sorted_logits[cumprobs - torch.softmax(sorted_logits, dim=-1) > top_p] = float("-inf")
+                logits = torch.zeros_like(logits).scatter_(1, sorted_idx, sorted_logits)
+            probs   = torch.softmax(logits, dim=-1)
+            next_id = torch.multinomial(probs, num_samples=1)
+        ids = torch.cat([ids, next_id], dim=1)
+    return ids[0].tolist()
def char_tokenize(text: str, vocab_size: int = 32_000) -> list[int]:
    """
    Fallback character-level tokenizer.

    Your model uses a real tokenizer — swap this out with yours if available.
    Each char maps to its Unicode code point, capped at ``vocab_size - 1`` so
    every id is a valid index into a vocabulary of that size. The default of
    32,000 gives the cap of 31,999.
    """
    max_id = vocab_size - 1
    return [min(ord(c), max_id) for c in text]
def char_detokenize(ids: list[int]) -> str:
    """
    Turn ids back into text for the char-level fallback tokenizer.

    Ids in the printable ASCII range (32..126) become their character; every
    other id is shown as "?".
    """
    pieces = []
    for code in ids:
        is_printable_ascii = 32 <= code < 127
        pieces.append(chr(code) if is_printable_ascii else "?")
    return "".join(pieces)
def try_load_sentencepiece(tokenizer_dir="tokenizer/fineweb_edu_tokenizer"):
    """
    Return ``(encode, decode)`` callables for the tokenizer in ``tokenizer_dir``.

    Despite the name, this loads a HuggingFace ``PreTrainedTokenizerFast``.
    If anything goes wrong (missing package, missing files, ...) it prints
    the reason and falls back to the character-level tokenizer pair, whose
    output will not match what the model was trained on.
    """
    try:
        from transformers import PreTrainedTokenizerFast
        tok = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)

        def encode(text):
            return tok.encode(text)

        def decode(ids):
            return tok.decode(ids, skip_special_tokens=True)

        print(f"  Tokenizer: HuggingFace tokenizer loaded from '{tokenizer_dir}'")
        print(f"             vocab_size={tok.vocab_size:,}  eos_id={tok.eos_token_id}")
        return encode, decode
    except Exception as e:
        # Deliberate best-effort: any load failure degrades to the fallback.
        print(f"  Tokenizer: Could not load HuggingFace tokenizer ({e})")
        print("             Falling back to char tokenizer — output will be garbled!")
        return char_tokenize, char_detokenize
def run_interactive(model, cfg, device, dtype_torch, use_amp, encode, decode):
    """
    Prompt loop: read a prompt, generate a continuation, print it.

    Uses the module-level MAX_NEW_TOKENS / TEMPERATURE / TOP_K / TOP_P
    settings. The loop ends on "quit", "exit", an empty line, EOF or Ctrl+C.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("  INTERACTIVE MODE  (type 'quit' or 'exit' to stop)")
    print(rule)
    print(f"  max_new_tokens : {MAX_NEW_TOKENS}")
    print(f"  temperature    : {TEMPERATURE}")
    print(f"  top_k / top_p  : {TOP_K} / {TOP_P}")
    print()
    stop_words = ("quit", "exit", "")
    while True:
        try:
            prompt = input("Prompt> ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\n  Exiting.")
            break
        if prompt.lower() in stop_words:
            print("  Exiting.")
            break
        prompt_ids = encode(prompt)
        output_ids = generate(
            model, prompt_ids, cfg, device, dtype_torch, use_amp,
            MAX_NEW_TOKENS, TEMPERATURE, TOP_K, TOP_P,
        )
        # generate() returns prompt + continuation; show the continuation only.
        continuation = output_ids[len(prompt_ids):]
        print(f"\nGenerated: {decode(continuation)}\n")
def run_sample(model, cfg, device, dtype_torch, use_amp, encode, decode):
    """
    Generate and print one continuation for every entry in SAMPLE_PROMPTS.

    Uses the module-level MAX_NEW_TOKENS / TEMPERATURE / TOP_K / TOP_P
    settings; only the newly generated tokens are printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("  SAMPLE MODE")
    print(rule)
    number = 0
    for prompt in SAMPLE_PROMPTS:
        number += 1
        print(f"\n[{number}] Prompt : {prompt!r}")
        prompt_ids = encode(prompt)
        output_ids = generate(
            model, prompt_ids, cfg, device, dtype_torch, use_amp,
            MAX_NEW_TOKENS, TEMPERATURE, TOP_K, TOP_P,
        )
        # Skip the echoed prompt at the front of the output.
        continuation = output_ids[len(prompt_ids):]
        print(f"    Output : {decode(continuation)}")
def run_inspect(ckpt_path, step, loss, cfg):
    """
    Print checkpoint metadata without generating anything.

    Args:
        ckpt_path : path of the checkpoint that was loaded.
        step      : training step stored in the checkpoint (any printable).
        loss      : stored loss; floats are shown with 4 decimals, anything
                    else is printed as-is.
        cfg       : model config; must provide ``count_params()``.
    """
    # Same aligned label for both branches (the non-float branch used to
    # print a misaligned "  Loss: ...").
    loss_text = f"{loss:.4f}" if isinstance(loss, float) else f"{loss}"
    print("\n" + "="*60)
    print("  INSPECT MODE")
    print("="*60)
    print(f"  Checkpoint : {ckpt_path}")
    print(f"  Step       : {step}")
    print(f"  Loss       : {loss_text}")
    print(f"  Config     : {cfg}")
    print(f"  Params     : {cfg.count_params()/1e6:.1f}M")
    print()
def main():
    """
    Script entry point driven by the module-level CONFIG section.

    Validates MODE, picks device and precision, resolves and loads the
    checkpoint, then runs the selected mode ("inspect", "interactive" or
    "sample").
    """
    # Fail fast on a bad MODE instead of loading the checkpoint and the
    # tokenizer first and only then reporting the typo.
    if MODE not in ("interactive", "sample", "inspect"):
        print(f"  [ERROR] Unknown MODE: '{MODE}'. Use 'interactive', 'sample', or 'inspect'.")
        return
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nDevice : {device}")
    if device.type == "cuda":
        print(f"GPU    : {torch.cuda.get_device_name(0)}")
        print(f"VRAM   : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    # dtype setup: reduced precision is only used on CUDA; anything else
    # (CPU, or bf16 on a GPU without bf16 support) runs in fp32.
    use_amp     = False
    dtype_torch = torch.float32
    effective   = "fp32"
    if DTYPE == "bf16" and device.type == "cuda" and torch.cuda.is_bf16_supported():
        dtype_torch, use_amp, effective = torch.bfloat16, True, "bf16"
    elif DTYPE == "fp16" and device.type == "cuda":
        dtype_torch, use_amp, effective = torch.float16, True, "fp16"
    # Report what will actually be used, not just what was requested.
    if effective == DTYPE:
        print(f"dtype  : {DTYPE}")
    else:
        print(f"dtype  : {effective} (requested {DTYPE} is unavailable on this device)")
    # Resolve checkpoint path
    ckpt_path = resolve_checkpoint(RUN_DIR, CKPT_FILE)
    print(f"\nCheckpoint: {ckpt_path}")
    # Load model
    model, cfg, step, loss = load_model(ckpt_path, CONFIG, device, dtype_torch)
    print(f"  Loaded    : step={step}, loss={loss:.4f}")
    print(f"  Params    : {model.count_params()/1e6:.1f}M")
    if MODE == "inspect":
        run_inspect(ckpt_path, step, loss, cfg)
        return
    # Load tokenizer
    encode, decode = try_load_sentencepiece()
    if MODE == "interactive":
        run_interactive(model, cfg, device, dtype_torch, use_amp, encode, decode)
    else:
        # MODE was validated above, so this is "sample".
        run_sample(model, cfg, device, dtype_torch, use_amp, encode, decode)


if __name__ == "__main__":
    main()

tokenizer/bpe.py ADDED Viewed

	@@ -0,0 +1,134 @@

+from tokenizers import Tokenizer, AddedToken
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import Sequence, ByteLevel
+from tokenizers.decoders import ByteLevel as ByteLevelDecoder
+from pretokenizer import get_pretokenizer
+# Target size of the final vocabulary; passed to BpeTrainer(vocab_size=...)
+VOCAB_SIZE    = 32_000
+# Passed to BpeTrainer(min_frequency=...)
+MIN_FREQUENCY = 3
+# Special tokens handed to the trainer; also the keys returned by get_special_token_ids()
+SPECIAL_TOKENS = ["<|endoftext|>"]
+def build_tokenizer() -> Tokenizer:
+    """
+    Builds and returns an untrained tokenizer with all components configured.
+    Call .train_from_iterator() or .train() on the returned object to train it.
+    Pipeline:
+        Raw text
+            -> Normalizer     (handled externally in our normalization() fn)
+            -> Pre-tokenizer  (custom regex splits + byte level conversion)
+            -> BPE Model      (learns merge rules during training)
+            -> Decoder        (reverses byte level for human readable output)
+    Returns:
+        an untrained Tokenizer with model, pre-tokenizer and decoder attached
+    """
+    # ---- 1. BPE Model ------------------------------------------------
+    # No unk token is configured and byte_fallback is switched on.
+    # NOTE(review): after the ByteLevel step every symbol is already one of the
+    # 256 byte-level characters, so byte_fallback is presumably never exercised;
+    # confirm before relying on <0xXX> tokens being present in the vocab.
+    model = BPE(
+        unk_token=None,
+        byte_fallback=True,
+    )
+    tokenizer = Tokenizer(model)
+    # ---- 2. Pre-tokenizer --------------------------------------------
+    # Sequence chains two pre-tokenizers in order:
+    #
+    # Step A: Our custom regex splits text into meaningful chunks
+    #         (contractions, abbreviations, numbers, operators etc.)
+    #
+    # Step B: ByteLevel converts each chunk's characters to their
+    #         byte representation using a 256-char printable alphabet
+    #         e.g. é (bytes 0xC3 0xA9) -> "Ã©"
+    #
+    # add_prefix_space=False because our regex already handles
+    # whitespace explicitly as its own token category.
+    #
+    # use_regex=False is required: by default ByteLevel also applies the
+    # GPT-2 splitting regex, which would cut the chunks from Step A apart
+    # again (e.g. "1.5e-3", "U.S.A", "snake_case") and undo the custom split.
+    # Tokenizers trained before this change embed the old behaviour and must
+    # be retrained to pick it up.
+    tokenizer.pre_tokenizer = Sequence([
+        get_pretokenizer(),                                   # Step A - our regex
+        ByteLevel(add_prefix_space=False, use_regex=False),   # Step B - byte conversion only
+    ])
+    # ---- 3. Decoder --------------------------------------------------
+    # Reverses the ByteLevel encoding so output is human readable
+    # Without this tokenizer.decode() would return "Ã©" instead of "é"
+    tokenizer.decoder = ByteLevelDecoder()
+    return tokenizer
+# ------------------------------------------------------------------ #
+#  TRAINER CONFIG
+# ------------------------------------------------------------------ #
+def build_trainer() -> BpeTrainer:
+    """
+    Create the BpeTrainer used to learn the merge table.
+    Intended budget for vocab_size=32_000:
+        1       special token (<|endoftext|>)
+      + 256     byte-level base symbols (seeded through initial_alphabet)
+      + 31,743  learned BPE merge tokens
+      = 32,000  total
+    The base symbols count towards vocab_size, so the requested size is
+    the size of the final vocabulary.
+    """
+    trainer_options = dict(
+        vocab_size=VOCAB_SIZE,
+        min_frequency=MIN_FREQUENCY,
+        special_tokens=SPECIAL_TOKENS,
+        # progress bar while training
+        show_progress=True,
+        # seed the vocabulary with the full byte-level alphabet before any merge
+        initial_alphabet=ByteLevel.alphabet(),
+    )
+    return BpeTrainer(**trainer_options)
+#  CONVENIENCE: get special token IDs after training
+def get_special_token_ids(tokenizer: Tokenizer) -> dict:
+    """
+    Map every entry of SPECIAL_TOKENS to its ID in the given tokenizer.
+    Meant to be called AFTER training, once the vocabulary is final;
+    the value is whatever tokenizer.token_to_id() returns for that token.
+    Example:
+        ids = get_special_token_ids(tokenizer)
+        eot_id = ids["<|endoftext|>"]  # typically 0
+    """
+    ids_by_token = {}
+    for special in SPECIAL_TOKENS:
+        ids_by_token[special] = tokenizer.token_to_id(special)
+    return ids_by_token
+#  QUICK SANITY CHECK
+if __name__ == "__main__":
+    # Build both objects, then dump their configuration for a visual check
+    print("Building tokenizer...")
+    tokenizer = build_tokenizer()
+    print("Building trainer...")
+    trainer = build_trainer()
+    print("\nPre-tokenizer chain:")
+    print(f"  {tokenizer.pre_tokenizer}")
+    print("\nDecoder:")
+    print(f"  {tokenizer.decoder}")
+    print("\nTrainer config:")
+    trainer_fields = (
+        ("vocab_size    ", trainer.vocab_size),
+        ("min_frequency ", trainer.min_frequency),
+        ("special_tokens", trainer.special_tokens),
+    )
+    for label, value in trainer_fields:
+        print(f"  {label}: {value}")
+    alphabet_size = len(ByteLevel.alphabet())
+    print(f"  base alphabet : {alphabet_size} byte tokens")
+    print("\nAll good - ready to train.")
+    print("Next step: pipe FineWeb-Edu text into tokenizer.train_from_iterator()")

tokenizer/fineweb_edu_tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/fineweb_edu_tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>"
+}

tokenizer/fineweb_edu_tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/fineweb_edu_tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "tokenizer_class": "TokenizersBackend",
+  "truncation_side": "right",
+  "unk_token": null
+}

tokenizer/normalizer.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import re
+import html
+import unicodedata
+def normalization(text):
+    """
+    Clean a raw document string before it is tokenized.
+    Steps, in order: replace HTML comments/tags with a space, decode HTML
+    entities, NFC-normalize, drop control / zero-width / replacement
+    characters, turn Unicode line separators and CR/CRLF into "\n",
+    collapse runs of spaces, strip trailing spaces/tabs on each line,
+    limit consecutive newlines to two, and strip both ends.
+    Args:
+        text: raw document string
+    Returns:
+        the cleaned string
+    """
+    # Strip HTML comments and tags. Tags spanning several lines are matched
+    # too, because [^>] also matches newlines.
+    # Only "<" directly followed by a letter (optionally after "/" or "!")
+    # counts as markup. The earlier pattern r'<[^>]+>' treated ANY "<" ... ">"
+    # pair as a tag and deleted the text in between, e.g. "a < b and c > d".
+    # Known limitation: unspaced comparisons such as "a<b and c>d" still
+    # look like a tag and are removed.
+    text = re.sub(r'<!--.*?-->|<[/!]?[A-Za-z][^>]*>', ' ', text, flags=re.DOTALL)
+    # HTML entity decoding (runs after tag stripping, so entity-encoded
+    # markup such as "&lt;b&gt;" is kept as literal text)
+    text = html.unescape(text)
+    # NFC normalization
+    text = unicodedata.normalize('NFC', text)
+    # Control characters — including \x7f (DEL); \t, \n and \r are kept here
+    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+    # Unicode line/paragraph separators → newline (structural, not removed)
+    text = re.sub(r'[\u2028\u2029]', '\n', text)
+    # Zero-width characters, BOM and soft hyphen
+    text = re.sub(r'[\u200b\u200c\u200d\ufeff\u00ad]', '', text)
+    # Replacement character
+    text = text.replace('\ufffd', '')
+    # Normalize line endings
+    text = text.replace('\r\n', '\n')
+    text = text.replace('\r', '\n')
+    # Collapse runs of spaces to a single space (tabs are left untouched)
+    text = re.sub(r' +', ' ', text)
+    # Trailing spaces/tabs at end of line
+    text = re.sub(r'[ \t]+\n', '\n', text)
+    # Collapse excess newlines
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    text = text.strip()
+    return text

tokenizer/post_processor.py ADDED Viewed

	@@ -0,0 +1,152 @@

+from tokenizers.processors import TemplateProcessing
+from tokenizers import Tokenizer
+# ------------------------------------------------------------------ #
+#  POST-PROCESSOR
+#  Runs after BPE encoding, appends <|endoftext|> to every sequence
+# ------------------------------------------------------------------ #
+def add_post_processor(tokenizer: Tokenizer) -> Tokenizer:
+    """
+    Attach a TemplateProcessing post-processor that appends
+    <|endoftext|> to every encoded sequence.
+    Has to run AFTER training: the template needs the ID that the
+    trained vocab assigned to <|endoftext|>.
+    Args:
+        tokenizer: a trained Tokenizer object
+    Returns:
+        The same tokenizer object, now with the post-processor set
+    Raises:
+        ValueError: if <|endoftext|> is not in the tokenizer's vocab
+    """
+    eot = "<|endoftext|>"
+    eot_id = tokenizer.token_to_id(eot)
+    if eot_id is None:
+        raise ValueError(
+            "<|endoftext|> not found in vocab. "
+            "Make sure the tokenizer is trained before adding post-processor."
+        )
+    # Template syntax:
+    #   $A / $B            -> first / second input sequence
+    #   <|endoftext|>:N    -> insert the special token; the ":N" suffix is the
+    #                         type (segment) id given to that position, not the
+    #                         vocab id (the vocab id comes from special_tokens)
+    #
+    # Resulting layouts:
+    #   single : [tokens...] <|endoftext|>
+    #   pair   : [tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>
+    template = TemplateProcessing(
+        single=f"$A {eot}:0",
+        pair=f"$A {eot}:0 $B:1 {eot}:0",
+        special_tokens=[(eot, eot_id)],
+    )
+    tokenizer.post_processor = template
+    print(f"Post-processor added: <|endoftext|> (ID: {eot_id}) appended to sequences")
+    return tokenizer
+# ------------------------------------------------------------------ #
+#  VERIFICATION
+# ------------------------------------------------------------------ #
+def verify_post_processor(tokenizer: Tokenizer):
+    """
+    Verifies the post-processor is working correctly.
+    Checks that <|endoftext|> is the last token of every single-sequence
+    encoding and that a pair encoding contains exactly two of them.
+    Prints a PASS/FAIL line per check.
+    Args:
+        tokenizer: a Tokenizer that already has the post-processor attached
+    Returns:
+        True when every check (single sequences AND the pair check) passed,
+        False otherwise
+    """
+    eot_token = "<|endoftext|>"
+    eot_id    = tokenizer.token_to_id(eot_token)
+    print("\n" + "="*60)
+    print("  POST-PROCESSOR VERIFICATION")
+    print("="*60 + "\n")
+    test_cases = [
+        # Single documents
+        "The mitochondria is the powerhouse of the cell.",
+        "CO2 levels rose by 1.5e-3 ppm.",
+        # Short edge cases
+        "Hi.",
+        "42",
+    ]
+    all_passed = True
+    for text in test_cases:
+        encoded = tokenizer.encode(text)
+        # An empty encoding cannot end in <|endoftext|>: report FAIL
+        # instead of raising IndexError on [-1]
+        last_token = encoded.tokens[-1] if encoded.tokens else None
+        last_id    = encoded.ids[-1] if encoded.ids else None
+        passed     = last_token == eot_token and last_id == eot_id
+        all_passed = all_passed and passed
+        status = "PASS" if passed else "FAIL"
+        print(f"[{status}] {repr(text)}")
+        print(f"       tokens : {encoded.tokens}")
+        print(f"       last   : {last_token!r} (ID: {last_id})")
+        print()
+    # Verify pair encoding
+    encoded_pair  = tokenizer.encode("question here", "answer here")
+    eot_positions = [
+        pos for pos, token_id in enumerate(encoded_pair.ids) if token_id == eot_id
+    ]
+    pair_passed = len(eot_positions) == 2
+    # The pair check counts towards the overall verdict as well
+    all_passed = all_passed and pair_passed
+    print("Pair encoding test:")
+    print(f"  tokens      : {encoded_pair.tokens}")
+    print(f"  eot positions: {eot_positions}")
+    print("  expected     : 2 eot tokens (one after each sequence)")
+    print(f"  [{'PASS' if pair_passed else 'FAIL'}]")
+    print(f"\nAll tests passed: {all_passed}")
+    return all_passed
+# ------------------------------------------------------------------ #
+#  HOW THIS FITS INTO THE FULL PIPELINE
+# ------------------------------------------------------------------ #
+# The correct order when building your full tokenizer:
+#
+#   1. build_tokenizer()       <- sets up model + pre-tokenizer + decoder
+#   2. train_from_iterator()   <- trains BPE, assigns real vocab IDs
+#   3. add_post_processor()    <- NOW we can add post-processor (needs real IDs)
+#   4. tokenizer.save()        <- saves everything including post-processor
+#
+# Loading later:
+#   tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
+#   <- post-processor is automatically restored, no extra steps
+if __name__ == "__main__":
+    import sys
+    # Usage: python post_processor.py [path/to/tokenizer.json]
+    # Without an argument the default file name below is used.
+    # The file is overwritten in place with the post-processor attached.
+    cli_args = sys.argv[1:]
+    path = cli_args[0] if cli_args else "fineweb_edu_tokenizer.json"
+    print(f"Loading tokenizer from: {path}")
+    tokenizer = add_post_processor(Tokenizer.from_file(path))
+    verify_post_processor(tokenizer)
+    # Save with post-processor included
+    tokenizer.save(path)
+    print(f"\nTokenizer re-saved with post-processor to: {path}")

tokenizer/pretokenizer.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import re
+from tokenizers.pre_tokenizers import PreTokenizer, Split
+from tokenizers import Regex
+#  Each category is defined separately so it's easy to understand, modify, or debug individually
+# 1. Contractions
+#    Matches: 's  't  're  've  'll  'm  'd
+#    Example: "don't" -> ["don", "'t"]
+CONTRACTIONS = r"'(?:s|t|re|ve|ll|m|d)"
+# 2. Abbreviations
+#    Matches: single letters separated by dots, optional trailing dot
+#    Example: "U.S.A" -> ["U.S.A"]
+#             "e.g."  -> ["e.g."]
+#    Every segment is exactly ONE letter, so "Ph.D" is not covered:
+#             "Ph.D"  -> ["Ph", ".", "D"]
+#    \b = word boundary, ensures we don't start matching inside a word
+ABBREVIATIONS = r"\b[A-Za-z](?:\.[A-Za-z])+\.?"
+# 3. Scientific Notation
+#    Matches: number, optional decimal, e/E, optional sign, exponent
+#    Example: "1.5e-3"  -> ["1.5e-3"]
+#             "3e10"    -> ["3e10"]
+#             "2.0E+4"  -> ["2.0E+4"]
+#    Must come BEFORE decimals otherwise "1.5" in "1.5e-3" matches first
+SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"
+# 4. Decimal Numbers
+#    Matches: digits, dot, digits
+#    Example: "3.14"  -> ["3.14"]
+#             "0.001" -> ["0.001"]
+#    Must come BEFORE integers otherwise "3" in "3.14" matches first
+DECIMALS = r"\d+\.\d+"
+# 5. Integers
+#    Matches: any sequence of digits
+#    Example: "42"   -> ["42"]
+#             "1984" -> ["1984"]
+#    Comes last among numbers since scientific and decimal match first
+INTEGERS = r"\d+"
+# 6. Multi-character Operators
+#    Matches: common programming operators that are 2 characters
+#    Example: "==" -> ["=="]   "!=" -> ["!="]
+#             "->" -> ["->"]   "+=" -> ["+="]
+#    Must come BEFORE single punctuation catch-all
+#    [-+*/]= matches +=, -=, *=, /= in one pattern
+OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="
+# 7. Snake Case Identifiers
+#    Matches: ASCII identifiers that contain AT LEAST ONE underscore
+#    Example: "snake_case"  -> ["snake_case"]
+#             "var_name_2"  -> ["var_name_2"]
+#             "_private"    -> ["_private"]
+#    The underscore is mandatory. The earlier pattern [A-Za-z_][A-Za-z0-9_]*
+#    matched every ASCII word as well; because this alternative is tried
+#    before WORDS, it cut words at their first non-ASCII letter
+#    ("café" -> ["caf", "é"]).
+#    Must come BEFORE regular words otherwise "snake" matches first
+SNAKE_CASE = r"(?:[A-Za-z][A-Za-z0-9]*)?_[A-Za-z0-9_]*"
+# 8. Regular Unicode Words
+#    Matches: any sequence of word characters (letters, digits, underscore)
+#    \w+ in unicode mode covers non-english letters too
+#    Example: "hello" -> ["hello"]
+#             "café"  -> ["café"]
+WORDS = r"\w+"
+# 9. Whitespace
+#    Newlines are matched separately from spaces/tabs
+#    This preserves document structure (paragraph breaks etc.)
+#    Example: "\n\n" -> ["\n\n"]  "   " -> ["   "]
+WHITESPACE = r"\n+|[ \t]+"
+# 10. Punctuation Catch-all
+#     Matches any single non-whitespace character that nothing above caught
+#     Example: "!" -> ["!"]  "@" -> ["@"]  "." -> ["."]
+PUNCTUATION = r"[^\s]"
+# ------------------------------------------------------------------ #
+#  Combine all patterns in ORDER - first match wins
+# ------------------------------------------------------------------ #
+PRETOKENIZER_PATTERN = "|".join([
+    CONTRACTIONS,   # 1 - most specific first
+    ABBREVIATIONS,  # 2 - before plain words
+    SCIENTIFIC,     # 3 - before decimals
+    DECIMALS,       # 4 - before integers
+    INTEGERS,       # 5
+    OPERATORS,      # 6 - before single punctuation
+    SNAKE_CASE,     # 7 - before plain words
+    WORDS,          # 8
+    WHITESPACE,     # 9
+    PUNCTUATION,    # 10 - catch everything else
+])
+def get_pretokenizer():
+    """
+    Build the Split pre-tokenizer driven by PRETOKENIZER_PATTERN.
+    Split options used here:
+    - pattern  : the combined regex, wrapped in tokenizers.Regex
+    - behavior : "isolated" keeps every piece as its own token
+                 (alternatives: "removed", "merged_with_previous",
+                 "merged_with_next")
+    - invert   : True, so the regex describes the pieces to KEEP rather
+                 than the delimiters to split on
+    Whitespace, operators and punctuation therefore survive as tokens
+    instead of being discarded.
+    """
+    token_regex = Regex(PRETOKENIZER_PATTERN)
+    return Split(pattern=token_regex, behavior="isolated", invert=True)
+# ------------------------------------------------------------------ #
+#  Quick test - run this file directly to verify behavior
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    from tokenizers import Tokenizer
+    from tokenizers.models import BPE
+    # A throwaway tokenizer is enough; only its pre-tokenizer is exercised
+    tokenizer = Tokenizer(BPE())
+    tokenizer.pre_tokenizer = get_pretokenizer()
+    # (label, input text) pairs: one per pattern category plus two mixed samples
+    test_cases = [
+        ("Contractions",        "don't she'll they've"),
+        ("Abbreviations",       "U.S.A has a Ph.D e.g. this"),
+        ("Scientific",          "the value is 1.5e-3 and 2.0E+4"),
+        ("Decimals",            "pi is 3.14159 and e is 2.718"),
+        ("Integers",            "there are 1000 students in 2024"),
+        ("Operators",           "if x==0 or y!=1 then z+=2"),
+        ("Snake case",          "my_variable and snake_case_name"),
+        ("Real world",          "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
+        ("Code-like",           "def my_func(x):\n    return x**2 + 1"),
+    ]
+    rule = "=" * 60
+    print(f"\n{rule}")
+    print("  PRE-TOKENIZER TEST")
+    print(f"{rule}\n")
+    for label, text in test_cases:
+        pieces = tokenizer.pre_tokenizer.pre_tokenize_str(text)
+        print(f"[{label}]")
+        print(f"  Input  : {repr(text)}")
+        # each piece is a (string, offsets) tuple; show only the string part
+        print(f"  Tokens : {[piece[0] for piece in pieces]}")
+        print()

tokenizer/tempCodeRunnerFile.py ADDED Viewed

	@@ -0,0 +1,5 @@

+    with open(os.path.join(save_dir, "special_tokens_map.json"), "w") as f:
+        json.dump(special_tokens_map, f, indent=2)
+    print("special_tokens_map.json written manually")

tokenizer/tokenize_dataset.py ADDED Viewed

	@@ -0,0 +1,389 @@

+"""
+tokenize_dataset.py — Parallel tokenization pipeline
+Architecture:
+    Main thread   : stream HF dataset → filter → normalize → batch texts
+    Worker pool   : N_WORKERS processes, each with own loaded tokenizer,
+                    tokenize batches concurrently using ProcessPoolExecutor
+    Main thread   : collect results IN ORDER → route train/val → flush shards
+Why this is faster:
+    Old code:  stream → [normalize] → [tokenize 1000 docs, 1 CPU] → write
+    New code:  stream → [normalize] → [tokenize 1000 docs × N cores] → write
+    On 12-core machine: expect 6-10× speedup on tokenization step.
+    Bottleneck shifts to HF streaming bandwidth, not CPU.
+Notes:
+    - Workers are initialized ONCE with the tokenizer loaded (no repeated disk reads)
+    - Results collected in SUBMISSION ORDER so train/val routing is deterministic
+    - Sliding window of MAX_PENDING futures keeps all cores busy without
+      unbounded memory growth
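+    - NOT Ctrl+C safe yet: tokenize_dataset() has no KeyboardInterrupt handling, so
+      an interrupted run loses whatever is still held in the in-memory buffers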
+"""
+import os
+import sys
+import time
+import warnings
+import numpy as np
+from collections import deque
+from concurrent.futures import ProcessPoolExecutor
+from datasets import load_dataset
+from transformers import PreTrainedTokenizerFast, logging as hf_logging
+from tqdm import tqdm
+# Import normalizer from same directory
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from normalizer import normalization
+hf_logging.set_verbosity_error()
+warnings.filterwarnings("ignore")
+# ------------------------------------------------------------------ #
+#  CONSTANTS
+# ------------------------------------------------------------------ #
+DATASET_NAME     = "HuggingFaceFW/fineweb-edu"
+DATASET_SUBSET   = "CC-MAIN-2014-49"
+SCRIPT_DIR       = os.path.dirname(os.path.abspath(__file__))
+TOKENIZER_DIR    = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")
+DATA_DIR         = os.path.join(SCRIPT_DIR, "data")
+MIN_QUALITY      = 3
+SHARD_SIZE       = 100_000_000          # tokens per shard (~190 MB at uint16)
+BATCH_SIZE       = 2_000                # docs per tokenization task (↑ from 1000)
+VAL_RATIO        = 100                  # every 100th accepted doc → val
+SHUFFLE_BUFFER   = 10_000
+MIN_DOC_LENGTH   = 100
+DTYPE            = np.uint16
+MAX_TOKENS       = 3_200_000_000
+# Parallel workers: leave 2 cores for OS + HF streaming.
+# os.cpu_count() returns None when the CPU count cannot be determined;
+# treat that as a single CPU instead of failing with a TypeError at import time.
+N_WORKERS        = max(1, (os.cpu_count() or 1) - 2)
+# How many tokenization futures to keep in-flight at once
+# = N_WORKERS × 2 keeps the pipeline full without excess memory
+MAX_PENDING      = N_WORKERS * 2
+# ------------------------------------------------------------------ #
+#  WORKER PROCESS — loaded once per process at startup
+# ------------------------------------------------------------------ #
+# Module-level tokenizer in each worker process
+# Stays None until _worker_init() has run in that process
+_worker_tokenizer = None
+def _worker_init(tokenizer_dir: str):
+    """
+    Called ONCE per worker process at startup.
+    Loads the tokenizer into the worker's global state.
+    Subsequent calls to _tokenize_worker_fn reuse this loaded tokenizer.
+    Passed as `initializer` to ProcessPoolExecutor in tokenize_dataset().
+    Args:
+        tokenizer_dir : directory handed to PreTrainedTokenizerFast.from_pretrained
+    """
+    global _worker_tokenizer
+    # Imported and configured again inside the worker so the logging and
+    # warning filters are applied in this process as well
+    import warnings
+    from transformers import PreTrainedTokenizerFast, logging as hf_log
+    hf_log.set_verbosity_error()
+    warnings.filterwarnings("ignore")
+    _worker_tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
+def _tokenize_worker_fn(texts: list) -> list:
+    """
+    Encode one batch of already-normalized documents inside a worker process.
+    Uses the tokenizer that _worker_init stored in this process's
+    module-level _worker_tokenizer.
+    Args:
+        texts : list of normalized strings (already filtered, normalized)
+    Returns:
+        list of list[int]  — the "input_ids" of the batch, one list per document
+    NOTE(review): add_special_tokens=True is expected to append <|endoftext|>
+    through the tokenizer's post-processor; confirm the saved tokenizer has one.
+    """
+    encode_options = {
+        "add_special_tokens": True,        # let the tokenizer add its special tokens
+        "truncation": False,               # keep full document
+        "padding": False,                  # no padding (we pack shards)
+        "return_attention_mask": False,    # not needed
+    }
+    batch_encoding = _worker_tokenizer(texts, **encode_options)
+    return batch_encoding["input_ids"]
+# ------------------------------------------------------------------ #
+#  SHARD HELPERS
+# ------------------------------------------------------------------ #
+def get_shard_path(split: str, shard_idx: int) -> str:
+    """Return the path of shard `shard_idx` of `split` inside DATA_DIR (e.g. train_007.bin)."""
+    file_name = f"{split}_{shard_idx:03d}.bin"
+    return os.path.join(DATA_DIR, file_name)
+def save_shard(tokens: list, split: str, shard_idx: int):
+    """
+    Write `tokens` to the shard file of (split, shard_idx) as a flat DTYPE array
+    and report the token count and size through tqdm.write.
+    """
+    shard = np.array(tokens, dtype=DTYPE)
+    shard.tofile(get_shard_path(split, shard_idx))
+    megabytes = shard.nbytes / 1024 / 1024
+    tqdm.write(f"  saved {split}_{shard_idx:03d}.bin | {len(tokens):,} tokens | {megabytes:.1f} MB")
+# ------------------------------------------------------------------ #
+#  ROUTE BATCH RESULTS → train / val buffers
+# ------------------------------------------------------------------ #
+def route_results(
+    all_ids        : list,
+    doc_count_start: int,
+    train_buffer   : list,
+    val_buffer     : list,
+    train_tokens   : int,
+    val_tokens     : int,
+    total_tokens   : int,
+) -> tuple:
+    """
+    Append each tokenized document to the train or the val buffer.
+    The document at position i of all_ids has the global index
+    doc_count_start + i; indices divisible by VAL_RATIO go to val, all
+    others to train. Both buffers are extended in place.
+    Returns:
+        (train_buffer, val_buffer, train_tokens, val_tokens, total_tokens,
+         batch_tok_count) with the counters advanced by this batch;
+        batch_tok_count is the number of tokens in this batch alone.
+    """
+    batch_tok_count = 0
+    for offset, doc_ids in enumerate(all_ids):
+        n_ids = len(doc_ids)
+        goes_to_val = (doc_count_start + offset) % VAL_RATIO == 0
+        if goes_to_val:
+            val_buffer.extend(doc_ids)
+            val_tokens += n_ids
+        else:
+            train_buffer.extend(doc_ids)
+            train_tokens += n_ids
+        total_tokens += n_ids
+        batch_tok_count += n_ids
+    return train_buffer, val_buffer, train_tokens, val_tokens, total_tokens, batch_tok_count
+# ------------------------------------------------------------------ #
+#  MAIN PARALLEL TOKENIZATION PIPELINE
+# ------------------------------------------------------------------ #
+def tokenize_dataset():
+    """
+    Stream the dataset, filter and normalize documents, tokenize them in a
+    process pool and write fixed-size token shards into DATA_DIR.
+    Flow:
+        1. iterate the shuffled streaming dataset; skip documents whose
+           int_score is below MIN_QUALITY or whose text is shorter than
+           MIN_DOC_LENGTH before or after normalization()
+        2. collect BATCH_SIZE texts, submit them to the pool, remember the
+           global index of the batch's first document
+        3. consume futures oldest-first (drain_one) so routing by document
+           index is deterministic; full SHARD_SIZE shards are written as
+           soon as a buffer is large enough
+        4. after the loop, drain what is left and write the partial shards
+    NOTE(review): the MAX_TOKENS check only runs right after a batch has been
+    submitted, and total_tokens only grows inside drain_one(); futures still
+    pending when the cap is hit are drained afterwards, so the final token
+    count can exceed MAX_TOKENS.
+    NOTE(review): there is no KeyboardInterrupt handling here; an interrupt
+    propagates out of the executor block and the partial-shard flush at the
+    end is not reached.
+    """
+    os.makedirs(DATA_DIR, exist_ok=True)
+    print(f"Loading tokenizer from: {TOKENIZER_DIR}")
+    print(f"  workers      : {N_WORKERS} of {os.cpu_count()} CPUs")
+    print(f"\nLoading dataset stream: {DATASET_NAME} / {DATASET_SUBSET}")
+    ds = load_dataset(
+        DATASET_NAME,
+        name         = DATASET_SUBSET,
+        split        = "train",
+        streaming    = True,
+    ).shuffle(buffer_size=SHUFFLE_BUFFER, seed=42)
+    # ---- State ------------------------------------------------------ #
+    train_buffer  = []
+    val_buffer    = []
+    train_shard   = 0
+    val_shard     = 0
+    total_docs    = 0
+    skipped_docs  = 0
+    total_tokens  = 0
+    train_tokens  = 0
+    val_tokens    = 0
+    batch_texts   = []          # accumulating next batch to submit
+    # NOTE(review): batch_doc_start is assigned here but not read anywhere below
+    batch_doc_start = 0         # doc index at start of current batch_texts
+    # pending: deque of (future, doc_count_start)
+    # We always pop from the LEFT (oldest submission) to preserve order
+    pending       = deque()
+    cap_reached   = False
+    # ---- Progress bars ----------------------------------------------- #
+    token_bar = tqdm(
+        total=MAX_TOKENS,
+        desc="tokens",
+        unit="tok",
+        unit_scale=True,
+        unit_divisor=1000,
+        colour="green",
+        position=0,
+    )
+    doc_bar = tqdm(
+        desc="docs  ",
+        unit="doc",
+        unit_scale=True,
+        colour="blue",
+        position=1,
+    )
+    # NOTE(review): t_start is assigned here but not read anywhere below
+    t_start = time.time()
+    # ------------------------------------------------------------------ #
+    #  DRAIN HELPER — collect the oldest pending future and process it
+    # ------------------------------------------------------------------ #
+    def drain_one():
+        # Returns False when nothing is pending, True after one future was handled
+        nonlocal train_buffer, val_buffer, train_shard, val_shard
+        nonlocal total_tokens, train_tokens, val_tokens
+        if not pending:
+            return False
+        future, doc_start = pending.popleft()
+        all_ids           = future.result()          # blocks until this task done
+        (train_buffer, val_buffer,
+         train_tokens, val_tokens,
+         total_tokens, batch_tok) = route_results(
+            all_ids, doc_start,
+            train_buffer, val_buffer,
+            train_tokens, val_tokens, total_tokens,
+        )
+        token_bar.update(batch_tok)
+        token_bar.set_postfix({
+            "train": f"{train_tokens/1e9:.2f}B",
+            "val"  : f"{val_tokens/1e6:.0f}M",
+            "shards": train_shard,
+        })
+        # Flush train shards
+        # (each flush slices the Python list twice, i.e. copies the buffer)
+        while len(train_buffer) >= SHARD_SIZE:
+            save_shard(train_buffer[:SHARD_SIZE], "train", train_shard)
+            train_buffer = train_buffer[SHARD_SIZE:]
+            train_shard += 1
+        # Flush val shards
+        while len(val_buffer) >= SHARD_SIZE:
+            save_shard(val_buffer[:SHARD_SIZE], "val", val_shard)
+            val_buffer = val_buffer[SHARD_SIZE:]
+            val_shard += 1
+        return True
+    # ------------------------------------------------------------------ #
+    #  MAIN LOOP with ProcessPoolExecutor
+    # ------------------------------------------------------------------ #
+    print(f"\nStarting tokenization...")
+    print(f"  token target : {MAX_TOKENS:,}")
+    print(f"  shard size   : {SHARD_SIZE:,} tokens")
+    print(f"  batch size   : {BATCH_SIZE} docs")
+    print(f"  val ratio    : every {VAL_RATIO}th doc")
+    print(f"  quality      : int_score >= {MIN_QUALITY}\n")
+    with ProcessPoolExecutor(
+        max_workers  = N_WORKERS,
+        initializer  = _worker_init,
+        initargs     = (TOKENIZER_DIR,),
+    ) as executor:
+        for doc in ds:
+            # ---- Quality filter ------------------------------------ #
+            if doc["int_score"] < MIN_QUALITY:
+                skipped_docs += 1
+                doc_bar.set_postfix({"skipped": skipped_docs})
+                continue
+            # ---- Length + normalize -------------------------------- #
+            text = doc["text"]
+            if len(text) < MIN_DOC_LENGTH:
+                skipped_docs += 1
+                doc_bar.set_postfix({"skipped": skipped_docs})
+                continue
+            text = normalization(text)
+            # Checked again because normalization can shorten the text
+            if len(text) < MIN_DOC_LENGTH:
+                skipped_docs += 1
+                doc_bar.set_postfix({"skipped": skipped_docs})
+                continue
+            batch_texts.append(text)
+            total_docs += 1
+            doc_bar.update(1)
+            # ---- Submit batch when full ---------------------------- #
+            if len(batch_texts) == BATCH_SIZE:
+                # Record which doc index this batch starts at
+                doc_start = total_docs - BATCH_SIZE
+                future = executor.submit(_tokenize_worker_fn, batch_texts)
+                pending.append((future, doc_start))
+                # Rebind instead of clearing: the submitted list must stay intact
+                batch_texts = []
+                # ---- Backpressure: drain oldest if queue full ------- #
+                # This prevents unbounded memory accumulation
+                # while keeping all N_WORKERS busy
+                while len(pending) >= MAX_PENDING:
+                    drain_one()
+                # ---- Check token cap -------------------------------- #
+                if total_tokens >= MAX_TOKENS:
+                    tqdm.write(f"\nToken cap reached: {total_tokens:,} tokens from {total_docs:,} docs")
+                    cap_reached = True
+                    break
+        # ---- Submit any remaining partial batch -------------------- #
+        # Only when the stream ran out; after hitting the cap the leftover texts are dropped
+        if batch_texts and not cap_reached:
+            doc_start = total_docs - len(batch_texts)
+            future    = executor.submit(_tokenize_worker_fn, batch_texts)
+            pending.append((future, doc_start))
+        # ---- Drain all remaining pending futures ------------------- #
+        while pending:
+            drain_one()
+    # ---- Close progress bars --------------------------------------- #
+    token_bar.close()
+    doc_bar.close()
+    # ---- Save remaining partial shards ----------------------------- #
+    if train_buffer:
+        save_shard(train_buffer, "train", train_shard)
+        train_shard += 1
+    if val_buffer:
+        save_shard(val_buffer, "val", val_shard)
+        val_shard += 1
+    # ---- Final summary --------------------------------------------- #
+    print(f"\n{'='*60}")
+    print(f"  TOKENIZATION COMPLETE")
+    print(f"{'='*60}")
+    print(f"  total docs     : {total_docs:,}")
+    print(f"  skipped docs   : {skipped_docs:,}")
+    print(f"  total tokens   : {total_tokens:,}")
+    print(f"  train tokens   : {train_tokens:,}")
+    print(f"  val tokens     : {val_tokens:,}")
+    print(f"  train shards   : {train_shard}")
+    print(f"  val shards     : {val_shard}")
+    print(f"  data dir       : {os.path.abspath(DATA_DIR)}")
+# ------------------------------------------------------------------ #
+#  LOAD SHARDS DURING TRAINING (unchanged)
+# ------------------------------------------------------------------ #
+def load_shard(split: str, shard_idx: int) -> np.ndarray:
+    """
+    Loads a shard as a memory-mapped numpy array.
+    The full shard never loads into RAM at once.
+    Usage during training:
+        shard = load_shard("train", 0)
+        chunk = shard[i : i + 1024]
+    """
+    path = get_shard_path(split, shard_idx)
+    return np.memmap(path, dtype=DTYPE, mode="r")
+# ------------------------------------------------------------------ #
+#  ENTRY POINT
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    # Keep the call behind the __main__ guard. Windows requires this for
+    # multiprocessing with the spawn start method, where child processes
+    # import this module again.
+    tokenize_dataset()

tokenizer/traintokenizer.py ADDED Viewed

	@@ -0,0 +1,207 @@

+from datasets import load_dataset
+from tokenizers import Tokenizer
+# Import our components
+from normalizer import normalization          # our normalize function
+from bpe import build_tokenizer, build_trainer, get_special_token_ids
+from post_processor import add_post_processor
+# ------------------------------------------------------------------ #
+#  CONSTANTS
+# ------------------------------------------------------------------ #
+# Hub repository and the config name handed to load_dataset(name=...).
+DATASET_NAME    = "HuggingFaceFW/fineweb-edu"
+DATASET_SUBSET  = "CC-MAIN-2014-49"
+MIN_QUALITY     = 3          # keep only docs with int_score >= 3
+MAX_TOKENS      = 25_000_000 # token budget, summed from each doc's token_count field
+                             # ~100M characters at roughly 4-5 chars per token,
+                             # enough for BPE training
+MIN_DOC_LENGTH  = 100        # minimum length in characters; shorter docs are likely boilerplate
+import os
+# Output location sits next to this script; train_tokenizer() appends ".json".
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+SAVE_PATH  = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")
+# ------------------------------------------------------------------ #
+#  DATA GENERATOR
+# ------------------------------------------------------------------ #
+def fineweb_edu_iterator(
+    max_tokens: int = MAX_TOKENS,
+    min_quality: int = MIN_QUALITY,
+    min_length: int = MIN_DOC_LENGTH,
+):
+    """
+    Streams FineWeb-Edu documents, filters by quality,
+    normalizes text, and yields clean strings for BPE training.
+    Args:
+        max_tokens  : stop after consuming this many tokens total
+        min_quality : only yield docs with int_score >= this value
+        min_length  : skip docs shorter than this many characters
+    Yields:
+        str: normalized, clean document text
+    """
+    print(f"Loading dataset stream: {DATASET_NAME} / {DATASET_SUBSET}")
+    ds = load_dataset(
+        DATASET_NAME,
+        name=DATASET_SUBSET,
+        split="train",
+        streaming=True,
+    )
+    tokens_seen   = 0   # running total of tokens consumed
+    docs_yielded  = 0   # how many docs passed all filters
+    docs_skipped  = 0   # how many docs were filtered out
+    for doc in ds:
+        # ---- Stop condition ----------------------------------------
+        if tokens_seen >= max_tokens:
+            break
+        # ---- Quality filter ----------------------------------------
+        # int_score is 0-5, we want educational quality >= 3
+        if doc["int_score"] < min_quality:
+            docs_skipped += 1
+            continue
+        # ---- Extract and normalize ---------------------------------
+        text = doc["text"]
+        # Skip very short documents before normalization
+        # (saves compute on boilerplate/empty docs)
+        if len(text) < min_length:
+            docs_skipped += 1
+            continue
+        # Run our normalization pipeline
+        text = normalization(text)
+        # Skip if normalization made it too short
+        # (e.g. doc was mostly HTML tags or control chars)
+        if len(text) < min_length:
+            docs_skipped += 1
+            continue
+        # ---- Track progress ----------------------------------------
+        tokens_seen  += doc["token_count"]
+        docs_yielded += 1
+        # Log progress every 100k documents
+        if docs_yielded % 100_000 == 0:
+            print(
+                f"  docs yielded: {docs_yielded:,} | "
+                f"docs skipped: {docs_skipped:,} | "
+                f"tokens seen: {tokens_seen:,} / {max_tokens:,} "
+                f"({100 * tokens_seen / max_tokens:.1f}%)"
+            )
+        yield text
+    # Final stats
+    print(f"\nStream complete:")
+    print(f"  docs yielded : {docs_yielded:,}")
+    print(f"  docs skipped : {docs_skipped:,}")
+    print(f"  tokens seen  : {tokens_seen:,}")
+# ------------------------------------------------------------------ #
+#  TRAINING
+# ------------------------------------------------------------------ #
+def train_tokenizer() -> Tokenizer:
+    """
+    Builds, trains, and saves the tokenizer.
+    Returns:
+        Trained Tokenizer object
+    """
+    # Build untrained tokenizer and trainer
+    tokenizer = build_tokenizer()
+    trainer   = build_trainer()
+    print("\nStarting BPE training...")
+    print(f"  vocab size    : {trainer.vocab_size:,}")
+    print(f"  min frequency : {trainer.min_frequency}")
+    print(f"  quality filter: int_score >= {MIN_QUALITY}")
+    print(f"  max tokens    : {MAX_TOKENS:,}\n")
+    # train_from_iterator expects an iterable of strings
+    # our generator yields one clean document string at a time
+    tokenizer.train_from_iterator(
+        iterator=fineweb_edu_iterator(),
+        trainer=trainer,
+        length=MAX_TOKENS,   # optional hint for progress bar accuracy
+    )
+    print("\nTraining complete.")
+    tokenizer = add_post_processor(tokenizer)
+    # Print special token IDs
+    ids = get_special_token_ids(tokenizer)
+    print(f"\nSpecial token IDs:")
+    for token, token_id in ids.items():
+        print(f"  {token} -> {token_id}")
+    # Save tokenizer to disk
+    tokenizer.save(f"{SAVE_PATH}.json")
+    print(f"\nTokenizer saved to: {SAVE_PATH}.json")
+    return tokenizer
+# ------------------------------------------------------------------ #
+#  QUICK VERIFICATION after training
+# ------------------------------------------------------------------ #
+def verify_tokenizer(tokenizer: Tokenizer):
+    """
+    Runs a few quick checks after training to verify correctness.
+    """
+    print("\n" + "="*60)
+    print("  TOKENIZER VERIFICATION")
+    print("="*60 + "\n")
+    test_cases = [
+        "The mitochondria is the powerhouse of the cell.",
+        "CO2 levels rose by 1.5e-3 ppm in 2024.",
+        "def compute_loss(y_pred, y_true):\n    return (y_pred - y_true)**2",
+        "U.S.A has a Ph.D program e.g. at MIT.",
+        "don't they've she'll",
+        "∇f(x) = 0 is a necessary condition.",   # tests byte fallback
+    ]
+    for text in test_cases:
+        encoded  = tokenizer.encode(text)
+        decoded  = tokenizer.decode(encoded.ids)
+        n_tokens = len(encoded.ids)
+        print(f"Input   : {repr(text)}")
+        print(f"Tokens  : {encoded.tokens}")
+        print(f"IDs     : {encoded.ids}")
+        print(f"N tokens: {n_tokens}")
+        print(f"Decoded : {repr(decoded)}")
+        print(f"Lossless: {text == decoded}")
+        print()
+    # Verify vocab size
+    vocab_size = tokenizer.get_vocab_size()
+    print(f"Final vocab size: {vocab_size:,}")
+    # Verify endoftext token exists
+    eot_id = tokenizer.token_to_id("<|endoftext|>")
+    print(f"<|endoftext|> ID: {eot_id}")
+# ------------------------------------------------------------------ #
+#  ENTRY POINT
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    # Train and save the tokenizer, then print the round-trip report for it.
+    tokenizer = train_tokenizer()
+    verify_tokenizer(tokenizer)

tokenizer/wrap_tokenizer.py ADDED Viewed

	@@ -0,0 +1,232 @@

+from tokenizers import Tokenizer
+from transformers import PreTrainedTokenizerFast
+import json
+import os
+# ------------------------------------------------------------------ #
+#  CONSTANTS
+# ------------------------------------------------------------------ #
+import os
+# All paths are resolved relative to this script's own directory.
+SCRIPT_DIR        = os.path.dirname(os.path.abspath(__file__))
+# Input: the trained tokenizer file loaded by wrap_tokenizer()
+TOKENIZER_PATH    = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer.json")
+SAVE_DIR          = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")         # output folder for save_pretrained()
+MODEL_MAX_LENGTH  = 1024                            # context length, passed as model_max_length
+PADDING_SIDE      = "right"                         # causal LM standard
+# ------------------------------------------------------------------ #
+#  WRAP
+# ------------------------------------------------------------------ #
+def wrap_tokenizer(
+    tokenizer_path: str = TOKENIZER_PATH,
+    save_dir: str       = SAVE_DIR,
+) -> PreTrainedTokenizerFast:
+    """
+    Wraps a trained HuggingFace Tokenizer as a PreTrainedTokenizerFast.
+    This gives us:
+    - datasets.map() compatibility for bulk tokenization
+    - HuggingFace Trainer + DataCollator compatibility
+    - Automatic padding, truncation, attention masks
+    - from_pretrained() loading support
+    - return_tensors="pt" for PyTorch tensors
+    Args:
+        tokenizer_path : path to trained tokenizer .json file
+        save_dir       : folder to save the wrapped tokenizer
+    Returns:
+        PreTrainedTokenizerFast ready for training
+    """
+    print(f"Loading trained tokenizer from: {tokenizer_path}")
+    base_tokenizer = Tokenizer.from_file(tokenizer_path)
+    # ---- Wrap --------------------------------------------------------
+    # We map <|endoftext|> to all three roles:
+    #
+    #   eos_token  - end of sequence marker, used during generation
+    #                to know when to stop
+    #
+    #   bos_token  - beginning of sequence, GPT-2 style uses eos
+    #                for both since there is no separate BOS token
+    #
+    #   pad_token  - safe to reuse eos here because we are packing
+    #                sequences and will never actually pad during
+    #                pretraining. Defined so HuggingFace doesn't
+    #                complain about missing pad token
+    #
+    #   unk_token  - None because byte-level means no unknowns ever
+    tokenizer = PreTrainedTokenizerFast(
+        tokenizer_object=base_tokenizer,
+        # Special token mappings
+        eos_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        unk_token=None,
+        # Context length
+        model_max_length=MODEL_MAX_LENGTH,
+        # Padding behavior
+        padding_side=PADDING_SIDE,
+        # Truncation side - truncate from the right
+        # (keep the beginning of the sequence, drop the end)
+        truncation_side="right",
+    )
+    tokenizer.add_special_tokens({
+    "eos_token": "<|endoftext|>",
+    "bos_token": "<|endoftext|>",
+    "pad_token": "<|endoftext|>",
+})
+    special_tokens_map = {
+    "bos_token": "<|endoftext|>",
+    "eos_token": "<|endoftext|>",
+    "pad_token": "<|endoftext|>",
+}
+    os.makedirs(save_dir, exist_ok=True)
+    with open(os.path.join(save_dir, "special_tokens_map.json"), "w") as f:
+        json.dump(special_tokens_map, f, indent=2)
+    print("special_tokens_map.json written manually")
+    # ---- Save --------------------------------------------------------
+    # Saves three files to save_dir/:
+    #   tokenizer.json          - the trained BPE tokenizer
+    #   tokenizer_config.json   - max length, pad token, special tokens
+    #   special_tokens_map.json - maps eos/bos/pad to actual tokens
+    tokenizer.save_pretrained(save_dir)
+    print(f"Tokenizer saved to: {save_dir}/")
+    print(f"  tokenizer.json")
+    print(f"  tokenizer_config.json")
+    print(f"  special_tokens_map.json")
+    return tokenizer
+# ------------------------------------------------------------------ #
+#  VERIFICATION
+# ------------------------------------------------------------------ #
+def verify_wrapped_tokenizer(tokenizer: PreTrainedTokenizerFast):
+    """
+    Verifies the wrapped tokenizer behaves correctly.
+    Tests encoding, decoding, padding, truncation and batch encoding.
+    """
+    print("\n" + "="*60)
+    print("  WRAPPED TOKENIZER VERIFICATION")
+    print("="*60 + "\n")
+    eot_id = tokenizer.eos_token_id
+    # ---- 1. Basic config -----------------------------------------
+    print("Config:")
+    print(f"  vocab size       : {tokenizer.vocab_size:,}")
+    print(f"  model_max_length : {tokenizer.model_max_length}")
+    print(f"  padding_side     : {tokenizer.padding_side}")
+    print(f"  eos_token        : {tokenizer.eos_token!r} (ID: {eot_id})")
+    print(f"  bos_token        : {tokenizer.bos_token!r}")
+    print(f"  pad_token        : {tokenizer.pad_token!r} (ID: {tokenizer.pad_token_id})")
+    print(f"  unk_token        : {tokenizer.unk_token!r}")
+    print()
+    # ---- 2. Basic encode/decode ----------------------------------
+    text = "The mitochondria is the powerhouse of the cell."
+    encoded = tokenizer(text)
+    decoded = tokenizer.decode(encoded["input_ids"])
+    print("Basic encode/decode:")
+    print(f"  input    : {repr(text)}")
+    print(f"  input_ids: {encoded['input_ids']}")
+    print(f"  decoded  : {repr(decoded)}")
+    print()
+    # ---- 3. Padding ----------------------------------------------
+    # Batch of two sequences with different lengths
+    # shorter one should be right-padded to match the longer
+    batch = [
+        "Short sentence.",
+        "This is a much longer sentence that has more tokens in it.",
+    ]
+    encoded_batch = tokenizer(
+        batch,
+        padding=True,        # pad to longest in batch
+        return_tensors="pt", # return PyTorch tensors
+    )
+    print("Batch padding (right padding):")
+    print(f"  input_ids shape      : {encoded_batch['input_ids'].shape}")
+    print(f"  attention_mask shape : {encoded_batch['attention_mask'].shape}")
+    print(f"  input_ids[0]         : {encoded_batch['input_ids'][0].tolist()}")
+    print(f"  input_ids[1]         : {encoded_batch['input_ids'][1].tolist()}")
+    print(f"  attention_mask[0]    : {encoded_batch['attention_mask'][0].tolist()}")
+    print()
+    # ---- 4. Truncation -------------------------------------------
+    # Sequence longer than model_max_length should be truncated
+    long_text = "word " * 2000   # 2000 words >> 1024 tokens
+    encoded_long = tokenizer(
+        long_text,
+        truncation=True,
+        max_length=MODEL_MAX_LENGTH,
+    )
+    print("Truncation:")
+    print(f"  input length : {len(long_text.split())} words")
+    print(f"  token count  : {len(encoded_long['input_ids'])} (max: {MODEL_MAX_LENGTH})")
+    print(f"  truncated    : {len(encoded_long['input_ids']) <= MODEL_MAX_LENGTH}")
+    print()
+    # ---- 5. Load from disk and verify ----------------------------
+    print("Loading from disk:")
+    reloaded = PreTrainedTokenizerFast.from_pretrained(SAVE_DIR)
+    reloaded_ids = reloaded(text)["input_ids"]
+    original_ids = encoded["input_ids"]
+    match = reloaded_ids == original_ids
+    print(f"  from_pretrained() : OK")
+    print(f"  IDs match original: {match}")
+# ------------------------------------------------------------------ #
+#  ENTRY POINT
+# ------------------------------------------------------------------ #
+if __name__ == "__main__":
+    # Wrap and save the trained tokenizer, then print the verification report.
+    tokenizer = wrap_tokenizer()
+    verify_wrapped_tokenizer(tokenizer)
+    # Print a copy-pasteable usage cheat sheet (the text below is emitted verbatim).
+    print("\n" + "="*60)
+    print("  USAGE EXAMPLES")
+    print("="*60)
+    print("""
+# Load anywhere with one line
+from transformers import PreTrainedTokenizerFast
+tokenizer = PreTrainedTokenizerFast.from_pretrained("fineweb_edu_tokenizer")
+# Single encode
+ids = tokenizer("Hello world")["input_ids"]
+# Batch encode with padding and tensors
+batch = tokenizer(
+    ["sentence one", "sentence two"],
+    padding=True,
+    truncation=True,
+    max_length=1024,
+    return_tensors="pt",
+)
+# Decode
+text = tokenizer.decode(ids, skip_special_tokens=True)
+# Get eos token id (use as document separator when packing)
+eot_id = tokenizer.eos_token_id
+""")

tokenizer_walkthrough.md ADDED Viewed

	@@ -0,0 +1,105 @@

+# Walkthrough: SLLM Custom BPE Tokenizer
+This document explains the architecture, execution pipeline, and design choices of the custom **Byte-Pair Encoding (BPE)** tokenizer implemented in the `tokenizer/` directory of the `sllm` project.
+---
+## 🏗️ Overall Architecture & Pipeline
+The SLLM tokenizer is a custom-built BPE tokenizer tailored for pre-training small language models on the educational subset of HuggingFace's **FineWeb-Edu** dataset. It integrates custom text normalization, a regex-based pre-tokenization strategy, standard BPE training with byte-level fallback, and packaging utility scripts for high-performance training.
+```mermaid
+graph TD
+    A[Raw Text Stream] --> B[normalizer.py: Normalization]
+    B --> C[pretokenizer.py: Custom Regex Split]
+    C --> D[bpe.py: Byte-Level Encoding]
+    D --> E[traintokenizer.py: BPE Trainer]
+    E --> F[post_processor.py: Template Post-Processing]
+    F --> G[wrap_tokenizer.py: PreTrainedTokenizerFast Wrapper]
+    G --> H[tokenize_dataset.py: Packed binary .bin Shards]
+```
+---
+## 📁 Component-by-Component Breakdown
+### 1. `normalizer.py` (Text Normalization)
+Before any splitting occurs, the raw input text is standardized and cleaned to eliminate noise while preserving syntax and code structure:
+* **HTML Stripping & Decoding**: Removes HTML tags using regex and decodes HTML entities (e.g., `&amp;` $\rightarrow$ `&`).
+* **Unicode Normalization**: Performs **NFC** normalization to ensure characters like accented letters are represented consistently.
+* **Noise Removal**: Eliminates raw control characters, zero-width characters (e.g., zero-width spaces/joiners), and the Unicode replacement character (`\ufffd`).
+* **Whitespace Control**:
+  * Collapses multiple consecutive spaces into a single space (preserving leading tabs for code indentation).
+  * Strips trailing whitespace at the end of lines.
+  * Collapses 3 or more consecutive newlines into exactly two newlines (`\n\n`) to preserve paragraph structure.
+---
+### 2. `pretokenizer.py` (Custom Regex Segmentation)
+Instead of relying on standard GPT-2/Llama pre-tokenization, this model implements a custom, ordered, priority-based regex pre-tokenizer:
+1. **Contractions**: `'s`, `'t`, `'re`, `'ve`, `'ll`, `'m`, `'d`.
+2. **Abbreviations**: Acronyms and shorthand (e.g., `U.S.A`, `e.g.`, `Ph.D`).
+3. **Scientific Notation**: E.g., `1.5e-3`, `3e10`, `2.0E+4` (evaluated *before* decimals to avoid splitting).
+4. **Decimal Numbers**: E.g., `3.14` (evaluated *before* integers).
+5. **Integers**: E.g., `42`, `1984`.
+6. **Multi-character Operators**: Common coding operators like `==`, `!=`, `->`, `<=`, `>=`, `**`, `//`, `+=`, `-=`, `*=`, `/=`.
+7. **Snake Case Identifiers**: E.g., `snake_case`, `_private` (evaluated *before* plain words for clean code representation).
+8. **Regular Unicode Words**: Alphanumeric words covering non-English languages.
+9. **Whitespace**: Preserves sequences of spaces/tabs separately from newlines to keep structural formatting.
+10. **Punctuation Catch-all**: Individual punctuation characters.
+> [!NOTE]
+> The pre-tokenizer uses HuggingFace's `Split` pre-tokenizer with `behavior="isolated"` and `invert=True`, meaning matched strings are isolated and kept as distinct, individual tokens instead of being discarded as delimiters.
+---
+### 3. `bpe.py` (BPE Model Configuration)
+Defines the base tokenizer pipeline:
+* **Byte Fallback**: Configures the BPE model with `unk_token=None` and `byte_fallback=True`. This guarantees that *every* character maps to at least one byte-level token, resulting in **zero out-of-vocabulary (OOV)** issues.
+* **Pre-Tokenizer Chain**: Sequentially runs the custom Regex pre-tokenizer followed by `ByteLevel(add_prefix_space=False)` to translate character segments to their corresponding byte values.
+* **Decoder**: Instantiates the standard `ByteLevelDecoder` to reverse byte conversions, allowing human-readable decoded strings.
+* **Trainer Config**: Builds a `BpeTrainer` specifying a vocabulary of `32,000` tokens, minimum merge frequency of `3`, and initial alphabet containing all `256` bytes to enforce the fallback capability.
+---
+### 4. `post_processor.py` (Sequence Endings)
+Once BPE rules have been learned and vocabulary IDs are assigned:
+* Attaches `TemplateProcessing` to automatically append `<|endoftext|>` to every sequence.
+* For single documents, it maps to `[tokens...] <|endoftext|>`.
+* For sequence pairs (useful in downstream tasks like Question-Answering), it automatically maps to `[tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>`.
+---
+### 5. `traintokenizer.py` (BPE Training Loop)
+* Streams the `train` split of the `CC-MAIN-2014-49` subset of `HuggingFaceFW/fineweb-edu`.
+* Filters out low-quality documents (requires educational score `int_score >= 3`) and documents shorter than 100 characters.
+* Feeds documents iteratively into BPE training via `train_from_iterator()`.
+* Adds the post-processor and runs comprehensive verification checks against edge cases (equations, scientific numbers, code snippets, byte fallbacks, and contractions).
+---
+### 6. `wrap_tokenizer.py` (HuggingFace Integration)
+Wraps the trained HuggingFace BPE model into `PreTrainedTokenizerFast` from `transformers`:
+* Associates `<|endoftext|>` as the `bos_token`, `eos_token`, and `pad_token`.
+* Enables compatibility with the `datasets.map()` bulk utility, the HuggingFace Trainer, and PyTorch dataloaders.
+* Standardizes right-padding, right-truncation, and context length configurations (`model_max_length=1024`).
+---
+### 7. `tokenize_dataset.py` (Dataset Packing)
+A highly optimized bulk-tokenization utility:
+* Tokenizes the streamed FineWeb-Edu dataset up to a target cap (e.g., `3.2` Billion tokens).
+* Performs a 99% train and 1% validation split (every 100th document is routed to the validation buffer).
+* Concatenates/packs documents sequentially (using `<|endoftext|>` as the document boundary) and writes them to disk as high-performance flat binary shards (`.bin` files of `np.uint16` type).
+* Standard shard size is `100,000,000` tokens.
+* Provides a memory-mapped helper `load_shard(split, shard_idx)` using `np.memmap` so that models can stream training batches without loading multi-gigabyte files into RAM.
+---
+## 💡 Key Design Highlights
+> [!TIP]
+> **Why Byte Fallback is Critical**: By initializing the alphabet with 256 unique byte values and enabling fallback, characters like math symbols ($\nabla$) or emojis don't fail or return an `<unk>` token; instead, they represent themselves as their raw UTF-8 bytes (e.g., $\nabla$ is parsed perfectly as `<0xE2><0x88><0x87>`).
+> [!TIP]
+> **Code-Aware Features**: The combination of preserving leading tabs in `normalizer.py`, isolating multi-character operators (`==`, `!=`, etc.), and protecting `snake_case` variables guarantees high-fidelity, compact token representation when the language model is trained on code.

train.py ADDED Viewed

	@@ -0,0 +1,485 @@

+"""
+train.py — SLLM Training Loop
+Supports:
+  --max_steps N       Run for exactly N steps then save checkpoint and exit.
+                      Omit to train indefinitely (until Ctrl+C or data exhausted).
+  --resume            Resume from the latest checkpoint in --run_dir.
+  --config 100M|150M  Choose model config (default: 100M).
+  --synthetic         Use synthetic data (for testing without real shards).
+Features:
+  - bf16/fp16 mixed precision (autocast); GradScaler is enabled only for fp16
+  - Gradient accumulation: --grad_accum N steps per optimizer update
+  - Gradient checkpointing: --grad_checkpoint to save VRAM
+  - Cosine LR schedule with linear warmup
+  - Checkpoint save every --save_every steps (and on clean exit/Ctrl+C)
+  - Metric logging to <run_dir>/train_log.jsonl (one JSON line per log step)
+  - Real-time terminal progress with tqdm
+Recommended for RTX 3050 4GB:
+  python train.py --config 100M --batch_size 4 --grad_accum 8 \\
+                  --grad_checkpoint --max_steps 1000
+Run for N steps, stop, then resume:
+  python train.py --max_steps 500 --run_dir runs/my_run
+  python train.py --max_steps 500 --run_dir runs/my_run --resume
+"""
+import os
+import sys
+import json
+import math
+import time
+import signal
+import argparse
+import torch
+import torch.nn.functional as F
+from torch.amp import autocast, GradScaler
+from tqdm import tqdm
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from model.config import SLLM_100M, SLLM_150M, ModelConfig
+from model.model  import SLLM
+from data.dataloader import build_dataloader
+# ------------------------------------------------------------------ #
+#  ARG PARSING
+# ------------------------------------------------------------------ #
+def parse_args():
+    p = argparse.ArgumentParser(description="SLLM Training Loop")
+    # Run management
+    p.add_argument("--run_dir",      type=str,   default="runs/run_001", help="Directory for checkpoints and logs")
+    p.add_argument("--run_name",     type=str,   default=None,           help="Override run name (defaults to run_dir basename)")
+    p.add_argument("--resume",       action="store_true",                 help="Resume from latest checkpoint in run_dir")
+    p.add_argument("--max_steps",    type=int,   default=None,           help="Absolute step target — stop when step reaches this number.")
+    p.add_argument("--extra_steps",  type=int,   default=None,           help="Run N MORE steps from current checkpoint (relative). Converted to --max_steps internally.")
+    # Model
+    p.add_argument("--config",     type=str,   default="100M",         choices=["100M", "150M"])
+    # Data
+    p.add_argument("--data_dir",   type=str,   default="tokenizer/data")
+    p.add_argument("--synthetic",  action="store_true",                 help="Use synthetic random data (for testing)")
+    p.add_argument("--num_workers",type=int,   default=2)
+    # Training
+    p.add_argument("--batch_size",    type=int,   default=4,    help="Per-device batch size")
+    p.add_argument("--grad_accum",    type=int,   default=8,    help="Gradient accumulation steps")
+    p.add_argument("--max_lr",        type=float, default=3e-4)
+    p.add_argument("--min_lr",        type=float, default=3e-5)
+    p.add_argument("--warmup_steps",  type=int,   default=100)
+    p.add_argument("--weight_decay",  type=float, default=0.1)
+    p.add_argument("--grad_clip",     type=float, default=1.0,  help="Gradient clipping norm (0 = disabled)")
+    # Memory
+    p.add_argument("--grad_checkpoint", action="store_true",    help="Enable gradient checkpointing (saves VRAM, slower)")
+    p.add_argument("--dtype",           type=str, default="bf16", choices=["fp32", "fp16", "bf16"])
+    # Logging / Saving
+    p.add_argument("--log_every",  type=int,   default=10,   help="Log metrics every N optimizer steps")
+    p.add_argument("--save_every", type=int,   default=500,  help="Save checkpoint every N optimizer steps")
+    p.add_argument("--val_every",  type=int,   default=250,  help="Run validation every N optimizer steps")
+    p.add_argument("--val_steps",  type=int,   default=20,   help="Number of val batches to average")
+    return p.parse_args()
+# ------------------------------------------------------------------ #
+#  LEARNING RATE SCHEDULE
+# ------------------------------------------------------------------ #
+def get_lr(step: int, warmup_steps: int, total_steps: int, max_lr: float, min_lr: float) -> float:
+    """
+    Linear warmup then cosine decay.
+    If total_steps is None (training indefinitely), uses a fixed 10k step decay window.
+    """
+    # Linear warmup
+    if step < warmup_steps:
+        return max_lr * (step + 1) / warmup_steps
+    # After decay: hold at min_lr
+    decay_steps = total_steps if total_steps else 10_000
+    if step >= decay_steps:
+        return min_lr
+    # Cosine decay
+    progress = (step - warmup_steps) / max(1, decay_steps - warmup_steps)
+    coeff    = 0.5 * (1.0 + math.cos(math.pi * progress))
+    return min_lr + coeff * (max_lr - min_lr)
+# ------------------------------------------------------------------ #
+#  OPTIMIZER (AdamW with selective weight decay)
+# ------------------------------------------------------------------ #
+def build_optimizer(model: SLLM, lr: float, weight_decay: float) -> torch.optim.AdamW:
+    """
+    AdamW with weight decay applied only to 2D params (Linear weights).
+    Excludes: embeddings, norms (RMSNorm weight vectors), biases.
+    This is the standard approach from GPT-2/NanoGPT.
+    """
+    decay_params    = []
+    no_decay_params = []
+    for name, param in model.named_parameters():
+        if not param.requires_grad:
+            continue
+        # 2D tensors (weight matrices) get weight decay
+        if param.dim() >= 2:
+            decay_params.append(param)
+        else:
+            # 1D: norm weights, biases, embeddings
+            no_decay_params.append(param)
+    optim_groups = [
+        {"params": decay_params,    "weight_decay": weight_decay},
+        {"params": no_decay_params, "weight_decay": 0.0},
+    ]
+    n_decay    = sum(p.numel() for p in decay_params)
+    n_no_decay = sum(p.numel() for p in no_decay_params)
+    print(f"  Optimizer: {n_decay/1e6:.1f}M decay params | {n_no_decay/1e6:.1f}M no-decay params")
+    return torch.optim.AdamW(optim_groups, lr=lr, betas=(0.9, 0.95), eps=1e-8, fused=True)
+# ------------------------------------------------------------------ #
+#  CHECKPOINT SAVE / LOAD
+# ------------------------------------------------------------------ #
+def save_checkpoint(path: str, model: SLLM, optimizer, step: int, args, loss: float):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    torch.save({
+        "step":                step,
+        "model_state_dict":    model.state_dict(),
+        "optimizer_state_dict": optimizer.state_dict(),
+        "loss":                loss,
+        "config_name":         args.config,
+    }, path)
+    print(f"\n  [CKPT] Saved checkpoint: {path}  (step={step}, loss={loss:.4f})")
+def load_checkpoint(run_dir: str, model: SLLM, optimizer, device):
+    """Loads the latest checkpoint from run_dir. Returns step number."""
+    ckpts = sorted([
+        f for f in os.listdir(run_dir)
+        if f.startswith("ckpt_") and f.endswith(".pt")
+    ])
+    if not ckpts:
+        raise FileNotFoundError(f"No checkpoints found in {run_dir}")
+    path  = os.path.join(run_dir, ckpts[-1])
+    ckpt  = torch.load(path, map_location=device, weights_only=False)
+    model.load_state_dict(ckpt["model_state_dict"])
+    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
+    step = ckpt["step"]
+    loss = ckpt.get("loss", float("nan"))
+    print(f"  [CKPT] Resumed from: {path}  (step={step}, loss={loss:.4f})")
+    return step
+# ------------------------------------------------------------------ #
+#  VALIDATION
+# ------------------------------------------------------------------ #
+@torch.no_grad()
+def estimate_val_loss(model, val_loader, val_steps: int, device, dtype_ctx) -> float:
+    model.eval()
+    losses = []
+    for i, (x, y) in enumerate(val_loader):
+        if i >= val_steps:
+            break
+        x, y = x.to(device), y.to(device)
+        with dtype_ctx:
+            _, loss = model(x, y)
+        losses.append(loss.item())
+    model.train()
+    return sum(losses) / len(losses) if losses else float("nan")
+# ------------------------------------------------------------------ #
+#  METRIC LOGGING
+# ------------------------------------------------------------------ #
+class MetricLogger:
+    """Appends one JSON line per step to train_log.jsonl."""
+    def __init__(self, log_path: str):
+        self.log_path = log_path
+        os.makedirs(os.path.dirname(log_path), exist_ok=True)
+        # Don't clear existing log when resuming — append
+        print(f"  [LOG] Logging to: {log_path}")
+    def log(self, **kwargs):
+        with open(self.log_path, "a") as f:
+            f.write(json.dumps(kwargs) + "\n")
+# ------------------------------------------------------------------ #
+#  MAIN TRAINING LOOP
+# ------------------------------------------------------------------ #
+def train():
+    args   = parse_args()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"\nDevice  : {device}")
+    if device.type == "cuda":
+        print(f"GPU     : {torch.cuda.get_device_name(0)}")
+        print(f"VRAM    : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+    # ---- dtype context --------------------------------------------- #
+    if args.dtype == "bf16" and device.type == "cuda" and torch.cuda.is_bf16_supported():
+        dtype_torch = torch.bfloat16
+        dtype_name  = "bf16"
+    elif args.dtype == "fp16" and device.type == "cuda":
+        dtype_torch = torch.float16
+        dtype_name  = "fp16"
+    else:
+        dtype_torch = torch.float32
+        dtype_name  = "fp32"
+    print(f"dtype   : {dtype_name}")
+    use_amp   = dtype_torch in (torch.float16, torch.bfloat16)
+    dtype_ctx = autocast(device_type=device.type, dtype=dtype_torch) if use_amp else torch.no_grad().__class__()
+    scaler    = GradScaler(enabled=(dtype_torch == torch.float16))  # bf16 doesn't need scaler
+    # ---- Auto-detect config on resume ------------------------------ #
+    if args.resume:
+        try:
+            ckpts = sorted([
+                f for f in os.listdir(args.run_dir)
+                if f.startswith("ckpt_") and f.endswith(".pt")
+            ])
+            if ckpts:
+                ckpt_path = os.path.join(args.run_dir, ckpts[-1])
+                _tmp_ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+                if "config_name" in _tmp_ckpt and _tmp_ckpt["config_name"] != args.config:
+                    print(f"  [CKPT] Auto-switching config from '{args.config}' to '{_tmp_ckpt['config_name']}' to match checkpoint.")
+                    args.config = _tmp_ckpt["config_name"]
+                del _tmp_ckpt
+        except Exception:
+            pass
+    # ---- Model ----------------------------------------------------- #
+    cfg_map = {"100M": SLLM_100M, "150M": SLLM_150M}
+    cfg     = cfg_map[args.config]
+    model   = SLLM(cfg).to(device)
+    if args.grad_checkpoint:
+        model.enable_gradient_checkpointing()
+        print("  Gradient checkpointing: ON")
+    print(f"\nModel   : SLLM-{args.config}  ({model.count_params()/1e6:.1f}M params)")
+    print(f"Config  : {cfg}")
+    # ---- Optimizer ------------------------------------------------- #
+    optimizer = build_optimizer(model, lr=args.max_lr, weight_decay=args.weight_decay)
+    # ---- Data ------------------------------------------------------ #
+    train_loader = build_dataloader(
+        data_dir       = args.data_dir,
+        split          = "train",
+        context_length = cfg.context_length,
+        batch_size     = args.batch_size,
+        num_workers    = args.num_workers,
+        use_synthetic  = args.synthetic,
+        vocab_size     = cfg.vocab_size,
+    )
+    val_loader = build_dataloader(
+        data_dir       = args.data_dir,
+        split          = "val",
+        context_length = cfg.context_length,
+        batch_size     = args.batch_size,
+        num_workers    = 0,
+        use_synthetic  = args.synthetic,
+        vocab_size     = cfg.vocab_size,
+    )
+    # ---- Run directory --------------------------------------------- #
+    os.makedirs(args.run_dir, exist_ok=True)
+    log_path = os.path.join(args.run_dir, "train_log.jsonl")
+    logger   = MetricLogger(log_path)
+    # ---- Resume ---------------------------------------------------- #
+    start_step = 0
+    if args.resume:
+        try:
+            start_step = load_checkpoint(args.run_dir, model, optimizer, device)
+        except FileNotFoundError as e:
+            print(f"  [WARN] {e} — starting from scratch.")
+    # ---- Effective batch size info --------------------------------- #
+    eff_batch = args.batch_size * args.grad_accum
+    tokens_per_step = eff_batch * cfg.context_length
+    print(f"\nTraining:")
+    # ---- Resolve extra_steps -> max_steps -------------------------- #
+    if args.extra_steps is not None:
+        if args.max_steps is not None:
+            print("  [WARN] Both --extra_steps and --max_steps given. --extra_steps takes priority.")
+        args.max_steps = start_step + args.extra_steps
+        print(f"  [INFO] --extra_steps {args.extra_steps} → running until step {args.max_steps}")
+    print(f"  batch_size      : {args.batch_size} (grad_accum={args.grad_accum} -> effective={eff_batch})")
+    print(f"  tokens/step     : {tokens_per_step:,}")
+    print(f"  max_steps       : {args.max_steps or 'unlimited'} (absolute step target)")
+    print(f"  start_step      : {start_step}")
+    print(f"  steps to run    : {(args.max_steps - start_step) if args.max_steps else 'unlimited'}")
+    print(f"  save_every      : {args.save_every}")
+    print(f"  log_every       : {args.log_every}")
+    # ---- Early exit if already past max_steps ---------------------- #
+    if args.max_steps is not None and start_step >= args.max_steps:
+        print(f"\n  [WARN] start_step ({start_step}) >= max_steps ({args.max_steps}).")
+        print(f"         Nothing to train. Use --extra_steps N to run N more steps.")
+        print(f"\nExample: python train.py --resume --run_dir {args.run_dir} --extra_steps 5000")
+        return
+    # ---- Graceful Ctrl+C handler ----------------------------------- #
+    stop_flag = {"stop": False}
+    def _signal_handler(sig, frame):
+        print("\n  [SIGNAL] Ctrl+C received — will save checkpoint and exit after current step.")
+        stop_flag["stop"] = True
+    signal.signal(signal.SIGINT, _signal_handler)
+    # ---- Training loop --------------------------------------------- #
+    model.train()
+    step           = start_step
+    micro_step     = 0      # within grad_accum window
+    running_loss   = 0.0    # accumulated for logging
+    t_start        = time.time()
+    t_step_start   = time.time()
+    data_iter      = iter(train_loader)
+    print(f"\n{'='*60}")
+    print(f"  TRAINING STARTED  (step {step} -> {args.max_steps or '∞'})")
+    print(f"{'='*60}\n")
+    pbar = tqdm(
+        initial=step,
+        total=args.max_steps,
+        desc="Training",
+        unit="step",
+        dynamic_ncols=True,
+    )
+    while True:
+        # ---- Stop conditions --------------------------------------- #
+        if stop_flag["stop"]:
+            break
+        if args.max_steps is not None and step >= args.max_steps:
+            print(f"\n  [DONE] Reached max_steps={args.max_steps}")
+            break
+        optimizer.zero_grad(set_to_none=True)
+        accum_loss = 0.0
+        # ---- Gradient accumulation micro-steps --------------------- #
+        for micro in range(args.grad_accum):
+            # Get next batch
+            try:
+                x, y = next(data_iter)
+            except StopIteration:
+                data_iter = iter(train_loader)
+                x, y = next(data_iter)
+            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+            # Forward + loss (inside AMP context)
+            with autocast(device_type=device.type, dtype=dtype_torch, enabled=use_amp):
+                logits, loss = model(x, y)
+                # Scale loss by grad_accum so gradients average correctly
+                loss = loss / args.grad_accum
+            # Backward
+            scaler.scale(loss).backward()
+            accum_loss += loss.item()
+        # ---- Gradient clipping ------------------------------------- #
+        if args.grad_clip > 0:
+            scaler.unscale_(optimizer)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+        else:
+            grad_norm = float("nan")
+        # ---- LR update --------------------------------------------- #
+        lr = get_lr(step, args.warmup_steps, args.max_steps, args.max_lr, args.min_lr)
+        for pg in optimizer.param_groups:
+            pg["lr"] = lr
+        # ---- Optimizer step ---------------------------------------- #
+        scaler.step(optimizer)
+        scaler.update()
+        step += 1
+        running_loss = accum_loss   # loss for this step
+        # ---- Tokens per second ------------------------------------- #
+        t_now       = time.time()
+        elapsed     = t_now - t_step_start
+        t_step_start = t_now
+        tok_per_sec  = tokens_per_step / max(elapsed, 1e-6)
+        # ---- Progress bar update ----------------------------------- #
+        pbar.update(1)
+        pbar.set_postfix({
+            "loss": f"{running_loss:.4f}",
+            "lr":   f"{lr:.2e}",
+            "tok/s": f"{tok_per_sec:.0f}",
+        })
+        # ---- Logging ----------------------------------------------- #
+        if step % args.log_every == 0:
+            log_entry = {
+                "step":        step,
+                "loss":        round(running_loss, 6),
+                "lr":          lr,
+                "grad_norm":   round(float(grad_norm), 4) if not math.isnan(float(grad_norm)) else None,
+                "tok_per_sec": round(tok_per_sec, 1),
+                "elapsed_s":   round(t_now - t_start, 1),
+            }
+            if device.type == "cuda":
+                log_entry["vram_gb"] = round(torch.cuda.memory_allocated() / 1e9, 3)
+            logger.log(**log_entry)
+        # ---- Validation -------------------------------------------- #
+        if step % args.val_every == 0:
+            val_loss = estimate_val_loss(model, val_loader, args.val_steps, device, autocast(device_type=device.type, dtype=dtype_torch, enabled=use_amp))
+            tqdm.write(f"  [STEP {step:6d}] train_loss={running_loss:.4f}  val_loss={val_loss:.4f}  lr={lr:.2e}")
+            logger.log(step=step, val_loss=round(val_loss, 6))
+        # ---- Checkpoint -------------------------------------------- #
+        if step % args.save_every == 0:
+            ckpt_path = os.path.join(args.run_dir, f"ckpt_{step:07d}.pt")
+            save_checkpoint(ckpt_path, model, optimizer, step, args, running_loss)
+    # ---- Final checkpoint on exit (only if we actually ran steps) -- #
+    pbar.close()
+    steps_done = step - start_step
+    if steps_done > 0:
+        ckpt_path = os.path.join(args.run_dir, f"ckpt_{step:07d}.pt")
+        save_checkpoint(ckpt_path, model, optimizer, step, args, running_loss)
+    else:
+        print("\n  [SKIP] No steps were taken — skipping final checkpoint save.")
+    total_time = time.time() - t_start
+    print(f"\n{'='*60}")
+    print(f"  TRAINING COMPLETE")
+    print(f"{'='*60}")
+    print(f"  Steps completed  : {step - start_step}")
+    print(f"  Final loss       : {running_loss:.4f}")
+    print(f"  Total time       : {total_time/60:.1f} min")
+    print(f"  Run dir          : {args.run_dir}")
+    print(f"\nTo resume: python train.py --resume --run_dir {args.run_dir} --max_steps <N>")
+    print(f"To plot  : python plot_training.py --run_dir {args.run_dir}")
+if __name__ == "__main__":  # run training only when executed as a script, not on import
+    train()