dasdasddds committed on
Commit 93783dd · verified · 1 Parent(s): 057e18a

Upload 16 files

Files changed (16)
  1. .gitattributes +1 -32
  2. README.md +193 -0
  3. WEIGHTS_GO_HERE.txt +3 -0
  4. chat.py +339 -0
  5. config.json +20 -0
  6. config.py +157 -0
  7. dataset.py +269 -0
  8. model.py +513 -0
  9. requirements.txt +3 -0
  10. special_tokens_map.json +12 -0
  11. tokenizer.py +344 -0
  12. tokenizer_config.json +11 -0
  13. train.py +456 -0
  14. visual_nn_3d.py +387 -0
  15. visual_nn_nodes.py +395 -0
  16. visualize_nn.py +472 -0
.gitattributes CHANGED
@@ -1,35 +1,4 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
  *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,193 @@
+ ---
+ license: mit
+ language:
+ - en
+ tags:
+ - text-generation
+ - from-scratch
+ - transformer
+ - gpt
+ - pytorch
+ - chatbot
+ pipeline_tag: text-generation
+ model-index:
+ - name: GPT-300M
+   results: []
+ ---
+
+ # GPT-300M
+
+ A **334,808,064 parameter** autoregressive transformer language model built **entirely from scratch** in PyTorch. No pretrained weights. No fine-tuning. Everything from zero.
+
+ ## Architecture
+
+ ```
+ Input Token IDs
+        ↓
+ Token Embedding (32,000 × 1,024) — 32.8M params
+        ↓
+ Rotary Position Embeddings (RoPE) — 0 learned params
+        ↓
+ ┌───────────────────────────────────────────────┐
+ │  Transformer Block × 24 layers (12.6M each)   │
+ │                                               │
+ │  RMSNorm → Multi-Head Attention → ⊕ Residual  │
+ │            16 heads × 64d                     │
+ │            4,194,304 params                   │
+ │                                               │
+ │  RMSNorm → FFN (GELU) → ⊕ Residual            │
+ │            1,024 → 4,096 → 1,024              │
+ │            8,388,608 params                   │
+ └───────────────────────────────────────────────┘
+        ↓
+ Final RMSNorm
+        ↓
+ LM Head (weight-tied with embedding) — 0 extra params
+        ↓
+ Softmax → Next Token Probabilities
+ ```
+
+ ## Parameter Breakdown
+
+ | Component | Parameters | Percentage |
+ |---|---:|---:|
+ | Token Embedding | 32,768,000 | 9.8% |
+ | Attention Layers (×24) | 100,663,296 | 30.1% |
+ | Feed-Forward Layers (×24) | 201,326,592 | 60.1% |
+ | RMSNorm (2 per layer ×24, + final) | 50,176 | 0.0% |
+ | LM Head | 0 (tied) | — |
+ | **TOTAL** | **334,808,064** | **100%** |
+
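+ As a quick sanity check, the total can be reproduced from the hyperparameters alone. A minimal sketch, assuming weight tying, no biases, and RoPE (no learned positions), exactly as configured above:
+
+ ```python
+ # Illustrative sanity check — not part of the training code.
+ d_model, n_layers, d_ff, vocab = 1024, 24, 4096, 32000
+
+ embedding = vocab * d_model                   # 32,768,000 (shared with the LM head)
+ attention = 4 * d_model * d_model * n_layers  # Q/K/V/O projections: 100,663,296
+ ffn       = 2 * d_model * d_ff * n_layers     # up + down projections: 201,326,592
+ norms     = (2 * n_layers + 1) * d_model      # two RMSNorms per block + final: 50,176
+
+ print(f"{embedding + attention + ffn + norms:,}")  # 334,808,064
+ ```
+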
+ ## Model Details
+
+ | Hyperparameter | Value |
+ |---|---|
+ | Hidden dimension (d_model) | 1,024 |
+ | Attention heads | 16 |
+ | Head dimension | 64 |
+ | Transformer layers | 24 |
+ | FFN dimension (d_ff) | 4,096 |
+ | Vocabulary size | 32,000 |
+ | Max sequence length | 2,048 |
+ | Position encoding | RoPE (θ=10,000) |
+ | Activation | GELU |
+ | Normalization | RMSNorm (ε=1e-5) |
+ | Weight tying | Yes (Embed ↔ LM Head) |
+ | Bias | None |
+
+ ## Training Configuration
+
+ | Setting | Value |
+ |---|---|
+ | Optimizer | AdamW (β₁=0.9, β₂=0.95) |
+ | Peak learning rate | 3e-4 |
+ | Min learning rate | 3e-5 |
+ | Schedule | Cosine decay + linear warmup |
+ | Warmup steps | 2,000 |
+ | Weight decay | 0.1 |
+ | Batch size | 32 × 8 gradient accumulation |
+ | Max training steps | 600,000 |
+ | Precision | bfloat16 |
+ | Gradient clipping | 1.0 |
+
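+ The schedule above warms up linearly for 2,000 steps, then follows a cosine curve from the peak rate down to the minimum. A minimal sketch of that curve (the standalone function below is illustrative; the real implementation lives in `train.py`):
+
+ ```python
+ import math
+
+ def lr_at(step, peak=3e-4, floor=3e-5, warmup=2_000, max_steps=600_000):
+     """Linear warmup to `peak`, then cosine decay to `floor`."""
+     if step < warmup:
+         return peak * step / warmup
+     progress = (step - warmup) / (max_steps - warmup)
+     return floor + 0.5 * (peak - floor) * (1 + math.cos(math.pi * progress))
+ ```
+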
+ ## Usage
+
+ ### Loading the Model
+
+ ```python
+ from model import GPT300M
+ from config import GPT300MConfig
+ from tokenizer import BPETokenizer
+ import torch
+
+ # Load config, model, and tokenizer
+ config = GPT300MConfig()
+ model = GPT300M(config)
+
+ # Load trained weights
+ checkpoint = torch.load("pytorch_model.bin", map_location="cpu")
+ model.load_state_dict(checkpoint)
+ model.eval()
+
+ # Load tokenizer
+ tokenizer = BPETokenizer.load("tokenizer.json")
+ ```
+
+ ### Chat with the Model
+
+ ```python
+ from chat import ChatBot
+
+ chatbot = ChatBot(model, tokenizer, config)
+ response = chatbot.chat("Hello! What is machine learning?")
+ print(response)
+ ```
+
+ ### Interactive Chat
+
+ ```bash
+ python chat.py --checkpoint pytorch_model.bin
+ ```
+
+ ### Training from Scratch
+
+ ```bash
+ # Quick test (tiny model)
+ python train.py --tiny
+
+ # Full 300M model
+ python train.py --data your_training_data.txt
+
+ # Multi-GPU
+ torchrun --nproc_per_node=4 train.py --data your_data.txt
+ ```
+
+ ## Files
+
+ | File | Description |
+ |---|---|
+ | `config.json` | Model configuration (HuggingFace format) |
+ | `config.py` | Python config class with all hyperparameters |
+ | `model.py` | Full transformer architecture (RoPE, MHA, FFN, KV-cache) |
+ | `tokenizer.py` | BPE tokenizer built from scratch |
+ | `tokenizer_config.json` | Tokenizer settings |
+ | `special_tokens_map.json` | Special token definitions |
+ | `dataset.py` | Dataset classes and data loading |
+ | `train.py` | Training loop (DDP, mixed precision, scheduling) |
+ | `chat.py` | Interactive chatbot with streaming generation |
+ | `visual_nn_3d.py` | 3D matplotlib architecture visualization |
+ | `requirements.txt` | Python dependencies |
+ | `pytorch_model.bin` | Trained weights *(upload after training)* |
+ | `tokenizer.json` | Trained tokenizer *(upload after training)* |
+
+ ## Hardware Requirements
+
+ | Config | GPU Memory | Est. Training Time |
+ |---|---|---|
+ | Tiny (debug) | ~1 GB | Minutes |
+ | Full 300M | ~24 GB | ~3-5 days (4×A100) |
+
+ ## Key Features
+
+ - **100% from scratch** — no pretrained weights, no HuggingFace Transformers dependency
+ - **Rotary Position Embeddings** — better length generalization than learned positions
+ - **RMSNorm** — faster than LayerNorm, equally effective
+ - **Flash Attention** — via PyTorch 2.0 SDPA (see the sketch after this list)
+ - **KV-Cache** — efficient autoregressive generation
+ - **Weight tying** — saves ~33M parameters
+ - **Chat template** — built-in support for multi-turn conversations
+ - **torch.compile** — ready for PyTorch 2.0+ compilation
+
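+ The Flash Attention bullet refers to PyTorch's fused `scaled_dot_product_attention` kernel, which `model.py` calls when it is available. The core call looks like this (a standalone sketch with toy shapes, not a line from the repo):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ # [batch, heads, seq_len, head_dim] — toy shapes for illustration
+ q = k = v = torch.randn(1, 16, 128, 64)
+ out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+ print(out.shape)  # torch.Size([1, 16, 128, 64])
+ ```
+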
+ ## Citation
+
+ ```bibtex
+ @misc{gpt300m,
+   title={GPT-300M: A 300-Million Parameter Language Model From Scratch},
+   year={2025},
+   url={https://huggingface.co/YOUR_USERNAME/gpt-300m}
+ }
+ ```
+
+ ## License
+
+ MIT
WEIGHTS_GO_HERE.txt ADDED
@@ -0,0 +1,3 @@
+ PLACEHOLDER - Replace this file with your trained model weights after training.
+ Run: python train.py --data your_data.txt
+ Then: torch.save(checkpoint['model_state_dict'], 'pytorch_model.bin')
chat.py ADDED
@@ -0,0 +1,339 @@
+ """
+ GPT-300M Chatbot Interface
+ ============================
+ Interactive terminal chatbot using a trained GPT-300M model.
+
+ Usage:
+     python chat.py --checkpoint ./checkpoints/best_model.pt
+
+     # Or with custom generation parameters:
+     python chat.py --checkpoint ./checkpoints/best_model.pt \
+         --temperature 0.8 --top_k 40 --max_tokens 256
+ """
+
+ import argparse
+ import os
+ import sys
+ import time
+ from typing import List, Dict, Optional
+
+ import torch
+
+ from config import GPT300MConfig
+ from model import GPT300M
+ from tokenizer import BPETokenizer
+
+
+ class ChatBot:
+     """
+     Interactive chatbot powered by GPT-300M.
+
+     Maintains conversation history, handles tokenization/detokenization,
+     and performs autoregressive generation with KV-caching.
+     """
+
+     def __init__(
+         self,
+         model: GPT300M,
+         tokenizer: BPETokenizer,
+         config: GPT300MConfig,
+         device: str = "auto",
+     ):
+         self.config = config
+         self.tokenizer = tokenizer
+
+         # Device
+         if device == "auto":
+             if torch.cuda.is_available():
+                 self.device = "cuda"
+             elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                 self.device = "mps"
+             else:
+                 self.device = "cpu"
+         else:
+             self.device = device
+
+         self.model = model.to(self.device)
+         self.model.eval()
+
+         # Conversation state
+         self.history: List[Dict[str, str]] = []
+         self.system_prompt = config.system_prompt
+
+     def set_system_prompt(self, prompt: str):
+         """Set the system prompt for the conversation."""
+         self.system_prompt = prompt
+
+     def reset(self):
+         """Clear conversation history."""
+         self.history = []
+         print("\n✦ Conversation reset.\n")
+
+     def chat(
+         self,
+         user_message: str,
+         temperature: Optional[float] = None,
+         top_k: Optional[int] = None,
+         top_p: Optional[float] = None,
+         max_new_tokens: Optional[int] = None,
+         stream: bool = True,
+     ) -> str:
+         """
+         Send a message and get a response.
+
+         Args:
+             user_message: The user's input
+             temperature: Override sampling temperature
+             top_k: Override top-k
+             top_p: Override top-p
+             max_new_tokens: Override max generation length
+             stream: Whether to stream tokens to stdout
+
+         Returns:
+             The assistant's response text
+         """
+         # Explicit None checks so a legitimate zero (e.g. temperature=0 for
+         # greedy decoding) is not silently replaced by the config default.
+         temp = temperature if temperature is not None else self.config.temperature
+         k = top_k if top_k is not None else self.config.top_k
+         p = top_p if top_p is not None else self.config.top_p
+         max_tokens = max_new_tokens if max_new_tokens is not None else self.config.max_new_tokens
+
+         # Build conversation messages
+         messages = []
+         if self.system_prompt:
+             messages.append({"role": "system", "content": self.system_prompt})
+         messages.extend(self.history)
+         messages.append({"role": "user", "content": user_message})
+
+         # Tokenize
+         input_ids = self.tokenizer.encode_chat(messages, add_generation_prompt=True)
+         input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
+
+         # Check sequence length
+         if input_tensor.size(1) > self.config.max_seq_len - max_tokens:
+             # Truncate history if needed
+             while (
+                 len(self.history) > 0
+                 and input_tensor.size(1) > self.config.max_seq_len - max_tokens
+             ):
+                 self.history.pop(0)
+                 messages = []
+                 if self.system_prompt:
+                     messages.append({"role": "system", "content": self.system_prompt})
+                 messages.extend(self.history)
+                 messages.append({"role": "user", "content": user_message})
+                 input_ids = self.tokenizer.encode_chat(messages, add_generation_prompt=True)
+                 input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
+
+         # Generate
+         t0 = time.time()
+
+         if stream:
+             response_text = self._generate_streaming(
+                 input_tensor, max_tokens, temp, k, p
+             )
+         else:
+             with torch.no_grad():
+                 output_ids = self.model.generate(
+                     input_tensor,
+                     max_new_tokens=max_tokens,
+                     temperature=temp,
+                     top_k=k,
+                     top_p=p,
+                     repetition_penalty=self.config.repetition_penalty,
+                     eos_token_id=self.tokenizer.special_tokens.get("<|end|>"),
+                 )
+                 # Decode only the new tokens
+                 new_ids = output_ids[0, input_tensor.size(1):].tolist()
+                 response_text = self.tokenizer.decode(new_ids, skip_special=True)
+
+         dt = time.time() - t0
+         n_tokens = len(self.tokenizer.encode(response_text))
+
+         # Update history
+         self.history.append({"role": "user", "content": user_message})
+         self.history.append({"role": "assistant", "content": response_text.strip()})
+
+         if stream:
+             print(f"\n  [{n_tokens} tokens, {dt:.1f}s, {n_tokens/dt:.1f} tok/s]")
+
+         return response_text.strip()
+
+     @torch.no_grad()
+     def _generate_streaming(
+         self,
+         input_ids: torch.Tensor,
+         max_new_tokens: int,
+         temperature: float,
+         top_k: int,
+         top_p: float,
+     ) -> str:
+         """Generate tokens one at a time, printing as we go."""
+         import torch.nn.functional as F
+
+         model = self.model
+         model.eval()
+
+         eos_id = self.tokenizer.special_tokens.get("<|end|>")
+         end_id = self.tokenizer.special_tokens.get("<eos>")
+
+         # Initial forward pass
+         logits, _, kv_caches = model(input_ids, use_cache=True)
+
+         generated_ids = []
+         buffer = b""
+
+         for step in range(max_new_tokens):
+             next_logits = logits[:, -1, :]
+
+             # Repetition penalty
+             if self.config.repetition_penalty != 1.0:
+                 for tid in set(generated_ids):
+                     if next_logits[0, tid] > 0:
+                         next_logits[0, tid] /= self.config.repetition_penalty
+                     else:
+                         next_logits[0, tid] *= self.config.repetition_penalty
+
+             # Temperature + sampling
+             if temperature > 0:
+                 next_logits = next_logits / temperature
+                 if top_k > 0:
+                     topk_vals, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
+                     next_logits[next_logits < topk_vals[:, -1:]] = float("-inf")
+                 probs = F.softmax(next_logits, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+             else:
+                 next_token = next_logits.argmax(dim=-1, keepdim=True)
+
+             token_id = next_token.item()
+
+             # Check for stop tokens
+             if token_id in (eos_id, end_id):
+                 break
+
+             generated_ids.append(token_id)
+
+             # Decode and print the new token
+             token_bytes = self.tokenizer.vocab.get(token_id, b"")
+             buffer += token_bytes
+             try:
+                 text = buffer.decode("utf-8")
+                 sys.stdout.write(text)
+                 sys.stdout.flush()
+                 buffer = b""
+             except UnicodeDecodeError:
+                 pass  # Wait for more bytes
+
+             # Forward with KV-cache
+             position_offset = input_ids.size(1) + step
+             logits, _, kv_caches = model(
+                 next_token,
+                 kv_caches=kv_caches,
+                 use_cache=True,
+                 position_offset=position_offset,
+             )
+
+         # Flush remaining buffer
+         if buffer:
+             text = buffer.decode("utf-8", errors="replace")
+             sys.stdout.write(text)
+             sys.stdout.flush()
+
+         return self.tokenizer.decode(generated_ids, skip_special=True)
+
+
+ def interactive_chat(chatbot: ChatBot):
+     """Run an interactive chat session in the terminal."""
+     print("=" * 60)
+     print("  GPT-300M Chatbot")
+     print("  Type 'quit' to exit, 'reset' to clear history")
+     print("  Type 'system: <prompt>' to set system prompt")
+     print("=" * 60)
+     print()
+
+     while True:
+         try:
+             user_input = input("You: ").strip()
+         except (KeyboardInterrupt, EOFError):
+             print("\n\nGoodbye!")
+             break
+
+         if not user_input:
+             continue
+
+         if user_input.lower() == "quit":
+             print("Goodbye!")
+             break
+
+         if user_input.lower() == "reset":
+             chatbot.reset()
+             continue
+
+         if user_input.lower().startswith("system:"):
+             prompt = user_input[7:].strip()
+             chatbot.set_system_prompt(prompt)
+             print(f"✦ System prompt set: {prompt}\n")
+             continue
+
+         print("\nAssistant: ", end="", flush=True)
+         chatbot.chat(user_input, stream=True)
+         print()
+
+
+ def load_model(checkpoint_path: str, device: str = "auto"):
+     """Load a trained model from checkpoint."""
+     checkpoint = torch.load(checkpoint_path, map_location="cpu")
+
+     # Reconstruct config
+     config = GPT300MConfig(**checkpoint["config"])
+
+     # Load model
+     model = GPT300M(config)
+     model.load_state_dict(checkpoint["model_state_dict"])
+
+     # Load tokenizer (os is imported at module level so this also works on import)
+     tokenizer_path = os.path.join(
+         os.path.dirname(checkpoint_path), "tokenizer.json"
+     )
+     if os.path.exists(tokenizer_path):
+         tokenizer = BPETokenizer.load(tokenizer_path)
+     else:
+         tokenizer = BPETokenizer(vocab_size=config.vocab_size)
+         print("Warning: Tokenizer not found, using untrained tokenizer")
+
+     return model, tokenizer, config
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # MAIN
+ # ═══════════════════════════════════════════════════════════════════════
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="GPT-300M Chatbot")
+     parser.add_argument("--checkpoint", type=str, default=None,
+                         help="Path to model checkpoint")
+     parser.add_argument("--temperature", type=float, default=0.7)
+     parser.add_argument("--top_k", type=int, default=50)
+     parser.add_argument("--top_p", type=float, default=0.9)
+     parser.add_argument("--max_tokens", type=int, default=512)
+     parser.add_argument("--device", type=str, default="auto")
+     args = parser.parse_args()
+
+     if args.checkpoint and os.path.exists(args.checkpoint):
+         model, tokenizer, config = load_model(args.checkpoint, args.device)
+     else:
+         print("No checkpoint provided. Initializing random model for demo...")
+         from config import gpt_tiny
+         config = gpt_tiny()
+         model = GPT300M(config)
+         tokenizer = BPETokenizer(vocab_size=config.vocab_size)
+         # Quick train on minimal data
+         tokenizer.train("Hello! How are you? I am fine. " * 100)
+
+     config.temperature = args.temperature
+     config.top_k = args.top_k
+     config.top_p = args.top_p
+     config.max_new_tokens = args.max_tokens
+
+     chatbot = ChatBot(model, tokenizer, config, device=args.device)
+     interactive_chat(chatbot)
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "architectures": ["GPT300M"],
+   "model_type": "gpt-300m",
+   "vocab_size": 32000,
+   "max_position_embeddings": 2048,
+   "hidden_size": 1024,
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "intermediate_size": 4096,
+   "hidden_act": "gelu",
+   "dropout": 0.1,
+   "attention_dropout": 0.1,
+   "use_bias": false,
+   "tie_word_embeddings": true,
+   "rope_theta": 10000.0,
+   "rms_norm_eps": 1e-5,
+   "torch_dtype": "bfloat16",
+   "total_params": 334808064,
+   "total_params_trainable": 334808064
+ }
config.py ADDED
@@ -0,0 +1,157 @@
+ """
+ GPT-300M Configuration
+ ======================
+ A ~300 million parameter autoregressive transformer language model.
+ Built entirely from scratch — no pretrained weights, no fine-tuning.
+
+ Parameter budget breakdown (with RoPE and weight tying, as configured):
+   - Token Embeddings: vocab_size × d_model = 32,000 × 1,024 = 32.8M
+   - Position Embeddings: none with RoPE (a learned table would add
+     max_seq_len × d_model = 2,048 × 1,024 = 2.1M)
+   - Transformer Layers (×24):
+       - Multi-Head Attention (Q/K/V/O): 4 × d_model² = 4 × 1,048,576 = 4.2M each
+       - Feed-Forward Network: 2 × d_model × d_ff = 2 × 1,024 × 4,096 = 8.4M each
+       - RMSNorms: negligible
+       - Per layer total: ~12.6M
+       - All 24 layers: ~302M
+   - Final RMSNorm + LM Head (tied with embeddings): ~0 extra
+   ─────────────────────────────────────────────────────
+   TOTAL: ~334.8M parameters (334,808,064 exactly; the tied LM head adds none)
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+ import json
+ import os
+
+
+ @dataclass
+ class GPT300MConfig:
+     """Configuration for a ~300M parameter GPT model."""
+
+     # ── Model Architecture ──────────────────────────────────────────────
+     vocab_size: int = 32_000       # BPE vocabulary size
+     max_seq_len: int = 2_048       # Maximum sequence length (context window)
+     d_model: int = 1_024           # Hidden dimension / embedding size
+     n_heads: int = 16              # Number of attention heads
+     n_layers: int = 24             # Number of transformer blocks
+     d_ff: int = 4_096              # Feed-forward intermediate dimension
+     dropout: float = 0.1           # Dropout probability
+     bias: bool = False             # Use bias in linear layers (modern GPTs skip this)
+     tie_weights: bool = True       # Tie token embedding and LM head weights
+     activation: str = "gelu"       # Activation function: "gelu" or "swiglu"
+     norm_eps: float = 1e-5         # RMSNorm epsilon
+     rope: bool = True              # Use Rotary Position Embeddings (RoPE)
+     rope_theta: float = 10_000.0   # RoPE base frequency
+
+     # ── Training Hyperparameters ────────────────────────────────────────
+     batch_size: int = 32                   # Micro-batch size per GPU
+     gradient_accumulation_steps: int = 8   # Effective batch = batch_size × grad_accum × n_gpus
+     learning_rate: float = 3e-4            # Peak learning rate
+     min_learning_rate: float = 3e-5        # Minimum LR after cosine decay
+     weight_decay: float = 0.1              # AdamW weight decay
+     beta1: float = 0.9                     # Adam beta1
+     beta2: float = 0.95                    # Adam beta2
+     max_grad_norm: float = 1.0             # Gradient clipping norm
+     warmup_steps: int = 2_000              # Linear warmup steps
+     max_steps: int = 600_000               # Total training steps
+     eval_interval: int = 1_000             # Evaluate every N steps
+     save_interval: int = 5_000             # Save checkpoint every N steps
+     log_interval: int = 10                 # Log metrics every N steps
+
+     # ── Data ────────────────────────────────────────────────────────────
+     data_dir: str = "./data"       # Directory containing tokenized .bin shards
+     train_split: float = 0.98      # Train/val split ratio
+     num_workers: int = 4           # DataLoader workers
+
+     # ── System ──────────────────────────────────────────────────────────
+     device: str = "auto"                # "auto", "cuda", "cpu", "mps"
+     dtype: str = "bfloat16"             # "float32", "float16", "bfloat16"
+     compile_model: bool = True          # Use torch.compile (PyTorch 2.0+)
+     output_dir: str = "./checkpoints"   # Where to save checkpoints
+     wandb_project: str = "gpt-300m"     # Weights & Biases project name
+     wandb_run_name: Optional[str] = None
+     seed: int = 42
+
+     # ── Chat / Inference ────────────────────────────────────────────────
+     temperature: float = 0.7            # Sampling temperature
+     top_k: int = 50                     # Top-k sampling
+     top_p: float = 0.9                  # Nucleus sampling threshold
+     max_new_tokens: int = 512           # Max tokens to generate per turn
+     repetition_penalty: float = 1.1     # Penalize repeated tokens
+     chat_template: str = (
+         "<|system|>{system}<|end|>"
+         "<|user|>{user}<|end|>"
+         "<|assistant|>"
+     )
+     system_prompt: str = (
+         "You are a helpful, harmless, and honest AI assistant. "
+         "Respond naturally and conversationally."
+     )
+
+     # ── Special Token IDs (set during tokenizer init) ───────────────────
+     pad_token_id: int = 0
+     bos_token_id: int = 1
+     eos_token_id: int = 2
+
+     @property
+     def head_dim(self) -> int:
+         assert self.d_model % self.n_heads == 0
+         return self.d_model // self.n_heads
+
+     @property
+     def total_params_estimate(self) -> int:
+         emb = self.vocab_size * self.d_model
+         pos = self.max_seq_len * self.d_model if not self.rope else 0
+         attn = 4 * self.d_model * self.d_model * self.n_layers
+         ffn = 2 * self.d_model * self.d_ff * self.n_layers
+         ln = 2 * self.d_model * self.n_layers + self.d_model
+         tied = 0 if self.tie_weights else self.vocab_size * self.d_model
+         return emb + pos + attn + ffn + ln + tied
+
+     def save(self, path: str):
+         os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+         with open(path, "w") as f:
+             json.dump(self.__dict__, f, indent=2)
+
+     @classmethod
+     def load(cls, path: str) -> "GPT300MConfig":
+         with open(path) as f:
+             return cls(**json.load(f))
+
+     def __post_init__(self):
+         assert self.d_model % self.n_heads == 0, (
+             f"d_model ({self.d_model}) must be divisible by n_heads ({self.n_heads})"
+         )
+
+
+ # ── Preset Configs ──────────────────────────────────────────────────────
+
+ def gpt_300m() -> GPT300MConfig:
+     """Default 300M config."""
+     return GPT300MConfig()
+
+ def gpt_125m() -> GPT300MConfig:
+     """Smaller 125M config for testing."""
+     return GPT300MConfig(
+         d_model=768, n_heads=12, n_layers=12, d_ff=3072,
+         max_seq_len=1024, batch_size=64
+     )
+
+ def gpt_tiny() -> GPT300MConfig:
+     """Tiny config for debugging."""
+     return GPT300MConfig(
+         d_model=128, n_heads=4, n_layers=4, d_ff=512,
+         vocab_size=1000, max_seq_len=256, batch_size=8
+     )
+
+
+ if __name__ == "__main__":
+     cfg = gpt_300m()
+     print("GPT-300M Configuration")
+     print(f"  Estimated parameters: {cfg.total_params_estimate:,}")
+     print(f"  d_model:     {cfg.d_model}")
+     print(f"  n_heads:     {cfg.n_heads}")
+     print(f"  n_layers:    {cfg.n_layers}")
+     print(f"  d_ff:        {cfg.d_ff}")
+     print(f"  vocab_size:  {cfg.vocab_size}")
+     print(f"  max_seq_len: {cfg.max_seq_len}")
dataset.py ADDED
@@ -0,0 +1,269 @@
+ """
+ Dataset & DataLoader for GPT-300M
+ ==================================
+ Handles loading, tokenizing, and batching text data for training.
+
+ Supports two modes:
+   1. Pre-tokenized binary shards (fast, for large-scale training)
+   2. Raw text files (convenient, for small datasets)
+ """
+
+ import glob
+ import os
+ import random
+ from typing import List, Optional
+
+ import numpy as np
+ import torch
+ from torch.utils.data import Dataset, DataLoader, IterableDataset
+
+ from config import GPT300MConfig
+
+
+ class TextDataset(Dataset):
+     """
+     Simple dataset that loads raw text, tokenizes it, and creates
+     fixed-length training sequences.
+     """
+
+     def __init__(
+         self,
+         text: str,
+         tokenizer,
+         seq_len: int,
+         stride: Optional[int] = None,
+     ):
+         """
+         Args:
+             text: Raw text data
+             tokenizer: BPETokenizer instance
+             seq_len: Sequence length for training
+             stride: Sliding window stride (default: seq_len // 2)
+         """
+         self.seq_len = seq_len
+         self.stride = stride or seq_len // 2
+
+         # Tokenize the entire text
+         self.token_ids = tokenizer.encode(text, add_special_tokens=False)
+         self.token_ids = torch.tensor(self.token_ids, dtype=torch.long)
+
+         # Calculate number of sequences
+         self.n_sequences = max(0, (len(self.token_ids) - seq_len - 1) // self.stride + 1)
+
+     def __len__(self):
+         return self.n_sequences
+
+     def __getitem__(self, idx):
+         start = idx * self.stride
+         end = start + self.seq_len + 1  # +1 for target offset
+         chunk = self.token_ids[start:end]
+
+         x = chunk[:-1]  # Input: tokens[0..seq_len-1]
+         y = chunk[1:]   # Target: tokens[1..seq_len]
+         return x, y
+
+
+ class ChatDataset(Dataset):
+     """
+     Dataset for chat/conversation data.
+     Each sample is a multi-turn conversation formatted with special tokens.
+     """
+
+     def __init__(
+         self,
+         conversations: List[List[dict]],
+         tokenizer,
+         max_seq_len: int,
+     ):
+         """
+         Args:
+             conversations: List of conversations, each a list of
+                 {"role": "user"|"assistant"|"system", "content": "..."}
+             tokenizer: BPETokenizer instance
+             max_seq_len: Maximum sequence length
+         """
+         self.max_seq_len = max_seq_len
+         self.samples = []
+
+         for conv in conversations:
+             ids = tokenizer.encode_chat(conv, add_generation_prompt=False)
+             ids.append(tokenizer.special_tokens["<eos>"])
+
+             # Truncate if needed
+             if len(ids) > max_seq_len + 1:
+                 ids = ids[:max_seq_len + 1]
+
+             if len(ids) >= 4:  # Minimum meaningful length
+                 self.samples.append(torch.tensor(ids, dtype=torch.long))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         tokens = self.samples[idx]
+         x = tokens[:-1]
+         y = tokens[1:]
+         return x, y
+
+
+ class ShardedDataset(IterableDataset):
+     """
+     Efficient iterable dataset that streams from pre-tokenized binary shards.
+     Used for large-scale training where data doesn't fit in memory.
+     """
+
+     def __init__(
+         self,
+         data_dir: str,
+         seq_len: int,
+         split: str = "train",
+         seed: int = 42,
+     ):
+         super().__init__()
+         self.seq_len = seq_len
+         self.seed = seed
+
+         # Find shard files
+         pattern = os.path.join(data_dir, f"{split}_*.bin")
+         self.shards = sorted(glob.glob(pattern))
+         if not self.shards:
+             raise FileNotFoundError(f"No shards found matching: {pattern}")
+
+         print(f"Found {len(self.shards)} {split} shards")
+
+     def __iter__(self):
+         rng = random.Random(self.seed)
+         shards = list(self.shards)
+         rng.shuffle(shards)
+
+         for shard_path in shards:
+             # Memory-map the shard for efficiency
+             data = np.memmap(shard_path, dtype=np.uint16, mode="r")
+             n_tokens = len(data)
+             n_chunks = n_tokens // (self.seq_len + 1)
+
+             # Random order within shard
+             indices = list(range(n_chunks))
+             rng.shuffle(indices)
+
+             for idx in indices:
+                 start = idx * (self.seq_len + 1)
+                 chunk = torch.from_numpy(
+                     data[start : start + self.seq_len + 1].astype(np.int64)
+                 )
+                 x = chunk[:-1]
+                 y = chunk[1:]
+                 yield x, y
+
+
+ def collate_fn(batch, pad_id: int = 0):
+     """
+     Collate function that pads sequences to the same length within a batch.
+     """
+     xs, ys = zip(*batch)
+     max_len = max(x.size(0) for x in xs)
+
+     padded_x = torch.full((len(xs), max_len), pad_id, dtype=torch.long)
+     padded_y = torch.full((len(ys), max_len), pad_id, dtype=torch.long)
+
+     for i, (x, y) in enumerate(zip(xs, ys)):
+         padded_x[i, :x.size(0)] = x
+         padded_y[i, :y.size(0)] = y
+
+     return padded_x, padded_y
+
+
+ def create_dataloaders(
+     config: GPT300MConfig,
+     tokenizer,
+     text: Optional[str] = None,
+     conversations: Optional[List[List[dict]]] = None,
+ ) -> tuple:
+     """
+     Create train and validation DataLoaders.
+
+     Supply either `text` for raw text training or `conversations` for chat training.
+     """
+     if text is not None:
+         # Split into train/val
+         split = int(len(text) * config.train_split)
+         train_text = text[:split]
+         val_text = text[split:]
+
+         train_ds = TextDataset(train_text, tokenizer, config.max_seq_len)
+         val_ds = TextDataset(val_text, tokenizer, config.max_seq_len)
+
+     elif conversations is not None:
+         split = int(len(conversations) * config.train_split)
+         train_convs = conversations[:split]
+         val_convs = conversations[split:]
+
+         train_ds = ChatDataset(train_convs, tokenizer, config.max_seq_len)
+         val_ds = ChatDataset(val_convs, tokenizer, config.max_seq_len)
+     else:
+         raise ValueError("Provide either `text` or `conversations`")
+
+     train_dl = DataLoader(
+         train_ds,
+         batch_size=config.batch_size,
+         shuffle=True,
+         collate_fn=lambda b: collate_fn(b, config.pad_token_id),
+         num_workers=config.num_workers,
+         pin_memory=True,
+         drop_last=True,
+     )
+
+     val_dl = DataLoader(
+         val_ds,
+         batch_size=config.batch_size,
+         shuffle=False,
+         collate_fn=lambda b: collate_fn(b, config.pad_token_id),
+         num_workers=config.num_workers,
+         pin_memory=True,
+     )
+
+     return train_dl, val_dl
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # UTILITIES: Tokenize and save to binary shards
+ # ═══════════════════════════════════════════════════════════════════════
+
+ def tokenize_to_shards(
+     text: str,
+     tokenizer,
+     output_dir: str,
+     shard_size: int = 100_000_000,  # ~100M tokens per shard
+     split: str = "train",
+ ):
+     """
+     Tokenize text and save to binary shards for efficient loading.
+     """
+     os.makedirs(output_dir, exist_ok=True)
+     tokens = tokenizer.encode(text, add_special_tokens=False)
+
+     shard_idx = 0
+     for start in range(0, len(tokens), shard_size):
+         end = min(start + shard_size, len(tokens))
+         chunk = np.array(tokens[start:end], dtype=np.uint16)
+         path = os.path.join(output_dir, f"{split}_{shard_idx:04d}.bin")
+         chunk.tofile(path)
+         shard_idx += 1
+
+     print(f"Saved {shard_idx} shards ({len(tokens):,} tokens) to {output_dir}")
+
+
+ if __name__ == "__main__":
+     from tokenizer import BPETokenizer
+
+     # Quick test with synthetic data
+     tok = BPETokenizer(vocab_size=500)
+     sample_text = "Hello world! " * 1000
+     tok.train(sample_text)
+
+     ds = TextDataset(sample_text, tok, seq_len=64)
+     print(f"Dataset: {len(ds)} sequences of length 64")
+
+     x, y = ds[0]
+     print(f"Sample x: {x[:10]}")
+     print(f"Sample y: {y[:10]}")
model.py ADDED
@@ -0,0 +1,513 @@
+ """
+ GPT-300M Model Architecture
+ ============================
+ A decoder-only transformer built entirely from scratch in PyTorch.
+
+ Architecture features:
+   - Pre-norm transformer blocks (RMSNorm)
+   - Rotary Position Embeddings (RoPE)
+   - Multi-Head Self-Attention with causal masking
+   - GELU activation in feed-forward layers
+   - Optional weight tying (token embeddings ↔ LM head)
+   - KV-Cache for efficient autoregressive generation
+   - Flash Attention support (PyTorch 2.0+)
+ """
+
+ import math
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from config import GPT300MConfig
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # ROTARY POSITION EMBEDDINGS (RoPE)
+ # ═══════════════════════════════════════════════════════════════════════
+
+ class RotaryEmbedding(nn.Module):
+     """Rotary Position Embedding (Su et al., 2021)."""
+
+     def __init__(self, dim: int, max_seq_len: int = 2048, theta: float = 10000.0):
+         super().__init__()
+         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+         # Precompute cos/sin tables
+         t = torch.arange(max_seq_len, dtype=torch.float32)
+         freqs = torch.outer(t, inv_freq)
+         emb = torch.cat([freqs, freqs], dim=-1)
+         self.register_buffer("cos_cached", emb.cos(), persistent=False)
+         self.register_buffer("sin_cached", emb.sin(), persistent=False)
+
+     def forward(self, seq_len: int, offset: int = 0):
+         return (
+             self.cos_cached[offset : offset + seq_len],
+             self.sin_cached[offset : offset + seq_len],
+         )
+
+
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
+     """Rotate the second half of the last dimension."""
+     x1, x2 = x.chunk(2, dim=-1)
+     return torch.cat([-x2, x1], dim=-1)
+
+
+ def apply_rotary_emb(
+     q: torch.Tensor, k: torch.Tensor,
+     cos: torch.Tensor, sin: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """Apply rotary embeddings to query and key tensors."""
+     # cos/sin shape: [seq_len, head_dim] → [1, 1, seq_len, head_dim]
+     cos = cos.unsqueeze(0).unsqueeze(0)
+     sin = sin.unsqueeze(0).unsqueeze(0)
+     q_rot = q * cos + rotate_half(q) * sin
+     k_rot = k * cos + rotate_half(k) * sin
+     return q_rot, k_rot
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # RMSNORM (faster alternative to LayerNorm)
+ # ═══════════════════════════════════════════════════════════════════════
+
+ class RMSNorm(nn.Module):
+     """Root Mean Square Layer Normalization."""
+
+     def __init__(self, dim: int, eps: float = 1e-5):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         norm = x.float().pow(2).mean(-1, keepdim=True).add(self.eps).rsqrt()
+         return (x.float() * norm).type_as(x) * self.weight
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # MULTI-HEAD SELF-ATTENTION
+ # ═══════════════════════════════════════════════════════════════════════
+
+ class MultiHeadAttention(nn.Module):
+     """Multi-Head Self-Attention with causal masking and optional KV-cache."""
+
+     def __init__(self, config: GPT300MConfig):
+         super().__init__()
+         self.n_heads = config.n_heads
+         self.head_dim = config.head_dim
+         self.d_model = config.d_model
+         self.dropout = config.dropout
+
+         # Q, K, V projections (fused for efficiency)
+         self.qkv_proj = nn.Linear(config.d_model, 3 * config.d_model, bias=config.bias)
+         # Output projection
+         self.out_proj = nn.Linear(config.d_model, config.d_model, bias=config.bias)
+
+         self.attn_dropout = nn.Dropout(config.dropout)
+         self.resid_dropout = nn.Dropout(config.dropout)
+
+         # Check for Flash Attention support
+         self.flash_attn = hasattr(F, "scaled_dot_product_attention")
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         cos: Optional[torch.Tensor] = None,
+         sin: Optional[torch.Tensor] = None,
+         mask: Optional[torch.Tensor] = None,
+         kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         use_cache: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+         B, T, C = x.shape
+
+         # Project to Q, K, V
+         qkv = self.qkv_proj(x)
+         q, k, v = qkv.split(self.d_model, dim=-1)
+
+         # Reshape: [B, T, n_heads, head_dim] → [B, n_heads, T, head_dim]
+         q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+         k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+         v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+
+         # Apply RoPE
+         if cos is not None and sin is not None:
+             q, k = apply_rotary_emb(q, k, cos, sin)
+
+         # KV-Cache for generation
+         if kv_cache is not None:
+             k_prev, v_prev = kv_cache
+             k = torch.cat([k_prev, k], dim=2)
+             v = torch.cat([v_prev, v], dim=2)
+
+         new_cache = (k, v) if use_cache else None
+
+         # Attention
+         if self.flash_attn and not use_cache:
+             # Use PyTorch's efficient SDPA
+             attn_out = F.scaled_dot_product_attention(
+                 q, k, v,
+                 attn_mask=mask,
+                 dropout_p=self.dropout if self.training else 0.0,
+                 is_causal=True if mask is None else False,
+             )
+         else:
+             # Manual attention for compatibility / KV-cache
+             scale = 1.0 / math.sqrt(self.head_dim)
+             scores = torch.matmul(q, k.transpose(-2, -1)) * scale
+
+             if mask is not None:
+                 scores = scores.masked_fill(mask == 0, float("-inf"))
+             else:
+                 # Causal mask
+                 T_q, T_k = q.size(2), k.size(2)
+                 causal = torch.tril(torch.ones(T_q, T_k, device=x.device, dtype=torch.bool))
+                 # For KV-cache, the causal mask must align with key length
+                 causal = causal[-T:, :]  # last T rows
+                 scores = scores.masked_fill(~causal.unsqueeze(0).unsqueeze(0), float("-inf"))
+
+             attn_weights = F.softmax(scores, dim=-1)
+             attn_weights = self.attn_dropout(attn_weights)
+             attn_out = torch.matmul(attn_weights, v)
+
+         # Reshape back and project
+         attn_out = attn_out.transpose(1, 2).contiguous().view(B, -1, self.d_model)
+         out = self.resid_dropout(self.out_proj(attn_out))
+
+         return out, new_cache
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # FEED-FORWARD NETWORK
+ # ═══════════════════════════════════════════════════════════════════════
+
+ class FeedForward(nn.Module):
+     """Position-wise Feed-Forward Network with GELU activation."""
+
+     def __init__(self, config: GPT300MConfig):
+         super().__init__()
+         self.up_proj = nn.Linear(config.d_model, config.d_ff, bias=config.bias)
+         self.down_proj = nn.Linear(config.d_ff, config.d_model, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+
+         if config.activation == "gelu":
+             self.act = nn.GELU()
+         elif config.activation == "swiglu":
+             self.gate_proj = nn.Linear(config.d_model, config.d_ff, bias=config.bias)
+             self.act = nn.SiLU()
+         else:
+             raise ValueError(f"Unknown activation: {config.activation}")
+
+         self.use_swiglu = config.activation == "swiglu"
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         if self.use_swiglu:
+             return self.dropout(self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x)))
+         else:
+             return self.dropout(self.down_proj(self.act(self.up_proj(x))))
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # TRANSFORMER BLOCK
+ # ═══════════════════════════════════════════════════════════════════════
+
+ class TransformerBlock(nn.Module):
+     """Pre-norm Transformer block: RMSNorm → Attention → Residual → RMSNorm → FFN → Residual."""
+
+     def __init__(self, config: GPT300MConfig, layer_idx: int):
+         super().__init__()
+         self.layer_idx = layer_idx
+         self.ln1 = RMSNorm(config.d_model, eps=config.norm_eps)
+         self.attn = MultiHeadAttention(config)
+         self.ln2 = RMSNorm(config.d_model, eps=config.norm_eps)
+         self.ffn = FeedForward(config)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         cos: Optional[torch.Tensor] = None,
+         sin: Optional[torch.Tensor] = None,
+         mask: Optional[torch.Tensor] = None,
+         kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         use_cache: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+         # Pre-norm attention with residual
+         residual = x
+         x = self.ln1(x)
+         attn_out, new_cache = self.attn(x, cos, sin, mask, kv_cache, use_cache)
+         x = residual + attn_out
+
+         # Pre-norm FFN with residual
+         x = x + self.ffn(self.ln2(x))
+
+         return x, new_cache
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # GPT-300M: THE FULL MODEL
+ # ═══════════════════════════════════════════════════════════════════════
+
+ class GPT300M(nn.Module):
+     """
+     GPT-300M: A 300-million parameter autoregressive language model.
+
+     Architecture:
+         Token Embedding → [Transformer Block × 24] → RMSNorm → LM Head
+
+     Each Transformer Block:
+         RMSNorm → Multi-Head Attention (+ RoPE) → Residual
+         → RMSNorm → Feed-Forward (GELU) → Residual
+     """
+
+     def __init__(self, config: GPT300MConfig):
+         super().__init__()
+         self.config = config
+
+         # ── Embeddings ───────────────────────────────────────────────
+         self.token_emb = nn.Embedding(config.vocab_size, config.d_model)
+         self.drop = nn.Dropout(config.dropout)
+
+         # Rotary embeddings
+         if config.rope:
+             self.rotary = RotaryEmbedding(
+                 config.head_dim, config.max_seq_len, config.rope_theta
+             )
+         else:
+             self.pos_emb = nn.Embedding(config.max_seq_len, config.d_model)
+
+         # ── Transformer Blocks ───────────────────────────────────────
+         self.layers = nn.ModuleList([
+             TransformerBlock(config, layer_idx=i)
+             for i in range(config.n_layers)
+         ])
+
+         # ── Output ───────────────────────────────────────────────────
+         self.ln_f = RMSNorm(config.d_model, eps=config.norm_eps)
+         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+         # Weight tying
+         if config.tie_weights:
+             self.lm_head.weight = self.token_emb.weight
+
+         # Initialize weights
+         self.apply(self._init_weights)
+         # Scale residual projections
+         for pn, p in self.named_parameters():
+             if pn.endswith("out_proj.weight") or pn.endswith("down_proj.weight"):
+                 nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layers))
+
+     def _init_weights(self, module: nn.Module):
+         if isinstance(module, nn.Linear):
+             nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         targets: Optional[torch.Tensor] = None,
+         kv_caches: Optional[list] = None,
+         use_cache: bool = False,
+         position_offset: int = 0,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[list]]:
+         """
+         Forward pass.
+
+         Args:
+             input_ids: [B, T] token indices
+             targets: [B, T] target token indices for loss computation
+             kv_caches: List of KV-cache tuples, one per layer
+             use_cache: Whether to return updated KV-caches
+             position_offset: Offset for position embeddings (for KV-cache generation)
+
+         Returns:
+             logits: [B, T, vocab_size]
+             loss: scalar loss if targets provided, else None
+             new_caches: Updated KV-caches if use_cache=True
+         """
+         B, T = input_ids.shape
+         assert T <= self.config.max_seq_len, (
+             f"Sequence length {T} exceeds max {self.config.max_seq_len}"
+         )
+
+         # Token embeddings
+         x = self.token_emb(input_ids)  # [B, T, d_model]
+
+         # Position information
+         if self.config.rope:
+             cos, sin = self.rotary(T, offset=position_offset)
+         else:
+             positions = torch.arange(position_offset, position_offset + T, device=input_ids.device)
+             x = x + self.pos_emb(positions)
+             cos, sin = None, None
+
+         x = self.drop(x)
+
+         # Transformer blocks
+         new_caches = [] if use_cache else None
+         for i, layer in enumerate(self.layers):
+             cache_i = kv_caches[i] if kv_caches is not None else None
+             x, new_cache = layer(x, cos, sin, kv_cache=cache_i, use_cache=use_cache)
+             if use_cache:
+                 new_caches.append(new_cache)
+
+         # Final norm and LM head
+         x = self.ln_f(x)
+         logits = self.lm_head(x)  # [B, T, vocab_size]
+
+         # Loss
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(
+                 logits.view(-1, self.config.vocab_size),
+                 targets.view(-1),
+                 ignore_index=self.config.pad_token_id,
+             )
+
+         return logits, loss, new_caches
+
+     @torch.no_grad()
+     def generate(
+         self,
+         input_ids: torch.Tensor,
+         max_new_tokens: int = 256,
+         temperature: float = 0.7,
+         top_k: int = 50,
+         top_p: float = 0.9,
+         repetition_penalty: float = 1.1,
+         eos_token_id: Optional[int] = None,
+     ) -> torch.Tensor:
+         """
+         Autoregressive generation with KV-cache.
+
+         Args:
+             input_ids: [B, T] prompt token IDs
+             max_new_tokens: Maximum number of tokens to generate
+             temperature: Sampling temperature
+             top_k: Top-k sampling
+             top_p: Nucleus sampling threshold
+             repetition_penalty: Penalty for repeated tokens
+             eos_token_id: Stop generation when this token is produced
+
+         Returns:
+             [B, T + max_new_tokens] generated token IDs
+         """
+         self.eval()
+         B, T = input_ids.shape
+         device = input_ids.device
+
+         # Initial forward pass to populate KV-cache
+         logits, _, kv_caches = self.forward(input_ids, use_cache=True)
+
+         generated = input_ids
+         all_token_ids = input_ids.tolist()[0] if B == 1 else []
+
+         for step in range(max_new_tokens):
+             # Get logits for the last token
+             next_logits = logits[:, -1, :]  # [B, vocab_size]
+
+             # Repetition penalty
+             if repetition_penalty != 1.0 and B == 1:
+                 for token_id in set(all_token_ids):
+                     if next_logits[0, token_id] > 0:
+                         next_logits[0, token_id] /= repetition_penalty
+                     else:
+                         next_logits[0, token_id] *= repetition_penalty
+
+             # Temperature
+             if temperature > 0:
+                 next_logits = next_logits / temperature
+
+                 # Top-k filtering
+                 if top_k > 0:
+                     topk_vals, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
+                     next_logits[next_logits < topk_vals[:, -1:]] = float("-inf")
+
+                 # Top-p (nucleus) filtering
+                 if top_p < 1.0:
+                     sorted_logits, sorted_idx = torch.sort(next_logits, descending=True)
+                     cumprobs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                     # Shifted mask so the first token above the threshold is kept
+                     sorted_mask = cumprobs - F.softmax(sorted_logits, dim=-1) >= top_p
+                     sorted_logits[sorted_mask] = float("-inf")
+                     # Un-sort back to vocabulary order (sorted_idx is a full
+                     # permutation, so every position is overwritten)
+                     next_logits = sorted_logits.scatter(1, sorted_idx, sorted_logits)
+
+                 probs = F.softmax(next_logits, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+             else:
+                 # Greedy
+                 next_token = next_logits.argmax(dim=-1, keepdim=True)
+
+             generated = torch.cat([generated, next_token], dim=1)
+
+             if B == 1:
+                 all_token_ids.append(next_token.item())
+
+             # Stop on EOS
+             if eos_token_id is not None and next_token.item() == eos_token_id:
+                 break
+
+             # Forward pass with KV-cache (only the new token)
+             position_offset = generated.size(1) - 1
+             logits, _, kv_caches = self.forward(
+                 next_token,
+                 kv_caches=kv_caches,
+                 use_cache=True,
+                 position_offset=position_offset,
+             )
+
+         return generated
+
+     def count_parameters(self, trainable_only: bool = True) -> int:
+         """Count model parameters."""
+         if trainable_only:
+             return sum(p.numel() for p in self.parameters() if p.requires_grad)
+         return sum(p.numel() for p in self.parameters())
+
+     def model_summary(self) -> str:
+         """Return a human-readable model summary."""
+         total = self.count_parameters(trainable_only=False)
+         trainable = self.count_parameters(trainable_only=True)
+         lines = [
+             "=" * 60,
+             " GPT-300M Model Summary",
+             "=" * 60,
+             f" Total parameters:     {total:>15,}",
+             f" Trainable parameters: {trainable:>15,}",
+             f" d_model:              {self.config.d_model:>15}",
+             f" n_heads:              {self.config.n_heads:>15}",
+             f" n_layers:             {self.config.n_layers:>15}",
+             f" d_ff:                 {self.config.d_ff:>15}",
+             f" vocab_size:           {self.config.vocab_size:>15}",
+             f" max_seq_len:          {self.config.max_seq_len:>15}",
+             f" RoPE:                 {'Yes':>15}",
+             f" Weight tying:         {'Yes' if self.config.tie_weights else 'No':>15}",
+             f" Flash Attention:      {'Yes' if self.layers[0].attn.flash_attn else 'No':>15}",
+             "=" * 60,
+         ]
+         return "\n".join(lines)
+
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # QUICK TEST
+ # ═══════════════════════════════════════════════════════════════════════
+
+ if __name__ == "__main__":
+     from config import gpt_tiny
+
+     # Use tiny config for testing
+     cfg = gpt_tiny()
+     model = GPT300M(cfg)
+     print(model.model_summary())
+
+     # Test forward pass
+     x = torch.randint(0, cfg.vocab_size, (2, 32))
+     targets = torch.randint(0, cfg.vocab_size, (2, 32))
+     logits, loss, _ = model(x, targets=targets)
+     print(f"\nForward pass OK: logits={logits.shape}, loss={loss.item():.4f}")
+
+     # Test generation
+     prompt = torch.randint(0, cfg.vocab_size, (1, 8))
+     gen = model.generate(prompt, max_new_tokens=16, temperature=0.8)
+     print(f"Generation OK: {gen.shape}")
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ torch>=2.0.0
+ numpy>=1.24.0
+ matplotlib>=3.7.0
special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "pad_token": "<pad>",
+   "bos_token": "<bos>",
+   "eos_token": "<eos>",
+   "unk_token": "<unk>",
+   "additional_special_tokens": [
+     "<|system|>",
+     "<|user|>",
+     "<|assistant|>",
+     "<|end|>"
+   ]
+ }
tokenizer.py ADDED
@@ -0,0 +1,344 @@
1
+ """
2
+ Byte-Pair Encoding (BPE) Tokenizer β€” Built From Scratch
3
+ ========================================================
4
+ A minimal but complete BPE tokenizer implementation.
5
+ Supports training from raw text, encoding/decoding, and special chat tokens.
6
+
7
+ For production use, you'd typically use SentencePiece or tiktoken,
8
+ but this demonstrates the full tokenizer pipeline.
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import re
14
+ from collections import Counter
15
+ from typing import Dict, List, Optional, Tuple
16
+
17
+
18
+ class BPETokenizer:
19
+ """
20
+ Byte-Pair Encoding tokenizer with special token support.
21
+
22
+ Special tokens:
23
+ <pad> = 0 Padding token
24
+ <bos> = 1 Beginning of sequence
25
+ <eos> = 2 End of sequence
26
+ <unk> = 3 Unknown token
27
+ <|system|> = 4 System prompt delimiter
28
+ <|user|> = 5 User turn delimiter
29
+ <|assistant|> = 6 Assistant turn delimiter
30
+ <|end|> = 7 End of turn
31
+ """
32
+
33
+ SPECIAL_TOKENS = {
34
+ "<pad>": 0,
35
+ "<bos>": 1,
36
+ "<eos>": 2,
37
+ "<unk>": 3,
38
+ "<|system|>": 4,
39
+ "<|user|>": 5,
40
+ "<|assistant|>": 6,
41
+ "<|end|>": 7,
42
+ }
43
+
44
+ # Pre-tokenization regex (GPT-2 style; \w stands in for \p{L}/\p{N}, which re lacks)
45
+ PAT = re.compile(
46
+ r"""'s|'t|'re|'ve|'m|'ll|'d| ?\w+| ?\d+| ?[^\s\w\d]+|\s+(?!\S)|\s+""",
47
+ re.UNICODE,
48
+ )
49
+
50
+ def __init__(self, vocab_size: int = 32_000):
51
+ self.target_vocab_size = vocab_size
52
+ self.special_tokens = dict(self.SPECIAL_TOKENS)
53
+ self.num_special = len(self.special_tokens)
54
+
55
+ # Byte-level base vocab: map each byte (0-255) to a token ID
56
+ self.byte_to_id: Dict[int, int] = {
57
+ b: b + self.num_special for b in range(256)
58
+ }
59
+ self.id_to_byte: Dict[int, int] = {v: k for k, v in self.byte_to_id.items()}
60
+
61
+ # Merge rules learned during training
62
+ self.merges: List[Tuple[int, int]] = []
63
+ self.merge_to_id: Dict[Tuple[int, int], int] = {}
64
+
65
+ # Full vocab (built after training)
66
+ self.vocab: Dict[int, bytes] = {}
67
+ self._build_vocab()
68
+
69
+ def _build_vocab(self):
70
+ """Reconstruct the full vocabulary from merges."""
71
+ self.vocab = {}
72
+ # Special tokens
73
+ for tok, idx in self.special_tokens.items():
74
+ self.vocab[idx] = tok.encode("utf-8")
75
+ # Byte-level tokens
76
+ for b in range(256):
77
+ self.vocab[self.num_special + b] = bytes([b])
78
+ # Merged tokens
79
+ for (a, b), idx in self.merge_to_id.items():
80
+ self.vocab[idx] = self.vocab[a] + self.vocab[b]
81
+
82
+ @property
83
+ def vocab_size(self) -> int:
84
+ return len(self.vocab)
85
+
86
+ # ── Training ────────────────────────────────────────────────────
87
+
88
+ def train(self, text: str, verbose: bool = True):
89
+ """
90
+ Train BPE merges from raw text.
91
+
92
+ Args:
93
+ text: Raw training text
94
+ verbose: Print progress
95
+ """
96
+ if verbose:
97
+ print(f"Training BPE tokenizer (target vocab: {self.target_vocab_size:,})...")
98
+
99
+ # Pre-tokenize into words
100
+ words = re.findall(self.PAT, text)
101
+
102
+ # Convert each word to a tuple of byte token IDs
103
+ word_freqs: Counter = Counter()
104
+ for word in words:
105
+ byte_ids = tuple(self.byte_to_id[b] for b in word.encode("utf-8"))
106
+ word_freqs[byte_ids] += 1
107
+
108
+ current_vocab_size = self.num_special + 256
109
+ num_merges = self.target_vocab_size - current_vocab_size
110
+
111
+ for i in range(num_merges):
112
+ # Count adjacent pairs
113
+ pair_counts: Counter = Counter()
114
+ for word, freq in word_freqs.items():
115
+ for j in range(len(word) - 1):
116
+ pair_counts[(word[j], word[j + 1])] += freq
117
+
118
+ if not pair_counts:
119
+ break
120
+
121
+ # Find most frequent pair
122
+ best_pair = pair_counts.most_common(1)[0][0]
123
+ new_id = current_vocab_size
124
+
125
+ # Register merge
126
+ self.merges.append(best_pair)
127
+ self.merge_to_id[best_pair] = new_id
128
+
129
+ # Apply merge to all words
130
+ new_word_freqs: Counter = Counter()
131
+ for word, freq in word_freqs.items():
132
+ new_word = self._apply_merge(word, best_pair, new_id)
133
+ new_word_freqs[new_word] += freq
134
+ word_freqs = new_word_freqs
135
+
136
+ current_vocab_size += 1
137
+
138
+ if verbose and (i + 1) % 1000 == 0:
139
+ print(f" Merge {i + 1}/{num_merges}: "
140
+ f"({best_pair[0]}, {best_pair[1]}) β†’ {new_id}, "
141
+ f"freq={pair_counts[best_pair]}")
142
+
143
+ self._build_vocab()
144
+ if verbose:
145
+ print(f"Done! Final vocab size: {self.vocab_size:,}")
146
+
147
+ @staticmethod
148
+ def _apply_merge(
149
+ word: Tuple[int, ...], pair: Tuple[int, int], new_id: int
150
+ ) -> Tuple[int, ...]:
151
+ """Apply a single merge rule to a word."""
152
+ result = []
153
+ i = 0
154
+ while i < len(word):
155
+ if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
156
+ result.append(new_id)
157
+ i += 2
158
+ else:
159
+ result.append(word[i])
160
+ i += 1
161
+ return tuple(result)
162
+
163
+ # ── Encoding ────────────────────────────────────────────────────
164
+
165
+ def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
166
+ """
167
+ Encode text to token IDs.
168
+
169
+ Args:
170
+ text: Input text
171
+ add_special_tokens: Whether to wrap with <bos>/<eos>
172
+
173
+ Returns:
174
+ List of token IDs
175
+ """
176
+ tokens = []
177
+
178
+ # Check for special tokens in the text
179
+ parts = self._split_special_tokens(text)
180
+
181
+ for part, is_special in parts:
182
+ if is_special:
183
+ tokens.append(self.special_tokens[part])
184
+ else:
185
+ # Pre-tokenize
186
+ words = re.findall(self.PAT, part)
187
+ for word in words:
188
+ # Convert to byte IDs
189
+ byte_ids = list(self.byte_to_id[b] for b in word.encode("utf-8"))
190
+ # Apply merges in order
191
+ for pair, new_id in zip(self.merges, range(self.num_special + 256, self.vocab_size)):
192
+ i = 0
193
+ while i < len(byte_ids) - 1:
194
+ if (byte_ids[i], byte_ids[i + 1]) == pair:
195
+ byte_ids[i] = new_id
196
+ del byte_ids[i + 1]
197
+ else:
198
+ i += 1
199
+ tokens.extend(byte_ids)
200
+
201
+ if add_special_tokens:
202
+ tokens = [self.special_tokens["<bos>"]] + tokens + [self.special_tokens["<eos>"]]
203
+
204
+ return tokens
205
+
206
+ def _split_special_tokens(self, text: str) -> List[Tuple[str, bool]]:
207
+ """Split text on special token boundaries."""
208
+ # Build regex to match special tokens
209
+ pattern = "|".join(re.escape(tok) for tok in self.special_tokens.keys())
210
+ if not pattern:
211
+ return [(text, False)]
212
+
213
+ parts = []
214
+ last_end = 0
215
+ for match in re.finditer(pattern, text):
216
+ if match.start() > last_end:
217
+ parts.append((text[last_end:match.start()], False))
218
+ parts.append((match.group(), True))
219
+ last_end = match.end()
220
+ if last_end < len(text):
221
+ parts.append((text[last_end:], False))
222
+ return parts
223
+
224
+ # ── Decoding ────────────────────────────────────────────────────
225
+
226
+ def decode(self, ids: List[int], skip_special: bool = True) -> str:
227
+ """
228
+ Decode token IDs to text.
229
+
230
+ Args:
231
+ ids: List of token IDs
232
+ skip_special: Whether to skip special tokens
233
+
234
+ Returns:
235
+ Decoded text string
236
+ """
237
+ byte_chunks = []
238
+ for idx in ids:
239
+ if idx in self.special_tokens.values():
240
+ if not skip_special:
241
+ # Find the special token string
242
+ for tok, tid in self.special_tokens.items():
243
+ if tid == idx:
244
+ byte_chunks.append(tok.encode("utf-8"))
245
+ break
246
+ elif idx in self.vocab:
247
+ byte_chunks.append(self.vocab[idx])
248
+ return b"".join(byte_chunks).decode("utf-8", errors="replace")
249
+
250
+ # ── Chat Formatting ─────────────────────────────────────────────
251
+
252
+ def encode_chat(
253
+ self,
254
+ messages: List[Dict[str, str]],
255
+ add_generation_prompt: bool = True,
256
+ ) -> List[int]:
257
+ """
258
+ Encode a chat conversation into token IDs.
259
+
260
+ Args:
261
+ messages: List of {"role": "system"|"user"|"assistant", "content": "..."}
262
+ add_generation_prompt: Add the assistant turn start token at the end
263
+
264
+ Returns:
265
+ List of token IDs
266
+ """
267
+ tokens = [self.special_tokens["<bos>"]]
268
+
269
+ for msg in messages:
270
+ role = msg["role"]
271
+ content = msg["content"]
272
+
273
+ if role == "system":
274
+ tokens.append(self.special_tokens["<|system|>"])
275
+ elif role == "user":
276
+ tokens.append(self.special_tokens["<|user|>"])
277
+ elif role == "assistant":
278
+ tokens.append(self.special_tokens["<|assistant|>"])
279
+
280
+ tokens.extend(self.encode(content))
281
+ tokens.append(self.special_tokens["<|end|>"])
282
+
283
+ if add_generation_prompt:
284
+ tokens.append(self.special_tokens["<|assistant|>"])
285
+
286
+ return tokens
287
+
288
+ # ── Save / Load ─────────────────────────────────────────────────
289
+
290
+ def save(self, path: str):
291
+ """Save tokenizer to JSON."""
292
+ os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
293
+ data = {
294
+ "target_vocab_size": self.target_vocab_size,
295
+ "merges": self.merges,
296
+ }
297
+ with open(path, "w") as f:
298
+ json.dump(data, f)
299
+
300
+ @classmethod
301
+ def load(cls, path: str) -> "BPETokenizer":
302
+ """Load tokenizer from JSON."""
303
+ with open(path) as f:
304
+ data = json.load(f)
305
+ tok = cls(vocab_size=data["target_vocab_size"])
306
+ tok.merges = [tuple(m) for m in data["merges"]]
307
+ tok.merge_to_id = {
308
+ tuple(pair): idx
309
+ for idx, pair in enumerate(tok.merges, start=tok.num_special + 256)
310
+ }
311
+ tok._build_vocab()
312
+ return tok
313
+
314
+
315
+ # ═══════════════════════════════════════════════════════════════════════
316
+ # QUICK TEST
317
+ # ═══════════════════════════════════════════════════════════════════════
318
+
319
+ if __name__ == "__main__":
320
+ tok = BPETokenizer(vocab_size=500)
321
+
322
+ sample = (
323
+ "Hello, world! This is a test of the BPE tokenizer. "
324
+ "The quick brown fox jumps over the lazy dog. "
325
+ "Machine learning is fascinating and powerful. " * 20
326
+ )
327
+
328
+ tok.train(sample, verbose=True)
329
+
330
+ text = "Hello, world! Machine learning is great."
331
+ ids = tok.encode(text)
332
+ decoded = tok.decode(ids)
333
+ print(f"\nOriginal: {text}")
334
+ print(f"Token IDs: {ids[:20]}...")
335
+ print(f"Decoded: {decoded}")
336
+
337
+ # Test chat encoding
338
+ chat = [
339
+ {"role": "system", "content": "You are helpful."},
340
+ {"role": "user", "content": "Hello!"},
341
+ ]
342
+ chat_ids = tok.encode_chat(chat)
343
+ print(f"\nChat IDs: {chat_ids[:20]}...")
344
+ print(f"Chat decoded: {tok.decode(chat_ids, skip_special=False)}")
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "tokenizer_class": "BPETokenizer",
3
+ "model_type": "gpt-300m",
4
+ "vocab_size": 32000,
5
+ "model_max_length": 2048,
6
+ "padding_side": "right",
7
+ "bos_token": "<bos>",
8
+ "eos_token": "<eos>",
9
+ "pad_token": "<pad>",
10
+ "unk_token": "<unk>"
11
+ }
train.py ADDED
@@ -0,0 +1,456 @@
1
+ """
2
+ GPT-300M Training Script
3
+ =========================
4
+ Full training pipeline with:
5
+ - Mixed-precision training (bf16/fp16)
6
+ - Gradient accumulation
7
+ - Cosine learning rate schedule with warmup
8
+ - Gradient clipping
9
+ - Periodic evaluation & checkpointing
10
+ - Distributed Data Parallel (DDP) support
11
+ - Weights & Biases logging
12
+ - torch.compile support
13
+
14
+ Usage:
15
+ # Single GPU
16
+ python train.py
17
+
18
+ # Multi-GPU with DDP
19
+ torchrun --nproc_per_node=4 train.py
20
+
21
+ # With custom config
22
+ python train.py --d_model 768 --n_layers 12 --batch_size 64
23
+ """
24
+
25
+ import argparse
26
+ import math
27
+ import os
28
+ import sys
29
+ import time
30
+ from contextlib import nullcontext
31
+ from typing import Optional
32
+
33
+ import torch
34
+ import torch.nn as nn
35
+ import torch.distributed as dist
36
+ from torch.nn.parallel import DistributedDataParallel as DDP
37
+
38
+ from config import GPT300MConfig, gpt_300m, gpt_tiny
39
+ from model import GPT300M
40
+ from tokenizer import BPETokenizer
41
+ from dataset import TextDataset, ChatDataset, create_dataloaders, collate_fn
42
+
43
+
44
+ # ═══════════════════════════════════════════════════════════════════════
45
+ # LEARNING RATE SCHEDULER
46
+ # ═══════════════════════════════════════════════════════════════════════
47
+
48
+ def get_lr(step: int, config: GPT300MConfig) -> float:
49
+ """Cosine decay with linear warmup."""
50
+ # Linear warmup
51
+ if step < config.warmup_steps:
52
+ return config.learning_rate * (step + 1) / config.warmup_steps  # +1 avoids lr=0 at step 0
53
+
54
+ # Cosine decay
55
+ if step > config.max_steps:
56
+ return config.min_learning_rate
57
+
58
+ decay_ratio = (step - config.warmup_steps) / (config.max_steps - config.warmup_steps)
59
+ coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
60
+ return config.min_learning_rate + coeff * (config.learning_rate - config.min_learning_rate)
61
+
62
+
63
+ # ═══════════════════════════════════════════════════════════════════════
64
+ # TRAINING LOOP
65
+ # ═══════════════════════════════════════════════════════════════════════
66
+
67
+ class Trainer:
68
+ """
69
+ Full-featured training loop for GPT-300M.
70
+ """
71
+
72
+ def __init__(self, config: GPT300MConfig, resume_from: Optional[str] = None):
73
+ self.config = config
74
+ self.setup_distributed()
75
+ self.setup_device()
76
+ self.setup_model()
77
+ self.setup_optimizer()
78
+ self.global_step = 0
79
+ self.best_val_loss = float("inf")
80
+
81
+ if resume_from:
82
+ self.load_checkpoint(resume_from)
83
+
84
+ def setup_distributed(self):
85
+ """Setup DDP if running in distributed mode."""
86
+ self.ddp = int(os.environ.get("RANK", -1)) != -1
87
+ if self.ddp:
88
+ dist.init_process_group(backend="nccl")
89
+ self.ddp_rank = int(os.environ["RANK"])
90
+ self.ddp_local_rank = int(os.environ["LOCAL_RANK"])
91
+ self.ddp_world_size = int(os.environ["WORLD_SIZE"])
92
+ self.master_process = self.ddp_rank == 0
93
+ else:
94
+ self.ddp_rank = 0
95
+ self.ddp_local_rank = 0
96
+ self.ddp_world_size = 1
97
+ self.master_process = True
98
+
99
+ def setup_device(self):
100
+ """Configure device and mixed precision."""
101
+ cfg = self.config
102
+
103
+ if cfg.device == "auto":
104
+ if torch.cuda.is_available():
105
+ self.device = f"cuda:{self.ddp_local_rank}" if self.ddp else "cuda"
106
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
107
+ self.device = "mps"
108
+ else:
109
+ self.device = "cpu"
110
+ else:
111
+ self.device = cfg.device
112
+
113
+ # Mixed precision context
114
+ if "cuda" in self.device:
115
+ if cfg.dtype == "bfloat16" and torch.cuda.is_bf16_supported():
116
+ self.dtype = torch.bfloat16
117
+ self.amp_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
118
+ elif cfg.dtype == "float16":
119
+ self.dtype = torch.float16
120
+ self.amp_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.float16)
121
+ else:
122
+ self.dtype = torch.float32
123
+ self.amp_ctx = nullcontext()
124
+ self.scaler = torch.amp.GradScaler("cuda", enabled=(cfg.dtype == "float16"))
125
+ else:
126
+ self.dtype = torch.float32
127
+ self.amp_ctx = nullcontext()
128
+ self.scaler = torch.amp.GradScaler(enabled=False)
129
+
130
+ if self.master_process:
131
+ print(f"Device: {self.device}, dtype: {cfg.dtype}")
132
+
133
+ def setup_model(self):
134
+ """Initialize or load model."""
135
+ self.model = GPT300M(self.config).to(self.device)
136
+
137
+ if self.master_process:
138
+ print(self.model.model_summary())
139
+
140
+ # Compile model (PyTorch 2.0+)
141
+ if self.config.compile_model and hasattr(torch, "compile"):
142
+ if self.master_process:
143
+ print("Compiling model with torch.compile...")
144
+ self.model = torch.compile(self.model)
145
+
146
+ # Wrap in DDP
147
+ if self.ddp:
148
+ self.model = DDP(self.model, device_ids=[self.ddp_local_rank])
149
+
150
+ self.raw_model = self.model.module if self.ddp else self.model
151
+
152
+ def setup_optimizer(self):
153
+ """Configure AdamW optimizer with weight decay."""
154
+ cfg = self.config
155
+
156
+ # Separate parameters: decay vs no-decay
157
+ decay_params = []
158
+ nodecay_params = []
159
+ for name, param in self.raw_model.named_parameters():
160
+ if not param.requires_grad:
161
+ continue
162
+ if param.dim() >= 2:
163
+ decay_params.append(param)
164
+ else:
165
+ nodecay_params.append(param)
166
+
167
+ optim_groups = [
168
+ {"params": decay_params, "weight_decay": cfg.weight_decay},
169
+ {"params": nodecay_params, "weight_decay": 0.0},
170
+ ]
171
+
172
+ # Use fused AdamW if available (faster on CUDA)
173
+ use_fused = "cuda" in self.device and hasattr(torch.optim, "_multi_tensor")
174
+ self.optimizer = torch.optim.AdamW(
175
+ optim_groups,
176
+ lr=cfg.learning_rate,
177
+ betas=(cfg.beta1, cfg.beta2),
178
+ fused="cuda" in self.device,
179
+ )
180
+
181
+ if self.master_process:
182
+ n_decay = sum(p.numel() for p in decay_params)
183
+ n_nodecay = sum(p.numel() for p in nodecay_params)
184
+ print(f"Optimizer: {n_decay:,} decay params, {n_nodecay:,} no-decay params")
185
+
186
+ @torch.no_grad()
187
+ def evaluate(self, val_loader) -> float:
188
+ """Run evaluation and return average loss."""
189
+ self.model.eval()
190
+ total_loss = 0.0
191
+ n_batches = 0
192
+
193
+ for x, y in val_loader:
194
+ x, y = x.to(self.device), y.to(self.device)
195
+ with self.amp_ctx:
196
+ _, loss, _ = self.model(x, targets=y)
197
+ total_loss += loss.item()
198
+ n_batches += 1
199
+
200
+ if n_batches >= 50: # Limit eval batches
201
+ break
202
+
203
+ self.model.train()
204
+ return total_loss / max(n_batches, 1)
205
+
206
+ def save_checkpoint(self, path: Optional[str] = None):
207
+ """Save model checkpoint."""
208
+ if not self.master_process:
209
+ return
210
+
211
+ if path is None:
212
+ path = os.path.join(
213
+ self.config.output_dir,
214
+ f"checkpoint_step_{self.global_step}.pt",
215
+ )
216
+
217
+ os.makedirs(os.path.dirname(path), exist_ok=True)
218
+ checkpoint = {
219
+ "model_state_dict": self.raw_model.state_dict(),
220
+ "optimizer_state_dict": self.optimizer.state_dict(),
221
+ "config": self.config.__dict__,
222
+ "global_step": self.global_step,
223
+ "best_val_loss": self.best_val_loss,
224
+ }
225
+ torch.save(checkpoint, path)
226
+ print(f" Saved checkpoint: {path}")
227
+
228
+ def load_checkpoint(self, path: str):
229
+ """Load model checkpoint."""
230
+ checkpoint = torch.load(path, map_location=self.device)
231
+ self.raw_model.load_state_dict(checkpoint["model_state_dict"])
232
+ self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
233
+ self.global_step = checkpoint.get("global_step", 0)
234
+ self.best_val_loss = checkpoint.get("best_val_loss", float("inf"))
235
+ if self.master_process:
236
+ print(f"Resumed from step {self.global_step}")
237
+
238
+ def train(self, train_loader, val_loader):
239
+ """
240
+ Main training loop.
241
+ """
242
+ cfg = self.config
243
+ model = self.model
244
+ optimizer = self.optimizer
245
+
246
+ model.train()
247
+ train_iter = iter(train_loader)
248
+
249
+ if self.master_process:
250
+ print(f"\n{'='*60}")
251
+ print(f" Starting training")
252
+ print(f" Effective batch size: {cfg.batch_size * cfg.gradient_accumulation_steps * self.ddp_world_size}")
253
+ print(f" Max steps: {cfg.max_steps:,}")
254
+ print(f"{'='*60}\n")
255
+
256
+ t0 = time.time()
257
+
258
+ for step in range(self.global_step, cfg.max_steps):
259
+ self.global_step = step
260
+
261
+ # Update learning rate
262
+ lr = get_lr(step, cfg)
263
+ for param_group in optimizer.param_groups:
264
+ param_group["lr"] = lr
265
+
266
+ # ── Gradient Accumulation Loop ──────────────────────────
267
+ optimizer.zero_grad(set_to_none=True)
268
+ accumulated_loss = 0.0
269
+
270
+ for micro_step in range(cfg.gradient_accumulation_steps):
271
+ # Get next batch (cycle through data)
272
+ try:
273
+ x, y = next(train_iter)
274
+ except StopIteration:
275
+ train_iter = iter(train_loader)
276
+ x, y = next(train_iter)
277
+
278
+ x, y = x.to(self.device), y.to(self.device)
279
+
280
+ # DDP sync only on last micro-step
281
+ if self.ddp:
282
+ model.require_backward_grad_sync = (
283
+ micro_step == cfg.gradient_accumulation_steps - 1
284
+ )
285
+
286
+ # Forward pass with mixed precision
287
+ with self.amp_ctx:
288
+ _, loss, _ = model(x, targets=y)
289
+ loss = loss / cfg.gradient_accumulation_steps
290
+
291
+ accumulated_loss += loss.item()
292
+
293
+ # Backward pass
294
+ self.scaler.scale(loss).backward()
295
+
296
+ # Gradient clipping
297
+ if cfg.max_grad_norm > 0:
298
+ self.scaler.unscale_(optimizer)
299
+ grad_norm = nn.utils.clip_grad_norm_(
300
+ model.parameters(), cfg.max_grad_norm
301
+ )
302
+ else:
303
+ grad_norm = 0.0
304
+
305
+ # Optimizer step
306
+ self.scaler.step(optimizer)
307
+ self.scaler.update()
308
+
309
+ # ── Logging ─────────────────────────────────────────────
310
+ if step % cfg.log_interval == 0 and self.master_process:
311
+ dt = time.time() - t0
312
+ tokens_per_sec = (
313
+ cfg.batch_size * cfg.max_seq_len
314
+ * cfg.gradient_accumulation_steps
315
+ * self.ddp_world_size
316
+ * max(cfg.log_interval, 1)  # dt spans log_interval steps between logs
+ / dt
317
+ )
318
+ print(
319
+ f"step {step:>6d} | "
320
+ f"loss {accumulated_loss:.4f} | "
321
+ f"lr {lr:.2e} | "
322
+ f"grad_norm {grad_norm:.2f} | "
323
+ f"tok/s {tokens_per_sec:.0f} | "
324
+ f"dt {dt:.2f}s"
325
+ )
326
+ t0 = time.time()
327
+
328
+ # ── Evaluation ──────────────────────────────────────────
329
+ if step > 0 and step % cfg.eval_interval == 0 and self.master_process:
330
+ val_loss = self.evaluate(val_loader)
331
+ print(f" ✦ Validation loss: {val_loss:.4f}")
332
+
333
+ if val_loss < self.best_val_loss:
334
+ self.best_val_loss = val_loss
335
+ self.save_checkpoint(
336
+ os.path.join(cfg.output_dir, "best_model.pt")
337
+ )
338
+ print(f" ✦ New best! Saved best_model.pt")
339
+
340
+ # ── Checkpointing ───────────────────────────────────────
341
+ if step > 0 and step % cfg.save_interval == 0 and self.master_process:
342
+ self.save_checkpoint()
343
+
344
+ # Final save
345
+ if self.master_process:
346
+ self.save_checkpoint(
347
+ os.path.join(cfg.output_dir, "final_model.pt")
348
+ )
349
+ print("\n✦ Training complete!")
350
+
351
+ # Cleanup DDP
352
+ if self.ddp:
353
+ dist.destroy_process_group()
354
+
355
+
356
+ # ═══════════════════════════════════════════════════════════════════════
357
+ # MAIN
358
+ # ═══════════════════════════════════════════════════════════════════════
359
+
360
+ def main():
361
+ parser = argparse.ArgumentParser(description="Train GPT-300M")
362
+ parser.add_argument("--tiny", action="store_true", help="Use tiny config for debugging")
363
+ parser.add_argument("--data", type=str, default=None, help="Path to training text file")
364
+ parser.add_argument("--resume", type=str, default=None, help="Resume from checkpoint")
365
+ parser.add_argument("--d_model", type=int, default=None)
366
+ parser.add_argument("--n_layers", type=int, default=None)
367
+ parser.add_argument("--n_heads", type=int, default=None)
368
+ parser.add_argument("--batch_size", type=int, default=None)
369
+ parser.add_argument("--learning_rate", type=float, default=None)
370
+ parser.add_argument("--max_steps", type=int, default=None)
371
+ args = parser.parse_args()
372
+
373
+ # Config
374
+ config = gpt_tiny() if args.tiny else gpt_300m()
375
+
376
+ # Override config from CLI
377
+ for key in ["d_model", "n_layers", "n_heads", "batch_size", "learning_rate", "max_steps"]:
378
+ val = getattr(args, key, None)
379
+ if val is not None:
380
+ setattr(config, key, val)
381
+
382
+ # Seed
383
+ torch.manual_seed(config.seed)
384
+ if torch.cuda.is_available():
385
+ torch.cuda.manual_seed_all(config.seed)
386
+
387
+ # Tokenizer
388
+ tokenizer = BPETokenizer(vocab_size=config.vocab_size)
389
+
390
+ # Load data
391
+ if args.data and os.path.exists(args.data):
392
+ print(f"Loading data from {args.data}...")
393
+ with open(args.data, "r") as f:
394
+ text = f.read()
395
+ else:
396
+ # Generate synthetic data for demonstration
397
+ print("No data file provided. Generating synthetic training data...")
398
+ text = generate_synthetic_data()
399
+
400
+ # Train tokenizer on data
401
+ print("Training tokenizer...")
402
+ tokenizer.train(text, verbose=True)
403
+ tokenizer.save(os.path.join(config.output_dir, "tokenizer.json"))
404
+
405
+ # Create dataloaders
406
+ train_loader, val_loader = create_dataloaders(config, tokenizer, text=text)
407
+ print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")
408
+
409
+ # Train!
410
+ trainer = Trainer(config, resume_from=args.resume)
411
+ trainer.train(train_loader, val_loader)
412
+
413
+
414
+ def generate_synthetic_data(n_samples: int = 10_000) -> str:
415
+ """Generate synthetic conversational data for demonstration."""
416
+ import random
417
+ random.seed(42)
418
+
419
+ greetings = ["Hello!", "Hi there!", "Hey!", "Good morning!", "Greetings!"]
420
+ questions = [
421
+ "What is machine learning?",
422
+ "How does gravity work?",
423
+ "What is the meaning of life?",
424
+ "Can you explain photosynthesis?",
425
+ "What are neural networks?",
426
+ "How do computers work?",
427
+ "What is quantum physics?",
428
+ "Tell me about the solar system.",
429
+ "How does the internet work?",
430
+ "What is artificial intelligence?",
431
+ ]
432
+ answers = [
433
+ "That's a great question! Machine learning is a subset of AI that enables systems to learn from data.",
434
+ "Gravity is a fundamental force that attracts objects with mass toward each other.",
435
+ "The meaning of life is a deeply philosophical question that has been debated for centuries.",
436
+ "Photosynthesis is the process by which plants convert sunlight into chemical energy.",
437
+ "Neural networks are computing systems inspired by biological neural networks in the brain.",
438
+ "Computers work by processing binary data through electronic circuits called transistors.",
439
+ "Quantum physics describes the behavior of matter and energy at the atomic scale.",
440
+ "The solar system consists of the Sun and everything that orbits around it.",
441
+ "The internet is a global network of interconnected computers that communicate using protocols.",
442
+ "Artificial intelligence is the simulation of human intelligence by computer systems.",
443
+ ]
444
+
445
+ lines = []
446
+ for _ in range(n_samples):
447
+ g = random.choice(greetings)
448
+ idx = random.randrange(len(questions))
449
+ q, a = questions[idx], answers[idx]  # keep each answer aligned with its question
450
+ lines.append(f"User: {g} {q}\nAssistant: {a}\n")
451
+
452
+ return "\n".join(lines)
453
+
454
+
455
+ if __name__ == "__main__":
456
+ main()
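A quick way to eyeball the warmup-plus-cosine schedule before launching a run. This is a sketch, assuming the tiny config defines the same schedule fields (`warmup_steps`, `max_steps`, `learning_rate`, `min_learning_rate`) that `get_lr` reads:

```python
# Sketch: print the LR at a few milestones of the schedule.
from config import gpt_tiny
from train import get_lr

cfg = gpt_tiny()
for s in (0, cfg.warmup_steps // 2, cfg.warmup_steps, cfg.max_steps // 2, cfg.max_steps):
    print(f"step {s:>7d} -> lr {get_lr(s, cfg):.3e}")
# Expected shape: linear ramp to learning_rate at warmup_steps,
# then cosine decay to min_learning_rate at max_steps.
```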
visual_nn_3d.py ADDED
@@ -0,0 +1,387 @@
1
+ """
2
+ GPT-300M 3D Neural Network Visualization
3
+ ==========================================
4
+ A 3D node-and-connection neural network diagram with depth,
5
+ perspective, and accurate parameter counts.
6
+ """
7
+
8
+ import matplotlib
9
+ matplotlib.use("Agg")
10
+
11
+ import matplotlib.pyplot as plt
12
+ from mpl_toolkits.mplot3d import Axes3D
13
+ from mpl_toolkits.mplot3d.art3d import Line3DCollection
14
+ import numpy as np
15
+
16
+ # ═══════════════════════════════════════════════════════════════════════
17
+ # ACCURATE GPT-300M PARAMETERS
18
+ # ═══════════════════════════════════════════════════════════════════════
19
+
20
+ VOCAB = 32_000
21
+ D = 1_024
22
+ HEADS = 16
23
+ HEAD_D = 64
24
+ D_FF = 4_096
25
+ N_LAYERS = 24
26
+
27
+ embed_p = VOCAB * D # 32,768,000
28
+ attn_p = 4 * D * D # 4,194,304 per layer
29
+ ffn_p = 2 * D * D_FF # 8,388,608 per layer
30
+ norm_p = 2 * D # 2,048 per layer
31
+ layer_p = attn_p + ffn_p + norm_p # 12,584,960 per layer
32
+ all_layers_p = layer_p * N_LAYERS # 302,039,040
33
+ final_norm_p = D # 1,024
34
+ TOTAL = embed_p + all_layers_p + final_norm_p # 334,808,064
35
+
36
+ # Layer definitions: (name, num_display_nodes, actual_neurons, params, color_hex)
37
+ LAYERS = [
38
+ ("Input Tokens", 10, VOCAB, 0, "#4CAF50"),
39
+ ("Token Embedding", 12, D, embed_p, "#2196F3"),
40
+ ("RoPE Positions", 12, D, 0, "#00BCD4"),
41
+ ("Layer 1: Attention QKV", 14, D, attn_p * 3 // 4, "#FF9800"),
42
+ ("Layer 1: Attention Out", 12, D, attn_p * 1 // 4, "#FF9800"),
43
+ ("Layer 1: FFN Up (GELU)", 16, D_FF, ffn_p // 2, "#8BC34A"),
44
+ ("Layer 1: FFN Down", 12, D, ffn_p // 2, "#8BC34A"),
45
+ ("Layers 2–23 (Γ—22)", 14, D, layer_p * 22, "#9C27B0"),
46
+ ("Layer 24: Attention", 14, D, attn_p, "#FF5722"),
47
+ ("Layer 24: FFN", 16, D_FF, ffn_p, "#009688"),
48
+ ("Layer 24: Norm + Out", 12, D, norm_p + final_norm_p, "#E91E63"),
49
+ ("LM Head (weight-tied)", 12, VOCAB, 0, "#F44336"),
50
+ ("Output Probabilities", 1, VOCAB, 0, "#FF1744"),
51
+ ]
52
+
53
+
54
+ def hex_to_rgb(h):
55
+ h = h.lstrip("#")
56
+ return tuple(int(h[i:i+2], 16) / 255.0 for i in (0, 2, 4))
57
+
58
+
59
+ def generate_3d_network(save_path="neural_network_3d.png", elev=22, azim=-65):
60
+ """Generate a 3D neural network with nodes, connections, and parameter labels."""
61
+
62
+ fig = plt.figure(figsize=(28, 28), facecolor="#0a0e17")
63
+ ax = fig.add_subplot(111, projection="3d", computed_zorder=False)
64
+
65
+ # Dark theme for 3D axes
66
+ ax.set_facecolor("#0a0e17")
67
+ ax.xaxis.pane.fill = False
68
+ ax.yaxis.pane.fill = False
69
+ ax.zaxis.pane.fill = False
70
+ ax.xaxis.pane.set_edgecolor("#0a0e17")
71
+ ax.yaxis.pane.set_edgecolor("#0a0e17")
72
+ ax.zaxis.pane.set_edgecolor("#0a0e17")
73
+ ax.grid(False)
74
+ ax.set_axis_off()
75
+
76
+ ax.view_init(elev=elev, azim=azim)
77
+
78
+ n_layers = len(LAYERS)
79
+ y_positions = np.linspace(0, n_layers * 4.0, n_layers) # depth (layer position)
80
+
81
+ all_positions = [] # list of (xs, ys_unused, zs, y_layer)
82
+ running_params = 0
83
+
84
+ for i, (name, n_nodes, actual, params, color_hex) in enumerate(LAYERS):
85
+ y = y_positions[i]
86
+ running_params += params
87
+
88
+ rgb = hex_to_rgb(color_hex)
89
+
90
+ # Arrange nodes in a circle/arc for 3D effect
91
+ if n_nodes == 1:
92
+ xs = np.array([0.0])
93
+ zs = np.array([0.0])
94
+ else:
95
+ # Spread nodes along x
96
+ spread = min(n_nodes * 0.5, 7.0)
97
+ xs = np.linspace(-spread, spread, n_nodes)
98
+ # Slight arc for 3D depth perception
99
+ zs = -0.1 * (xs ** 2)
100
+
101
+ ys = np.full_like(xs, y)
102
+ all_positions.append((xs, ys, zs))
103
+
104
+ # ── Draw connections to previous layer ──────────────────
105
+ if i > 0:
106
+ prev_xs, prev_ys, prev_zs = all_positions[i - 1]
107
+
108
+ # Sample connections to avoid clutter
109
+ n_prev = len(prev_xs)
110
+ n_curr = len(xs)
111
+ step_p = max(1, n_prev // 8)
112
+ step_c = max(1, n_curr // 8)
113
+
114
+ lines = []
115
+ colors_lines = []
116
+ for pi in range(0, n_prev, step_p):
117
+ for ci in range(0, n_curr, step_c):
118
+ lines.append([
119
+ (prev_xs[pi], prev_ys[pi], prev_zs[pi]),
120
+ (xs[ci], ys[ci], zs[ci]),
121
+ ])
122
+ colors_lines.append((*rgb, 0.18))
123
+
124
+ if lines:
125
+ lc = Line3DCollection(lines, colors=colors_lines, linewidths=0.7)
126
+ ax.add_collection3d(lc)
127
+
128
+ # ── Draw nodes (spheres) ────────────────────────────────
129
+ node_size = 200 if n_nodes > 12 else 280
130
+ if n_nodes == 1:
131
+ node_size = 600
132
+
133
+ ax.scatter(
134
+ xs, ys, zs,
135
+ c=[color_hex], s=node_size,
136
+ alpha=0.95, edgecolors="white", linewidths=0.5,
137
+ depthshade=True, zorder=5,
138
+ )
139
+
140
+ # ── Glow effect (larger transparent scatter behind) ─────
141
+ ax.scatter(
142
+ xs, ys, zs,
143
+ c=[color_hex], s=node_size * 3,
144
+ alpha=0.08, edgecolors="none",
145
+ depthshade=True, zorder=4,
146
+ )
147
+
148
+ # ── Labels ──────────────────────────────────────────────
149
+ label_x = xs[-1] + 1.8 if n_nodes > 1 else 2.0
150
+ ax.text(
151
+ label_x, y, 0,
152
+ name,
153
+ fontsize=9.5, fontweight="bold",
154
+ color="#E6EDF3", fontfamily="monospace",
155
+ zorder=10,
156
+ )
157
+
158
+ # Param count
159
+ if params > 0:
160
+ if params >= 1_000_000:
161
+ ptxt = f"{params/1e6:.1f}M params"
162
+ else:
163
+ ptxt = f"{params:,} params"
164
+ ax.text(
165
+ label_x, y, -1.0,
166
+ ptxt,
167
+ fontsize=8, color=color_hex,
168
+ fontfamily="monospace", fontweight="bold",
169
+ zorder=10,
170
+ )
171
+
172
+ # Running total
173
+ if running_params > 0:
174
+ ax.text(
175
+ label_x, y, -1.8,
176
+ f"Ξ£ {running_params/1e6:.1f}M",
177
+ fontsize=6, color="#8B949E",
178
+ fontfamily="monospace",
179
+ zorder=10,
180
+ )
181
+
182
+ # Overflow indicator
183
+ if actual > n_nodes and n_nodes > 1:
184
+ ax.text(
185
+ xs[-1] + 0.5, y, zs[-1],
186
+ f"(+{actual - n_nodes:,})",
187
+ fontsize=6, color="#8B949E",
188
+ fontfamily="monospace",
189
+ zorder=10,
190
+ )
191
+
192
+ # ── Title ──────────────────────────────────────────────────────
193
+ ax.text2D(
194
+ 0.5, 0.96,
195
+ "GPT-300M β€’ 3D Neural Network Architecture",
196
+ transform=fig.transFigure,
197
+ fontsize=22, fontweight="bold", color="#E6EDF3",
198
+ ha="center", fontfamily="monospace",
199
+ )
200
+ ax.text2D(
201
+ 0.5, 0.94,
202
+ f"{TOTAL:,} parameters | {N_LAYERS} layers | {HEADS} heads | d_model={D} | d_ff={D_FF}",
203
+ transform=fig.transFigure,
204
+ fontsize=10, color="#8B949E",
205
+ ha="center", fontfamily="monospace",
206
+ )
207
+
208
+ # ── Parameter summary ──────────────────────────────────────────
209
+ summary = (
210
+ f"Parameter Breakdown:\n"
211
+ f" Embedding: {embed_p/1e6:>7.1f}M ({embed_p/TOTAL*100:.1f}%)\n"
212
+ f" Attention Γ—24: {attn_p*N_LAYERS/1e6:>7.1f}M ({attn_p*N_LAYERS/TOTAL*100:.1f}%)\n"
213
+ f" FFN Γ—24: {ffn_p*N_LAYERS/1e6:>7.1f}M ({ffn_p*N_LAYERS/TOTAL*100:.1f}%)\n"
214
+ f" Norms: {(norm_p*N_LAYERS+final_norm_p)/1e6:>7.3f}M ({(norm_p*N_LAYERS+final_norm_p)/TOTAL*100:.1f}%)\n"
215
+ f" LM Head: tied (0 extra)\n"
216
+ f" ───────────────────────\n"
217
+ f" TOTAL: {TOTAL/1e6:>7.1f}M"
218
+ )
219
+ ax.text2D(
220
+ 0.02, 0.06, summary,
221
+ transform=fig.transFigure,
222
+ fontsize=8, color="#58A6FF",
223
+ fontfamily="monospace", verticalalignment="bottom",
224
+ bbox=dict(boxstyle="round,pad=0.6", facecolor="#161B22",
225
+ edgecolor="#30363D", linewidth=1),
226
+ )
227
+
228
+ # ── Legend ──────────────────────────────────────────────────────
229
+ legend_items = [
230
+ ("#4CAF50", "Input"), ("#2196F3", "Embeddings"), ("#FF9800", "Attention"),
231
+ ("#8BC34A", "FFN"), ("#9C27B0", "Γ—22 Layers"), ("#E91E63", "Norm"),
232
+ ("#F44336", "Output"),
233
+ ]
234
+ for j, (c, l) in enumerate(legend_items):
235
+ ax.text2D(
236
+ 0.92, 0.30 - j * 0.025, f"● {l}",
237
+ transform=fig.transFigure,
238
+ fontsize=8, color=c, fontfamily="monospace",
239
+ )
240
+
241
+ # Set axis limits
242
+ all_x = np.concatenate([p[0] for p in all_positions])
243
+ all_y = np.concatenate([p[1] for p in all_positions])
244
+ all_z = np.concatenate([p[2] for p in all_positions])
245
+ margin = 4
246
+ ax.set_xlim(all_x.min() - margin, all_x.max() + margin + 8)
247
+ ax.set_ylim(all_y.min() - margin, all_y.max() + margin)
248
+ ax.set_zlim(all_z.min() - margin, all_z.max() + margin)
249
+
250
+ plt.savefig(save_path, dpi=200, bbox_inches="tight",
251
+ facecolor="#0a0e17", edgecolor="none")
252
+ print(f"Saved: {save_path}")
253
+ plt.close()
254
+
255
+
256
+ def generate_3d_single_layer(save_path="layer_3d.png", elev=18, azim=-55):
257
+ """3D view of a single transformer layer internals."""
258
+
259
+ fig = plt.figure(figsize=(22, 18), facecolor="#0a0e17")
260
+ ax = fig.add_subplot(111, projection="3d", computed_zorder=False)
261
+
262
+ ax.set_facecolor("#0a0e17")
263
+ ax.xaxis.pane.fill = False
264
+ ax.yaxis.pane.fill = False
265
+ ax.zaxis.pane.fill = False
266
+ ax.xaxis.pane.set_edgecolor("#0a0e17")
267
+ ax.yaxis.pane.set_edgecolor("#0a0e17")
268
+ ax.zaxis.pane.set_edgecolor("#0a0e17")
269
+ ax.grid(False)
270
+ ax.set_axis_off()
271
+ ax.view_init(elev=elev, azim=azim)
272
+
273
+ sub_layers = [
274
+ ("Input (d=1024)", 10, D, 0, "#2196F3"),
275
+ ("Query (d=1024)", 10, D, D*D, "#FF6B6B"),
276
+ ("Key (d=1024)", 10, D, D*D, "#4ECDC4"),
277
+ ("Value (d=1024)", 10, D, D*D, "#45B7D1"),
278
+ ("16 Attention Heads", 16, D, 0, "#FF9800"),
279
+ ("Attn Output (d=1024)", 10, D, D*D, "#FFA726"),
280
+ ("βŠ• Residual + RMSNorm", 10, D, D, "#E91E63"),
281
+ ("FFN Up β†’ GELU (d=4096)", 16, D_FF, D*D_FF, "#8BC34A"),
282
+ ("FFN Down (d=1024)", 10, D, D_FF*D, "#7CB342"),
283
+ ("βŠ• Residual + RMSNorm", 10, D, D, "#E91E63"),
284
+ ("Layer Output (d=1024)", 10, D, 0, "#2196F3"),
285
+ ]
286
+
287
+ n = len(sub_layers)
288
+ y_positions = np.linspace(0, n * 3, n)
289
+ all_pos = []
290
+
291
+ for i, (name, n_nodes, actual, params, chex) in enumerate(sub_layers):
292
+ y = y_positions[i]
293
+ rgb = hex_to_rgb(chex)
294
+
295
+ spread = min(n_nodes * 0.45, 5.5)
296
+ xs = np.linspace(-spread, spread, n_nodes)
297
+ zs = -0.12 * (xs ** 2)
298
+ ys = np.full_like(xs, y)
299
+ all_pos.append((xs, ys, zs))
300
+
301
+ # Connections
302
+ if i > 0:
303
+ pxs, pys, pzs = all_pos[i - 1]
304
+ sp = max(1, len(pxs) // 8)
305
+ sc = max(1, len(xs) // 8)
306
+ lines = []
307
+ cols = []
308
+ for pi in range(0, len(pxs), sp):
309
+ for ci in range(0, len(xs), sc):
310
+ lines.append([(pxs[pi], pys[pi], pzs[pi]), (xs[ci], ys[ci], zs[ci])])
311
+ cols.append((*rgb, 0.15))
312
+ if lines:
313
+ ax.add_collection3d(Line3DCollection(lines, colors=cols, linewidths=0.6))
314
+
315
+ # Nodes
316
+ sz = 130 if n_nodes > 12 else 180
317
+ ax.scatter(xs, ys, zs, c=[chex], s=sz, alpha=0.95,
318
+ edgecolors="white", linewidths=0.5, depthshade=True, zorder=5)
319
+ ax.scatter(xs, ys, zs, c=[chex], s=sz * 3, alpha=0.07,
320
+ edgecolors="none", depthshade=True, zorder=4)
321
+
322
+ # Labels
323
+ lx = xs[-1] + 1.0
324
+ ax.text(lx, y, 0, name, fontsize=9, fontweight="bold",
325
+ color="#E6EDF3", fontfamily="monospace", zorder=10)
326
+ if params > 0:
327
+ ax.text(lx, y, -0.8, f"{params:,} params",
328
+ fontsize=7, color=chex, fontfamily="monospace",
329
+ fontweight="bold", zorder=10)
330
+
331
+ if actual > n_nodes:
332
+ ax.text(xs[-1] + 0.4, y, zs[-1], f"(+{actual-n_nodes:,})",
333
+ fontsize=6, color="#8B949E", fontfamily="monospace", zorder=10)
334
+
335
+ ax.text2D(0.5, 0.96, "Single Transformer Layer — 3D View",
336
+ transform=fig.transFigure, fontsize=20, fontweight="bold",
337
+ color="#E6EDF3", ha="center", fontfamily="monospace")
338
+ ax.text2D(0.5, 0.935,
339
+ f"12,584,960 params/layer Γ— 24 layers = 302,039,040 total",
340
+ transform=fig.transFigure, fontsize=10, color="#8B949E",
341
+ ha="center", fontfamily="monospace")
342
+
343
+ all_x = np.concatenate([p[0] for p in all_pos])
344
+ all_y = np.concatenate([p[1] for p in all_pos])
345
+ all_z = np.concatenate([p[2] for p in all_pos])
346
+ ax.set_xlim(all_x.min() - 2, all_x.max() + 8)
347
+ ax.set_ylim(all_y.min() - 2, all_y.max() + 2)
348
+ ax.set_zlim(all_z.min() - 2, all_z.max() + 2)
349
+
350
+ plt.savefig(save_path, dpi=200, bbox_inches="tight",
351
+ facecolor="#0a0e17", edgecolor="none")
352
+ print(f"Saved: {save_path}")
353
+ plt.close()
354
+
355
+
356
+ def generate_3d_rotating_views(base_path="viz"):
357
+ """Generate multiple angle views."""
358
+ import os
359
+ os.makedirs(base_path, exist_ok=True)
360
+
361
+ # Main dramatic angle — more front-facing
362
+ generate_3d_network(f"{base_path}/nn_3d_main.png", elev=12, azim=-15)
363
+
364
+ # Angled view
365
+ generate_3d_network(f"{base_path}/nn_3d_top.png", elev=35, azim=-25)
366
+
367
+ # Side angle
368
+ generate_3d_network(f"{base_path}/nn_3d_side.png", elev=8, azim=-45)
369
+
370
+ # Single layer detail
371
+ generate_3d_single_layer(f"{base_path}/nn_3d_layer.png", elev=18, azim=-55)
372
+
373
+
374
+ if __name__ == "__main__":
375
+ import os
376
+ os.makedirs("viz", exist_ok=True)
377
+
378
+ print("=" * 55)
379
+ print(" GPT-300M β€’ 3D Visualization Generator")
380
+ print("=" * 55)
381
+ print(f" Total parameters: {TOTAL:,}")
382
+ print(f" Per layer: {layer_p:,}")
383
+ print(f" Layers: {N_LAYERS}")
384
+ print("=" * 55)
385
+
386
+ generate_3d_rotating_views("viz")
387
+ print("\nAll 3D views generated!")
visual_nn_nodes.py ADDED
@@ -0,0 +1,395 @@
1
+ """
2
+ GPT-300M Visual Neural Network — Node & Connection Style
3
+ ==========================================================
4
+ Generates a classic neural network diagram (like the user's reference)
5
+ with nodes and connection lines, accurately showing the GPT-300M architecture
6
+ with correct parameter calculations at each layer.
7
+ """
8
+
9
+ import matplotlib
10
+ matplotlib.use("Agg")
11
+
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.patches as mpatches
14
+ import numpy as np
15
+
16
+ # ═══════════════════════════════════════════════════════════════════════
17
+ # GPT-300M ARCHITECTURE β€” ACCURATE PARAMETER COUNTS
18
+ # ═══════════════════════════════════════════════════════════════════════
19
+
20
+ # All layer definitions with EXACT parameter counts
21
+ # Format: (layer_name, display_nodes, actual_neurons, params_in_layer, color)
22
+
23
+ VOCAB_SIZE = 32_000
24
+ D_MODEL = 1_024
25
+ N_HEADS = 16
26
+ HEAD_DIM = 64
27
+ D_FF = 4_096
28
+ N_LAYERS = 24
29
+
30
+ # Parameter calculations per component:
31
+ embed_params = VOCAB_SIZE * D_MODEL # 32,768,000
32
+ # RoPE has no learned parameters (precomputed sin/cos)
33
+ rope_params = 0
34
+
35
+ # Per transformer layer:
36
+ qkv_params = 3 * D_MODEL * D_MODEL # 3,145,728 (Q, K, V projections)
37
+ out_proj_params = D_MODEL * D_MODEL # 1,048,576 (output projection)
38
+ attn_total = qkv_params + out_proj_params # 4,194,304
39
+
40
+ ffn_up_params = D_MODEL * D_FF # 4,194,304 (up projection)
41
+ ffn_down_params = D_FF * D_MODEL # 4,194,304 (down projection)
42
+ ffn_total = ffn_up_params + ffn_down_params # 8,388,608
43
+
44
+ rmsnorm_params = D_MODEL * 2 # 2,048 (2 norms per layer)
45
+ layer_total = attn_total + ffn_total + rmsnorm_params # 12,584,960
46
+
47
+ all_layers_total = layer_total * N_LAYERS # 302,039,040
48
+
49
+ final_norm_params = D_MODEL # 1,024
50
+ # LM Head is weight-tied with embedding, so 0 extra params
51
+ lm_head_params = 0 # (tied)
52
+
53
+ TOTAL_PARAMS = embed_params + all_layers_total + final_norm_params + lm_head_params
54
+ # = 32,768,000 + 302,039,040 + 1,024 = 334,808,064
55
+ # With weight tying, unique params β‰ˆ 334,808,064
56
+
57
+ # ═══════════════════════════════════════════════════════════════════════
58
+ # LAYER DEFINITIONS FOR VISUALIZATION
59
+ # ═══════════════════════════════════════════════════════════════════════
60
+
61
+ # (name, nodes_to_display, actual_size, params_to_this_layer, color)
62
+ LAYERS = [
63
+ ("Input Tokens", 10, VOCAB_SIZE, 0, "#4CAF50"), # Green
64
+ ("Token Embedding", 10, D_MODEL, embed_params, "#2196F3"), # Blue
65
+ ("RoPE Positions", 10, D_MODEL, 0, "#00BCD4"), # Cyan
66
+
67
+ # Show 3 representative transformer layers (of 24)
68
+ ("Layer 1: Attention Q,K,V", 12, D_MODEL, qkv_params, "#FF9800"), # Orange
69
+ ("Layer 1: Attention Out", 10, D_MODEL, out_proj_params, "#FF9800"),
70
+ ("Layer 1: FFN Up", 14, D_FF, ffn_up_params, "#8BC34A"), # Light green
71
+ ("Layer 1: FFN Down", 10, D_MODEL, ffn_down_params, "#8BC34A"),
72
+
73
+ ("Layer 2–23: Γ—22 Blocks", 12, D_MODEL, layer_total * 22, "#9C27B0"), # Purple
74
+
75
+ ("Layer 24: Attention", 12, D_MODEL, attn_total, "#FF5722"), # Deep orange
76
+ ("Layer 24: FFN", 14, D_FF, ffn_total, "#009688"), # Teal
77
+ ("Layer 24: Output", 10, D_MODEL, rmsnorm_params, "#009688"),
78
+
79
+ ("Final RMSNorm", 10, D_MODEL, final_norm_params, "#E91E63"), # Pink
80
+ ("LM Head (tied)", 10, VOCAB_SIZE, lm_head_params, "#F44336"), # Red
81
+ ("Output Probabilities", 1, VOCAB_SIZE, 0, "#F44336"), # Red
82
+ ]
83
+
84
+
85
+ def draw_neural_network(save_path="neural_network.png"):
86
+ fig, ax = plt.subplots(figsize=(22, 30), facecolor="#0D1117")
87
+ ax.set_facecolor("#0D1117")
88
+
89
+ n_layers = len(LAYERS)
90
+ y_positions = np.linspace(0.92, 0.04, n_layers)
91
+
92
+ # Spacing
93
+ x_center = 0.5
94
+ max_spread = 0.38
95
+
96
+ all_node_positions = [] # Store (x_list, y) for connections
97
+
98
+ running_params = 0
99
+
100
+ for i, (name, n_display, actual_size, params, color) in enumerate(LAYERS):
101
+ y = y_positions[i]
102
+ running_params += params
103
+
104
+ # Calculate x positions for nodes
105
+ if n_display == 1:
106
+ xs = [x_center]
107
+ else:
108
+ xs = np.linspace(x_center - max_spread, x_center + max_spread, n_display)
109
+
110
+ all_node_positions.append((xs, y))
111
+
112
+ # Draw connections to previous layer
113
+ if i > 0:
114
+ prev_xs, prev_y = all_node_positions[i - 1]
115
+
116
+ # Limit connections for readability
117
+ max_connections = 200
118
+ step_curr = max(1, len(xs) // 12)
119
+ step_prev = max(1, len(prev_xs) // 12)
120
+
121
+ conn_count = 0
122
+ for px in prev_xs[::step_prev]:
123
+ for cx in xs[::step_curr]:
124
+ if conn_count > max_connections:
125
+ break
126
+ ax.plot(
127
+ [px, cx], [prev_y, y],
128
+ color=color, alpha=0.22, linewidth=0.6,
129
+ transform=ax.transAxes, zorder=1,
130
+ )
131
+ conn_count += 1
132
+
133
+ # Draw nodes
134
+ node_radius = 0.01 if n_display <= 12 else 0.008
135
+ if n_display == 1:
136
+ node_radius = 0.016
137
+
138
+ for x in xs:
139
+ circle = plt.Circle(
140
+ (x, y), node_radius,
141
+ facecolor=color, edgecolor="white",
142
+ linewidth=0.6, alpha=0.95,
143
+ transform=ax.transAxes, zorder=3,
144
+ )
145
+ ax.add_patch(circle)
146
+
147
+ # Draw "+N" indicator if actual size > displayed
148
+ if actual_size > n_display and n_display > 1:
149
+ extra = actual_size - n_display
150
+ if extra > 0:
151
+ ax.text(
152
+ xs[-1] + 0.03, y,
153
+ f"(+{extra:,})",
154
+ transform=ax.transAxes,
155
+ fontsize=7, color="#8B949E",
156
+ ha="left", va="center",
157
+ fontfamily="monospace",
158
+ )
159
+
160
+ # Layer label (left side)
161
+ ax.text(
162
+ 0.02, y,
163
+ name,
164
+ transform=ax.transAxes,
165
+ fontsize=9, fontweight="bold",
166
+ color="#E6EDF3",
167
+ ha="left", va="center",
168
+ fontfamily="monospace",
169
+ )
170
+
171
+ # Parameter count (right side)
172
+ if params > 0:
173
+ param_text = f"{params:,} params"
174
+ ax.text(
175
+ 0.98, y,
176
+ param_text,
177
+ transform=ax.transAxes,
178
+ fontsize=8,
179
+ color=color,
180
+ ha="right", va="center",
181
+ fontfamily="monospace",
182
+ fontweight="bold",
183
+ )
184
+
185
+ # Running total (far right, smaller)
186
+ if running_params > 0:
187
+ ax.text(
188
+ 0.98, y - 0.012,
189
+ f"Ξ£ {running_params / 1e6:.1f}M",
190
+ transform=ax.transAxes,
191
+ fontsize=6.5,
192
+ color="#8B949E",
193
+ ha="right", va="center",
194
+ fontfamily="monospace",
195
+ )
196
+
197
+ # ── Title ──────────────────────────────────────────────────────
198
+ ax.text(
199
+ 0.5, 0.97,
200
+ "GPT-300M Neural Network",
201
+ transform=ax.transAxes,
202
+ fontsize=24, fontweight="bold",
203
+ color="#E6EDF3", ha="center", va="center",
204
+ fontfamily="monospace",
205
+ )
206
+ ax.text(
207
+ 0.5, 0.955,
208
+ f"Total: {TOTAL_PARAMS:,} parameters β€’ {N_LAYERS} transformer layers β€’ "
209
+ f"{N_HEADS} attention heads β€’ d_model={D_MODEL}",
210
+ transform=ax.transAxes,
211
+ fontsize=9, color="#8B949E", ha="center", va="center",
212
+ fontfamily="monospace",
213
+ )
214
+
215
+ # ── Parameter Summary Box ──────────────────────────────────────
216
+ summary_y = 0.005
217
+ summary_text = (
218
+ f"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ Parameter Summary ───────────────┐\n"
219
+ f"β”‚ Token Embedding: {embed_params:>13,} ({embed_params/TOTAL_PARAMS*100:4.1f}%) β”‚\n"
220
+ f"β”‚ Attention (Γ—{N_LAYERS}): {attn_total*N_LAYERS:>13,} ({attn_total*N_LAYERS/TOTAL_PARAMS*100:4.1f}%) β”‚\n"
221
+ f"β”‚ Feed-Forward (Γ—{N_LAYERS}): {ffn_total*N_LAYERS:>13,} ({ffn_total*N_LAYERS/TOTAL_PARAMS*100:4.1f}%) β”‚\n"
222
+ f"β”‚ RMSNorm (Γ—{N_LAYERS}+1): {rmsnorm_params*N_LAYERS+final_norm_params:>13,} ({(rmsnorm_params*N_LAYERS+final_norm_params)/TOTAL_PARAMS*100:4.1f}%) β”‚\n"
223
+ f"β”‚ LM Head (tied): {'0 (shared)':>13} β”‚\n"
224
+ f"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n"
225
+ f"β”‚ TOTAL: {TOTAL_PARAMS:>13,} (100%) β”‚\n"
226
+ f"β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜"
227
+ )
228
+ ax.text(
229
+ 0.5, summary_y,
230
+ summary_text,
231
+ transform=ax.transAxes,
232
+ fontsize=8, color="#58A6FF",
233
+ ha="center", va="bottom",
234
+ fontfamily="monospace",
235
+ bbox=dict(boxstyle="round,pad=0.8", facecolor="#161B22",
236
+ edgecolor="#30363D", linewidth=1),
237
+ )
238
+
239
+ # ── Legend ──────────────────────────────────────────────────────
240
+ legend_items = [
241
+ ("#4CAF50", "Input / Tokenization"),
242
+ ("#2196F3", "Embeddings"),
243
+ ("#FF9800", "Self-Attention"),
244
+ ("#8BC34A", "Feed-Forward (GELU)"),
245
+ ("#9C27B0", "Collapsed Layers (Γ—22)"),
246
+ ("#E91E63", "Normalization"),
247
+ ("#F44336", "Output / LM Head"),
248
+ ]
249
+ for j, (c, label) in enumerate(legend_items):
250
+ lx = 0.02
251
+ ly = 0.035 - j * 0.015
252
+ circle = plt.Circle(
253
+ (lx, ly), 0.004,
254
+ facecolor=c, edgecolor="white", linewidth=0.3,
255
+ transform=ax.transAxes, zorder=5,
256
+ )
257
+ ax.add_patch(circle)
258
+ ax.text(
259
+ lx + 0.012, ly, label,
260
+ transform=ax.transAxes,
261
+ fontsize=7, color="#C9D1D9", va="center",
262
+ fontfamily="monospace",
263
+ )
264
+
265
+ ax.set_xlim(0, 1)
266
+ ax.set_ylim(0, 1)
267
+ ax.axis("off")
268
+
269
+ plt.savefig(save_path, dpi=200, bbox_inches="tight",
270
+ facecolor="#0D1117", edgecolor="none")
271
+ print(f"Saved: {save_path}")
272
+ plt.close()
273
+
274
+
275
+ # ═══════════════════════════════════════════════════════════════════════
276
+ # ALSO: A cleaner "zoomed in" single-layer view
277
+ # ═══════════════════════════════════════════════════════════════════════
278
+
279
+ def draw_single_layer_detail(save_path="layer_detail.png"):
280
+ """Draw a detailed view of one transformer layer with node connections."""
281
+ fig, ax = plt.subplots(figsize=(20, 14), facecolor="#0D1117")
282
+ ax.set_facecolor("#0D1117")
283
+
284
+ # One transformer layer breakdown:
285
+ # Input (1024) → Q,K,V (3×1024) → Attention Heads (16×64) → Output Proj (1024)
287
+ # → RMSNorm (1024) → FFN Up (4096) → GELU → FFN Down (1024) → Output (1024)
287
+
288
+ sub_layers = [
289
+ ("Input\n(d=1,024)", 8, D_MODEL, 0, "#2196F3"),
290
+ ("Query\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#FF6B6B"),
291
+ ("Key\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#4ECDC4"),
292
+ ("Value\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#45B7D1"),
293
+ ("Attention Heads\n(16Γ—64)", 16, D_MODEL, 0, "#FF9800"),
294
+ ("Attn Output\n(d=1,024)", 8, D_MODEL, D_MODEL**2, "#FF9800"),
295
+ ("βŠ• Residual + Norm", 8, D_MODEL, D_MODEL, "#E91E63"),
296
+ ("FFN Up (GELU)\n(d=4,096)", 14, D_FF, D_MODEL*D_FF, "#8BC34A"),
297
+ ("FFN Down\n(d=1,024)", 8, D_MODEL, D_FF*D_MODEL, "#8BC34A"),
298
+ ("βŠ• Residual + Norm", 8, D_MODEL, D_MODEL, "#E91E63"),
299
+ ("Layer Output\n(d=1,024)", 8, D_MODEL, 0, "#2196F3"),
300
+ ]
301
+
302
+ n = len(sub_layers)
303
+ y_positions = np.linspace(0.9, 0.08, n)
304
+ x_center = 0.5
305
+ max_spread = 0.32
306
+
307
+ all_pos = []
308
+
309
+ for i, (name, n_nodes, actual, params, color) in enumerate(sub_layers):
310
+ y = y_positions[i]
311
+ xs = np.linspace(x_center - max_spread, x_center + max_spread, n_nodes)
312
+ all_pos.append((xs, y))
313
+
314
+ # Connections
315
+ if i > 0:
316
+ prev_xs, prev_y = all_pos[i-1]
317
+ step_c = max(1, len(xs) // 10)
318
+ step_p = max(1, len(prev_xs) // 10)
319
+ for px in prev_xs[::step_p]:
320
+ for cx in xs[::step_c]:
321
+ ax.plot([px, cx], [prev_y, y],
322
+ color=color, alpha=0.2, linewidth=0.7,
323
+ transform=ax.transAxes, zorder=1)
324
+
325
+ # Nodes
326
+ r = 0.011 if n_nodes <= 10 else 0.009
327
+ for x in xs:
328
+ c = plt.Circle((x, y), r, facecolor=color, edgecolor="white",
329
+ linewidth=0.6, alpha=0.95,
330
+ transform=ax.transAxes, zorder=3)
331
+ ax.add_patch(c)
332
+
333
+ # Overflow indicator
334
+ if actual > n_nodes:
335
+ ax.text(xs[-1] + 0.025, y, f"(+{actual - n_nodes:,})",
336
+ transform=ax.transAxes, fontsize=7, color="#8B949E",
337
+ ha="left", va="center", fontfamily="monospace")
338
+
339
+ # Label
340
+ ax.text(0.03, y, name, transform=ax.transAxes,
341
+ fontsize=9, fontweight="bold", color="#E6EDF3",
342
+ ha="left", va="center", fontfamily="monospace")
343
+
344
+ # Params
345
+ if params > 0:
346
+ ax.text(0.97, y, f"{params:,}", transform=ax.transAxes,
347
+ fontsize=8, color=color, ha="right", va="center",
348
+ fontfamily="monospace", fontweight="bold")
349
+
350
+ # Title
351
+ ax.text(0.5, 0.96, "Single Transformer Layer — Detailed View",
352
+ transform=ax.transAxes, fontsize=18, fontweight="bold",
353
+ color="#E6EDF3", ha="center", fontfamily="monospace")
354
+ ax.text(0.5, 0.935,
355
+ f"Parameters per layer: {layer_total:,} β€’ Γ—{N_LAYERS} layers = {all_layers_total:,} total",
356
+ transform=ax.transAxes, fontsize=9, color="#8B949E",
357
+ ha="center", fontfamily="monospace")
358
+
359
+ ax.set_xlim(0, 1)
360
+ ax.set_ylim(0, 1)
361
+ ax.axis("off")
362
+
363
+ plt.savefig(save_path, dpi=200, bbox_inches="tight",
364
+ facecolor="#0D1117", edgecolor="none")
365
+ print(f"Saved: {save_path}")
366
+ plt.close()
367
+
368
+
369
+ if __name__ == "__main__":
370
+ import os
371
+ os.makedirs("viz", exist_ok=True)
372
+
+    print("=" * 50)
+    print(" GPT-300M Parameter Verification")
+    print("=" * 50)
+    print(f" Token Embedding:     {embed_params:>13,}")
+    print(f" Per-layer Attention: {attn_total:>13,}")
+    print(f" Per-layer FFN:       {ffn_total:>13,}")
+    print(f" Per-layer Norm:      {rmsnorm_params:>13,}")
+    print(f" Per-layer Total:     {layer_total:>13,}")
+    print(f" All {N_LAYERS} layers:        {all_layers_total:>13,}")
+    print(f" Final Norm:          {final_norm_params:>13,}")
+    print(f" LM Head (tied):      {'0 (shared)':>13}")
+    print(" " + "─" * 33)
+    print(f" TOTAL:               {TOTAL_PARAMS:>13,}")
+    print(f" ≈ {TOTAL_PARAMS / 1e6:.1f}M parameters")
+    print("=" * 50)
+
+    print("\nGenerating full network diagram...")
+    draw_neural_network("viz/neural_network_full.png")
+
+    print("Generating single-layer detail...")
+    draw_single_layer_detail("viz/neural_network_layer.png")
+
+    print("\nDone!")
visualize_nn.py ADDED
@@ -0,0 +1,472 @@
+"""
+GPT-300M Neural Network Visualizer
+====================================
+Generates detailed architectural diagrams of the GPT-300M model
+using matplotlib, showing:
+- Full model architecture flow
+- Detailed transformer block internals
+- Attention head visualization
+- Parameter distribution charts
+
+Usage:
+    python visualize_nn.py
+    python visualize_nn.py --output ./my_viz_dir
+"""
+
+import argparse
+import matplotlib
+matplotlib.use("Agg")
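+# ("Agg" is a non-interactive backend; selecting it before pyplot is
+# imported lets the script run headless, e.g. on a remote server.)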
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
+import numpy as np
+
+from config import GPT300MConfig, gpt_300m
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# COLOR SCHEME
+# ═══════════════════════════════════════════════════════════════════════
+
+COLORS = {
+    "bg": "#0D1117",
+    "text": "#E6EDF3",
+    "text_dim": "#8B949E",
+    "embed": "#58A6FF",      # Blue
+    "attn": "#F78166",       # Orange
+    "ffn": "#7EE787",        # Green
+    "norm": "#D2A8FF",       # Purple
+    "residual": "#FFA657",   # Yellow-orange
+    "output": "#FF7B72",     # Red
+    "arrow": "#484F58",
+    "highlight": "#1F6FEB",
+    "border": "#30363D",
+    "card_bg": "#161B22",
+    "accent1": "#79C0FF",
+    "accent2": "#BB9AF7",
+}
+
+
+def draw_rounded_box(ax, x, y, w, h, color, label, fontsize=10,
+                     text_color=None, alpha=0.9, sublabel=None):
+    """Draw a rounded rectangle with label."""
+    box = FancyBboxPatch(
+        (x - w/2, y - h/2), w, h,
+        boxstyle="round,pad=0.1",
+        facecolor=color,
+        edgecolor="white",
+        linewidth=0.5,
+        alpha=alpha,
+        zorder=3,
+    )
+    ax.add_patch(box)
+    ax.text(
+        x, y + (0.15 if sublabel else 0),
+        label,
+        ha="center", va="center",
+        fontsize=fontsize,
+        fontweight="bold",
+        color=text_color or COLORS["text"],
+        zorder=4,
+    )
+    if sublabel:
+        ax.text(
+            x, y - 0.25,
+            sublabel,
+            ha="center", va="center",
+            fontsize=fontsize - 2,
+            color=COLORS["text_dim"],
+            zorder=4,
+        )
+
+
+def draw_arrow(ax, x1, y1, x2, y2, color=None):
+    """Draw an arrow between two points."""
+    ax.annotate(
+        "",
+        xy=(x2, y2), xytext=(x1, y1),
+        arrowprops=dict(
+            arrowstyle="->",
+            color=color or COLORS["arrow"],
+            lw=1.5,
+            connectionstyle="arc3,rad=0",
+        ),
+        zorder=2,
+    )
+
+
+def draw_residual_connection(ax, x_start, y_start, x_end, y_end, offset=1.8):
+    """Draw a residual/skip connection arc."""
+    ax.annotate(
+        "",
+        xy=(x_end, y_end), xytext=(x_start, y_start),
+        arrowprops=dict(
+            arrowstyle="->",
+            color=COLORS["residual"],
+            lw=1.2,
+            linestyle="--",
+            connectionstyle="arc3,rad=0.3",
+        ),
+        zorder=1,
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# FULL ARCHITECTURE DIAGRAM
+# ═══════════════════════════════════════════════════════════════════════
+
+def draw_full_architecture(config: GPT300MConfig, save_path: str = None):
+    """Draw the complete GPT-300M architecture."""
+    fig, ax = plt.subplots(1, 1, figsize=(14, 24), facecolor=COLORS["bg"])
+    ax.set_facecolor(COLORS["bg"])
+    ax.set_xlim(-4, 4)
+    ax.set_ylim(-1, 22)
+    ax.axis("off")
+
+    # Title
+    ax.text(0, 21.5, "GPT-300M Architecture", ha="center", va="center",
+            fontsize=22, fontweight="bold", color=COLORS["text"],
+            fontfamily="monospace")
+    ax.text(0, 21.0,
+            f"{config.total_params_estimate:,} parameters • "
+            f"{config.n_layers} layers • "
+            f"{config.n_heads} heads • "
+            f"d={config.d_model}",
+            ha="center", va="center", fontsize=10, color=COLORS["text_dim"],
+            fontfamily="monospace")
+
+    y = 19.5  # Starting y position
+
+    # ── Input ──────────────────────────────────────────────────────
+    draw_rounded_box(ax, 0, y, 3.5, 0.7, COLORS["card_bg"], "Input Token IDs",
+                     sublabel="[batch, seq_len]", fontsize=11)
+    y -= 1.1
+    draw_arrow(ax, 0, y + 0.8, 0, y + 0.4)
+
+    # ── Token Embedding ────────────────────────────────────────────
+    draw_rounded_box(ax, 0, y, 3.5, 0.7, COLORS["embed"],
+                     "Token Embedding", text_color="#000",
+                     sublabel=f"{config.vocab_size:,} × {config.d_model}")
+    y -= 1.1
+    draw_arrow(ax, 0, y + 0.8, 0, y + 0.4)
+
+    # ── RoPE ───────────────────────────────────────────────────────
+    draw_rounded_box(ax, 0, y, 3.5, 0.6, COLORS["accent2"],
+                     "Rotary Position Embeddings (RoPE)",
+                     text_color="#000", fontsize=9,
+                     sublabel=f"θ = {config.rope_theta:.0f}")
+    y -= 1.0
+    draw_arrow(ax, 0, y + 0.7, 0, y + 0.4)
+
+    # ── Dropout ────────────────────────────────────────────────────
+    draw_rounded_box(ax, 0, y, 2.5, 0.5, COLORS["border"],
+                     f"Dropout (p={config.dropout})", fontsize=9)
+    y -= 1.0
+    draw_arrow(ax, 0, y + 0.7, 0, y + 0.35)
+
+    # ── Transformer Blocks ─────────────────────────────────────────
+    block_height = 3.2
+
+    # Draw detailed first block
+    block_y_start = y
+    block_y_end = y - block_height
+
+    # Block container
+    block_box = FancyBboxPatch(
+        (-3.3, block_y_end - 0.1), 6.6, block_height + 0.2,
+        boxstyle="round,pad=0.15",
+        facecolor=COLORS["card_bg"],
+        edgecolor=COLORS["highlight"],
+        linewidth=1.5,
+        alpha=0.8,
+        zorder=1,
+    )
+    ax.add_patch(block_box)
+    ax.text(-3.0, block_y_start + 0.05,
+            f"Transformer Block × {config.n_layers}",
+            fontsize=10, fontweight="bold", color=COLORS["highlight"],
+            fontfamily="monospace", zorder=5)
+
+    # Inside the block
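+    # (pre-norm ordering: each RMSNorm feeds its sub-block, while the
+    # residual path skips around it)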
+    by = block_y_start - 0.4
+
+    # RMSNorm 1
+    draw_rounded_box(ax, 0, by, 2.8, 0.45, COLORS["norm"],
+                     "RMSNorm", text_color="#000", fontsize=9)
+    by -= 0.7
+    draw_arrow(ax, 0, by + 0.5, 0, by + 0.25)
+
+    # Multi-Head Attention
+    draw_rounded_box(ax, 0, by, 2.8, 0.7, COLORS["attn"],
+                     "Multi-Head Attention", text_color="#000", fontsize=10,
+                     sublabel=f"{config.n_heads} heads × {config.head_dim}d")
+    # Residual connection
+    draw_residual_connection(ax, -1.6, block_y_start - 0.2, -1.6, by)
+    ax.text(-2.5, by + 0.3, "⊕ residual", fontsize=7,
+            color=COLORS["residual"], ha="center")
+
+    by -= 0.8
+    draw_arrow(ax, 0, by + 0.5, 0, by + 0.25)
+
+    # RMSNorm 2
+    draw_rounded_box(ax, 0, by, 2.8, 0.45, COLORS["norm"],
+                     "RMSNorm", text_color="#000", fontsize=9)
+    by -= 0.7
+    draw_arrow(ax, 0, by + 0.5, 0, by + 0.25)
+
+    # Feed-Forward Network
+    draw_rounded_box(ax, 0, by, 2.8, 0.7, COLORS["ffn"],
+                     "Feed-Forward Network", text_color="#000", fontsize=10,
+                     sublabel=f"{config.d_model} → {config.d_ff} → {config.d_model}")
+    # Residual connection
+    draw_residual_connection(ax, 1.6, by + 1.5, 1.6, by)
+    ax.text(2.5, by + 0.7, "⊕ residual", fontsize=7,
+            color=COLORS["residual"], ha="center")
+
+    y = block_y_end - 0.4
+
+    # ── Repeated blocks indicator ──────────────────────────────────
+    draw_arrow(ax, 0, y + 0.2, 0, y - 0.1)
+    ax.text(0, y - 0.3, f"× {config.n_layers} layers", ha="center",
+            fontsize=11, fontweight="bold", color=COLORS["text_dim"],
+            fontfamily="monospace",
+            bbox=dict(boxstyle="round,pad=0.3", facecolor=COLORS["card_bg"],
+                      edgecolor=COLORS["border"]))
+    y -= 0.9
+    draw_arrow(ax, 0, y + 0.3, 0, y + 0.05)
+
+    # ── Final RMSNorm ──────────────────────────────────────────────
+    draw_rounded_box(ax, 0, y - 0.2, 3.5, 0.5, COLORS["norm"],
+                     "Final RMSNorm", text_color="#000", fontsize=10)
+    y -= 1.0
+    draw_arrow(ax, 0, y + 0.5, 0, y + 0.2)
+
+    # ── LM Head ────────────────────────────────────────────────────
+    draw_rounded_box(ax, 0, y - 0.1, 3.5, 0.7, COLORS["output"],
+                     "Linear (LM Head)", text_color="#000", fontsize=11,
+                     sublabel=f"{config.d_model} → {config.vocab_size:,} (weight-tied)")
+    y -= 1.1
+    draw_arrow(ax, 0, y + 0.7, 0, y + 0.35)
+
+    # ── Softmax / Output ───────────────────────────────────────────
+    draw_rounded_box(ax, 0, y, 3.5, 0.6, COLORS["card_bg"],
+                     "Softmax → Next Token Probabilities", fontsize=10,
+                     sublabel=f"[batch, seq_len, {config.vocab_size:,}]")
+
+    plt.tight_layout()
+
+    if save_path:
+        fig.savefig(save_path, dpi=200, bbox_inches="tight",
+                    facecolor=COLORS["bg"], edgecolor="none")
+        print(f"Saved architecture diagram: {save_path}")
+
+    return fig
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# PARAMETER DISTRIBUTION CHART
+# ═══════════════════════════════════════════════════════════════════════
+
+def draw_parameter_chart(config: GPT300MConfig, save_path: str = None):
+    """Draw a parameter distribution breakdown."""
+    fig, axes = plt.subplots(1, 2, figsize=(16, 7), facecolor=COLORS["bg"])
+
+    # Calculate parameter counts per component
+    emb_params = config.vocab_size * config.d_model
+    attn_params = 4 * config.d_model * config.d_model * config.n_layers
+    ffn_params = 2 * config.d_model * config.d_ff * config.n_layers
+    norm_params = 2 * config.d_model * config.n_layers + config.d_model
+    total = emb_params + attn_params + ffn_params + norm_params
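+    # Attention counts the four bias-free d_model × d_model projections
+    # (Q, K, V, O); the FFN counts its up- and down-projections
+    # (2 · d_model · d_ff); the norm term covers two RMSNorms per layer
+    # plus the final norm. The tied LM head contributes nothing beyond
+    # the embedding counted above.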
+
+    # ── Pie Chart ──────────────────────────────────────────────────
+    ax = axes[0]
+    ax.set_facecolor(COLORS["bg"])
+    labels = ["Token\nEmbedding", "Attention\nLayers", "Feed-Forward\nLayers", "RMSNorm"]
+    sizes = [emb_params, attn_params, ffn_params, norm_params]
+    colors = [COLORS["embed"], COLORS["attn"], COLORS["ffn"], COLORS["norm"]]
+
+    wedges, texts, autotexts = ax.pie(
+        sizes, labels=None, autopct=lambda p: f"{p:.1f}%",
+        colors=colors, startangle=90, pctdistance=0.7,
+        wedgeprops=dict(width=0.5, edgecolor=COLORS["bg"], linewidth=2),
+        textprops=dict(color=COLORS["text"], fontsize=10),
+    )
+    for at in autotexts:
+        at.set_fontweight("bold")
+        at.set_color("#000")
+
+    # Legend
+    legend_labels = [
+        f"{l}\n({s/1e6:.1f}M)" for l, s in zip(
+            ["Token Embedding", "Attention", "Feed-Forward", "RMSNorm"],
+            sizes
+        )
+    ]
+    ax.legend(
+        wedges, legend_labels, loc="center left", bbox_to_anchor=(1.05, 0.5),
+        fontsize=9, frameon=False, labelcolor=COLORS["text"],
+    )
+    ax.set_title("Parameter Distribution", fontsize=14, fontweight="bold",
+                 color=COLORS["text"], pad=15)
+
+    # ── Per-Layer Breakdown Bar Chart ──────────────────────────────
+    ax = axes[1]
+    ax.set_facecolor(COLORS["bg"])
+
+    layer_attn = 4 * config.d_model * config.d_model
+    layer_ffn = 2 * config.d_model * config.d_ff
+    layer_norm = 2 * config.d_model
+
+    layers = range(1, config.n_layers + 1)
+    bar_width = 0.8
+
+    ax.bar(layers, [layer_attn / 1e6] * config.n_layers, bar_width,
+           label="Attention", color=COLORS["attn"], alpha=0.9)
+    ax.bar(layers, [layer_ffn / 1e6] * config.n_layers, bar_width,
+           bottom=[layer_attn / 1e6] * config.n_layers,
+           label="Feed-Forward", color=COLORS["ffn"], alpha=0.9)
+    ax.bar(layers, [layer_norm / 1e6] * config.n_layers, bar_width,
+           bottom=[(layer_attn + layer_ffn) / 1e6] * config.n_layers,
+           label="Norm", color=COLORS["norm"], alpha=0.9)
+
+    ax.set_xlabel("Layer", fontsize=11, color=COLORS["text"])
+    ax.set_ylabel("Parameters (M)", fontsize=11, color=COLORS["text"])
+    ax.set_title("Parameters Per Layer", fontsize=14, fontweight="bold",
+                 color=COLORS["text"], pad=15)
+    ax.legend(fontsize=9, frameon=False, labelcolor=COLORS["text"])
+    ax.tick_params(colors=COLORS["text_dim"])
+    ax.spines["bottom"].set_color(COLORS["border"])
+    ax.spines["left"].set_color(COLORS["border"])
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+
+    # Overall title
+    fig.suptitle(
+        f"GPT-300M • {total:,} Total Parameters",
+        fontsize=16, fontweight="bold", color=COLORS["text"],
+        fontfamily="monospace", y=1.02,
+    )
+
+    plt.tight_layout()
+
+    if save_path:
+        fig.savefig(save_path, dpi=200, bbox_inches="tight",
+                    facecolor=COLORS["bg"], edgecolor="none")
+        print(f"Saved parameter chart: {save_path}")
+
+    return fig
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# ATTENTION HEAD VISUALIZATION
+# ═══════════════════════════════════════════════════════════════════════
+
+def draw_attention_heads(config: GPT300MConfig, save_path: str = None):
+    """Visualize the multi-head attention mechanism."""
+    fig, ax = plt.subplots(1, 1, figsize=(14, 10), facecolor=COLORS["bg"])
+    ax.set_facecolor(COLORS["bg"])
+    ax.set_xlim(-1, 11)
+    ax.set_ylim(-1, 8)
+    ax.axis("off")
+
+    ax.text(5, 7.5, "Multi-Head Self-Attention", ha="center",
+            fontsize=18, fontweight="bold", color=COLORS["text"],
+            fontfamily="monospace")
+    ax.text(5, 7.0,
+            f"{config.n_heads} heads × {config.head_dim}d per head = {config.d_model}d total",
+            ha="center", fontsize=10, color=COLORS["text_dim"])
+
+    # Input
+    draw_rounded_box(ax, 5, 6.2, 4, 0.5, COLORS["embed"],
+                     f"Input: [B, T, {config.d_model}]", text_color="#000", fontsize=9)
+
+    # Q, K, V projections
+    for i, (name, color) in enumerate(zip(["Q", "K", "V"],
+                                          ["#FF6B6B", "#4ECDC4", "#45B7D1"])):
+        x = 2 + i * 3
+        draw_arrow(ax, 5, 5.9, x, 5.4)
+        draw_rounded_box(ax, x, 5.1, 1.8, 0.5, color,
+                         f"W_{name}", text_color="#000", fontsize=10,
+                         sublabel=f"{config.d_model}×{config.d_model}")
+
+    # Heads
+    head_y = 3.8
+    n_show = min(config.n_heads, 8)
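+    # Render at most 8 head boxes; the ellipsis below stands in for the rest.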
+    head_spacing = 9.0 / n_show
+
+    for h in range(n_show):
+        hx = 1 + h * head_spacing
+        # Head box
+        box = FancyBboxPatch(
+            (hx - 0.4, head_y - 0.3), 0.8, 0.6,
+            boxstyle="round,pad=0.05",
+            facecolor=COLORS["attn"],
+            edgecolor="white",
+            linewidth=0.5,
+            alpha=0.8,
+            zorder=3,
+        )
+        ax.add_patch(box)
+        ax.text(hx, head_y, f"H{h+1}", ha="center", va="center",
+                fontsize=8, fontweight="bold", color="#000", zorder=4)
+
+        # Arrows from Q, K, V to heads
+        for qx in [2, 5, 8]:
+            ax.annotate("", xy=(hx, head_y + 0.3), xytext=(qx, 4.8),
+                        arrowprops=dict(arrowstyle="-", color=COLORS["arrow"],
+                                        lw=0.3, alpha=0.3), zorder=1)
+
+    if config.n_heads > 8:
+        ax.text(5, head_y - 0.6, f"... ({config.n_heads} heads total)",
+                ha="center", fontsize=9, color=COLORS["text_dim"])
+
+    # Attention computation
+    draw_rounded_box(ax, 5, 2.5, 6, 0.6, COLORS["card_bg"],
+                     "Scaled Dot-Product: softmax(QK^T / √d_k) × V",
+                     fontsize=10)
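+    # With head_dim = 64, the scale is 1/√64 = 1/8 per attention score.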
+    for h in range(n_show):
+        hx = 1 + h * head_spacing
+        draw_arrow(ax, hx, head_y - 0.3, 5, 2.85)
+
+    # Concatenate
+    draw_arrow(ax, 5, 2.15, 5, 1.75)
+    draw_rounded_box(ax, 5, 1.5, 4, 0.5, COLORS["accent1"],
+                     "Concat → W_O projection", text_color="#000", fontsize=10)
+
+    # Output
+    draw_arrow(ax, 5, 1.2, 5, 0.8)
+    draw_rounded_box(ax, 5, 0.5, 4, 0.5, COLORS["ffn"],
+                     f"Output: [B, T, {config.d_model}]", text_color="#000", fontsize=9)
+
+    plt.tight_layout()
+
+    if save_path:
+        fig.savefig(save_path, dpi=200, bbox_inches="tight",
+                    facecolor=COLORS["bg"], edgecolor="none")
+        print(f"Saved attention diagram: {save_path}")
+
+    return fig
+
+
+# ═══════════════════════════════════════════════════════════════════════
+# MAIN
+# ═══════════════════════════════════════════════════════════════════════
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Visualize GPT-300M Architecture")
+    parser.add_argument("--output", type=str, default="./viz",
+                        help="Output directory for images")
+    args = parser.parse_args()
+
+    import os
+    os.makedirs(args.output, exist_ok=True)
+
+    config = gpt_300m()
+    print(f"Generating visualizations for GPT-300M ({config.total_params_estimate:,} params)...")
+
+    draw_full_architecture(config, os.path.join(args.output, "architecture.png"))
+    draw_parameter_chart(config, os.path.join(args.output, "parameters.png"))
+    draw_attention_heads(config, os.path.join(args.output, "attention.png"))
+
+    print("Done! All visualizations saved.")