GPUburnout commited on
Commit
36bc78f
·
1 Parent(s): 514a6e1

feat: add app code, configs, and tokenizers

Browse files
README.md CHANGED
@@ -1,13 +1,21 @@
1
  ---
2
- title: Gpuburnout Models
3
- emoji: 📈
4
- colorFrom: purple
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
- pinned: false
10
- short_description: Compare language models trained from scratch
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GPUburnout Models
3
+ emoji: 🔥
4
+ colorFrom: gray
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.12.0
8
  app_file: app.py
9
+ pinned: true
10
+ license: mit
11
  ---
12
 
13
+ # GPUburnout Models Interactive Demo
14
+
15
+ Compare language models trained from scratch across two seasons:
16
+
17
+ - **Tiny Shakespeare** (3.2M params) — Character-level, trained on Shakespeare
18
+ - **GPT-2 Small** (134M params) — BPE tokenizer, trained on 2.8B tokens
19
+ - **Llama 1B** (1.04B params) — Llama architecture, trained on 30B tokens for $175
20
+
21
+ Built by [Jun Park](https://gpuburnout.com/about/) | [Read the blog](https://gpuburnout.com) | [GitHub](https://github.com/GPUburnout)
app.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPUburnout Models — Unified Demo
3
+ Compare models trained from scratch: Tiny (3.2M) → GPT-2 (134M) → Llama (1B)
4
+ """
5
+
6
+ import gc
7
+ import json
8
+ import os
9
+ import sys
10
+
11
+ import gradio as gr
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+ # Add models directory to path
16
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "models"))
17
+
18
# ── Model Registry ──────────────────────────────────────────────────────────

# Registry of all demo models, keyed by the display name shown in the UI.
# Each entry carries:
#   path        — checkpoint directory holding config.json + pytorch_model.bin
#   arch        — loader family: "s1" (GPT-2 style) or "s2" (Llama style)
#   description — short blurb rendered in the status panel
#   examples    — per-model prompt suggestions surfaced below the prompt box
MODELS = {
    "Tiny Shakespeare (3.2M)": {
        "path": "checkpoints/tiny",
        "arch": "s1",
        "description": "Character-level model trained on Shakespeare. The very first step.",
        "examples": ["ROMEO:", "JULIET:", "To be, or not to be", "First Citizen:"],
    },
    "GPT-2 Small (134M)": {
        "path": "checkpoints/gpt2_small",
        "arch": "s1",
        "description": "Season 1 final model. BPE tokenizer, 2.8B tokens, 12 layers.",
        "examples": [
            "The capital of France is",
            "Explain machine learning in simple terms.",
            "def fibonacci(n):",
            "The meaning of life is",
        ],
    },
    "Llama 1B (1.04B)": {
        "path": "checkpoints/llama_1b",
        "arch": "s2",
        "description": "Season 2. Llama architecture, 30B tokens, $175 total. Final loss 2.494.",
        "examples": [
            "The capital of France is",
            "In a shocking discovery, scientists found that",
            "def fibonacci(n):",
            "Once upon a time, in a land far away,",
        ],
    },
}
# ── Current model state (one at a time) ─────────────────────────────────────

# Only one model is ever resident; this dict tracks it along with its
# tokenizer and parsed config.
current = {"name": None, "model": None, "tokenizer": None, "config": None}


def unload_current():
    """Drop the resident model (if any) and release its memory."""
    model = current.pop("model", None)
    if model is not None:
        del model
    current["model"] = None
    for key in ("tokenizer", "config", "name"):
        current[key] = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def load_model(model_name):
    """Load a model by name, unloading the previous one first.

    Args:
        model_name: Key into the MODELS registry.

    Returns:
        Tuple of (model, tokenizer, config dict).

    Raises:
        FileNotFoundError: If the checkpoint directory has no config.json.
        KeyError: If model_name is not a registered model.
    """
    # Fast path: the requested model is already resident.
    if current["name"] == model_name and current["model"] is not None:
        return current["model"], current["tokenizer"], current["config"]

    unload_current()

    info = MODELS[model_name]
    model_dir = info["path"]
    config_path = os.path.join(model_dir, "config.json")

    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Model not found: {model_dir}")

    # Explicit encoding: configs may contain non-ASCII text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    # Dispatch to the loader family recorded in the registry.
    if info["arch"] == "s1":
        model, tokenizer = _load_s1(model_dir, config)
    else:
        model, tokenizer = _load_s2(model_dir, config)

    current["name"] = model_name
    current["model"] = model
    current["tokenizer"] = tokenizer
    current["config"] = config
    return model, tokenizer, config
def _load_s1(model_dir, config):
    """Build and restore a Season 1 (GPT-2 style) model plus its tokenizer."""
    from s1_model import TransformerLanguageModel

    # Dropout is forced to 0.0: this path is inference-only.
    model = TransformerLanguageModel(
        vocab_size=config["vocab_size"],
        embed_dim=config["embed_dim"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        ff_dim=config["ff_dim"],
        max_seq_len=config["max_seq_len"],
        dropout=0.0,
    )
    state = torch.load(
        os.path.join(model_dir, "pytorch_model.bin"), map_location="cpu"
    )
    model.load_state_dict(state)
    model.eval()

    # Pick the tokenizer implementation recorded in the config
    # (character-level unless explicitly marked "bpe").
    tokenizer_path = os.path.join(model_dir, "tokenizer.json")
    if config.get("tokenizer_type", "character") == "bpe":
        from s1_tokenizer_bpe import BPETokenizer as TokenizerClass
    else:
        from s1_tokenizer_char import CharacterTokenizer as TokenizerClass
    tokenizer = TokenizerClass()
    tokenizer.load(tokenizer_path)

    return model, tokenizer
def _load_s2(model_dir, config):
    """Build and restore the Season 2 (Llama style) model plus its tokenizer."""
    from s2_model import LlamaModel, ModelConfig

    # Fall back to the 1B-model hyperparameters when a key is absent.
    defaults = {
        "vocab_size": 32005,
        "d_model": 2048,
        "n_layers": 16,
        "n_heads": 32,
        "n_kv_heads": 8,
        "d_ff": 8192,
        "max_seq_len": 2048,
    }
    model_config = ModelConfig(
        **{key: config.get(key, fallback) for key, fallback in defaults.items()}
    )

    model = LlamaModel(model_config).to("cpu")
    state_dict = torch.load(
        os.path.join(model_dir, "pytorch_model.bin"),
        map_location="cpu",
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    # Season 2 ships a HuggingFace `tokenizers`-format tokenizer file.
    from tokenizers import Tokenizer
    tokenizer = Tokenizer.from_file("tokenizer/bpe_tokenizer.json")

    return model, tokenizer
# ── Generation ──────────────────────────────────────────────────────────────

def generate_s1(model, tokenizer, config, prompt, max_tokens, temperature, top_k):
    """Autoregressively sample text from an S1 (GPT-2 style) model.

    Args:
        model: Callable returning logits of shape (batch, seq, vocab).
        tokenizer: Object with encode(str) -> list[int] and
            decode(list[int]) -> str.
        config: Model config dict; "max_seq_len" bounds the context window.
        prompt: Seed text to continue.
        max_tokens: Number of new tokens to sample.
        temperature: Softmax temperature; clamped to a small positive value
            so a zero/negative input cannot divide by zero or flip logits.
        top_k: Keep only the k highest-probability tokens (0 disables).

    Returns:
        Decoded prompt plus continuation, or an error message if the
        prompt could not be encoded.
    """
    tokens = tokenizer.encode(prompt)
    if not tokens:
        return "Could not encode prompt."
    # Guard: temperature <= 0 would otherwise divide by zero below.
    temperature = max(float(temperature), 1e-6)
    tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
    max_seq_len = config.get("max_seq_len", 256)

    with torch.no_grad():
        for _ in range(max_tokens):
            # Crop the context to the model's trained window.
            inp = tokens[:, -max_seq_len:] if tokens.size(1) > max_seq_len else tokens
            logits = model(inp)[:, -1, :] / temperature
            if top_k > 0:
                # Mask out everything below the k-th best logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = float("-inf")
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            tokens = torch.cat([tokens, next_token], dim=1)

    return tokenizer.decode(tokens[0].tolist())
def generate_s2(model, tokenizer, prompt, max_tokens, temperature, top_k):
    """Generate a continuation using the S2 (Llama) model's built-in sampler."""
    ids = tokenizer.encode(prompt).ids
    input_ids = torch.tensor([ids], dtype=torch.long)

    # The S2 model samples internally; top_k == 0 means "disabled" (None).
    effective_top_k = top_k if top_k > 0 else None
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_k=effective_top_k,
        )

    return tokenizer.decode(output_ids[0].tolist())
def generate_text(model_name, prompt, max_tokens, temperature, top_k):
    """Dispatch a generation request to the right backend for the model."""
    if not prompt.strip():
        return "Please enter a prompt."

    try:
        model, tokenizer, config = load_model(model_name)
    except FileNotFoundError as e:
        return f"Error: {e}"

    # Sliders deliver floats; the samplers expect integer counts.
    max_tokens, top_k = int(max_tokens), int(top_k)
    if MODELS[model_name]["arch"] == "s1":
        return generate_s1(model, tokenizer, config, prompt, max_tokens, temperature, top_k)
    return generate_s2(model, tokenizer, prompt, max_tokens, temperature, top_k)
def get_status(model_name):
    """Build the markdown status line shown under the model dropdown."""
    description = MODELS[model_name]["description"]
    if current["name"] == model_name:
        loaded = "Loaded"
    else:
        loaded = "Not loaded (will load on generate)"
    return f"**{model_name}** — {description}\n\nStatus: {loaded}"
def update_examples(model_name):
    """Swap the example-prompt dataset to match the newly selected model."""
    wrapped = [[prompt_text] for prompt_text in MODELS[model_name]["examples"]]
    return gr.update(samples=wrapped)
# ── Custom CSS ──────────────────────────────────────────────────────────────

# Injected via gr.Blocks(css=...). Styles the centered page container, the
# monospace cyan header (with amber links), and the bordered model-info panel
# used by the status markdown.
CUSTOM_CSS = """
.gradio-container {
    max-width: 900px !important;
    margin: auto;
}
.header-text {
    text-align: center;
    margin-bottom: 0.5em;
}
.header-text h1 {
    color: #22d3ee;
    font-family: 'Courier New', monospace;
}
.header-text a {
    color: #f59e0b;
}
.model-info {
    font-family: 'Courier New', monospace;
    font-size: 0.85em;
    padding: 10px;
    border-radius: 8px;
    background: rgba(34, 211, 238, 0.05);
    border: 1px solid rgba(34, 211, 238, 0.15);
}
"""
# ── Gradio UI ───────────────────────────────────────────────────────────────

with gr.Blocks(
    title="GPUburnout Models",
    theme=gr.themes.Base(
        primary_hue="cyan",
        neutral_hue="gray",
        font=gr.themes.GoogleFont("JetBrains Mono"),
    ),
    css=CUSTOM_CSS,
) as demo:

    # Page header with external links (raw HTML styled by CUSTOM_CSS).
    gr.HTML("""
    <div class="header-text">
        <h1>GPUburnout Models</h1>
        <p>Compare language models I trained from scratch — from 3.2M to 1 billion parameters.</p>
        <p>
            <a href="https://gpuburnout.com" target="_blank">Read the blog</a> ·
            <a href="https://github.com/GPUburnout" target="_blank">GitHub</a> ·
            <a href="https://gpuburnout.com/about/" target="_blank">About</a>
        </p>
    </div>
    """)

    with gr.Row():
        # Left column: model selection, prompt, and sampling controls.
        with gr.Column(scale=1):
            model_selector = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Llama 1B (1.04B)",
                label="Select Model",
            )

            # Status panel filled by get_status() on load and on change.
            model_status = gr.Markdown(elem_classes=["model-info"])

            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Type something...",
                lines=2,
                value="The capital of France is",
            )

            with gr.Row():
                max_tokens = gr.Slider(50, 300, value=100, step=25, label="Max tokens")
                temperature = gr.Slider(0.1, 1.5, value=0.8, step=0.1, label="Temperature")

            top_k = gr.Slider(1, 100, value=50, step=1, label="Top-K")

            generate_btn = gr.Button("Generate", variant="primary", size="lg")

        # Right column: generated text plus clickable example prompts.
        with gr.Column(scale=1):
            output = gr.Textbox(label="Output", lines=15, show_copy_button=True)

            # Initial examples; replaced per-model by update_examples().
            examples = gr.Examples(
                examples=[["The capital of France is"], ["def fibonacci(n):"]],
                inputs=prompt,
                label="Example prompts",
            )

    # Events
    # Populate the status panel on page load and whenever the model changes;
    # also refresh the example prompts on model change.
    demo.load(get_status, inputs=model_selector, outputs=model_status)
    model_selector.change(get_status, inputs=model_selector, outputs=model_status)
    model_selector.change(update_examples, inputs=model_selector, outputs=examples.dataset)

    # Both the button and Enter in the prompt box trigger generation.
    generate_btn.click(
        generate_text,
        inputs=[model_selector, prompt, max_tokens, temperature, top_k],
        outputs=output,
    )
    prompt.submit(
        generate_text,
        inputs=[model_selector, prompt, max_tokens, temperature, top_k],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
checkpoints/gpt2_small/NOTE.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ This model is from checkpoint_epoch_7 (not the final model).
2
+
3
+ Training was still in progress - this represents ~70% through training.
4
+ Final model would be checkpoint_epoch_10.
checkpoints/gpt2_small/config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 32000,
3
+ "embed_dim": 768,
4
+ "num_heads": 12,
5
+ "num_layers": 12,
6
+ "ff_dim": 3072,
7
+ "max_seq_len": 512,
8
+ "dropout": 0.1,
9
+ "model_type": "TransformerLanguageModel",
10
+ "architecture": "gpt2_small",
11
+ "total_parameters": 134601216,
12
+ "tokenizer_type": "bpe"
13
+ }
checkpoints/gpt2_small/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/llama_1b/config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_size": "1B",
3
+ "vocab_size": 32005,
4
+ "d_model": 2048,
5
+ "n_layers": 16,
6
+ "n_heads": 32,
7
+ "n_kv_heads": 8,
8
+ "d_ff": 8192,
9
+ "max_seq_len": 2048,
10
+ "total_parameters": 1040000000,
11
+ "tokenizer_type": "bpe"
12
+ }
checkpoints/llama_1b/metadata.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 90000,
3
+ "loss": 2.494209110736847,
4
+ "tokens_processed": 11796480000,
5
+ "best_val_loss": 2.539955945014954,
6
+ "phase_complete": true,
7
+ "source_file": "milestone_step_00090000.pt",
8
+ "export_date": "2026-03-03 02:24:18",
9
+ "model_weights_file": "pytorch_model.bin",
10
+ "model_weights_gb": 4.15,
11
+ "optimizer_state_file": "optimizer_state.bin",
12
+ "optimizer_state_gb": 8.31,
13
+ "original_checkpoint_gb": 12.46
14
+ }
checkpoints/tiny/config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 65,
3
+ "embed_dim": 256,
4
+ "num_heads": 4,
5
+ "num_layers": 4,
6
+ "ff_dim": 1024,
7
+ "max_seq_len": 256,
8
+ "dropout": 0.1,
9
+ "total_parameters": 3258368,
10
+ "tokenizer_type": "character",
11
+ "model_name": "tiny_shakespeare",
12
+ "description": "Phase 1 model trained on Shakespeare text (character-level)"
13
+ }
checkpoints/tiny/tokenizer.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "character",
3
+ "vocab_size": 65,
4
+ "char_to_idx": {
5
+ "\n": 0,
6
+ " ": 1,
7
+ "!": 2,
8
+ "$": 3,
9
+ "&": 4,
10
+ "'": 5,
11
+ ",": 6,
12
+ "-": 7,
13
+ ".": 8,
14
+ "3": 9,
15
+ ":": 10,
16
+ ";": 11,
17
+ "?": 12,
18
+ "A": 13,
19
+ "B": 14,
20
+ "C": 15,
21
+ "D": 16,
22
+ "E": 17,
23
+ "F": 18,
24
+ "G": 19,
25
+ "H": 20,
26
+ "I": 21,
27
+ "J": 22,
28
+ "K": 23,
29
+ "L": 24,
30
+ "M": 25,
31
+ "N": 26,
32
+ "O": 27,
33
+ "P": 28,
34
+ "Q": 29,
35
+ "R": 30,
36
+ "S": 31,
37
+ "T": 32,
38
+ "U": 33,
39
+ "V": 34,
40
+ "W": 35,
41
+ "X": 36,
42
+ "Y": 37,
43
+ "Z": 38,
44
+ "a": 39,
45
+ "b": 40,
46
+ "c": 41,
47
+ "d": 42,
48
+ "e": 43,
49
+ "f": 44,
50
+ "g": 45,
51
+ "h": 46,
52
+ "i": 47,
53
+ "j": 48,
54
+ "k": 49,
55
+ "l": 50,
56
+ "m": 51,
57
+ "n": 52,
58
+ "o": 53,
59
+ "p": 54,
60
+ "q": 55,
61
+ "r": 56,
62
+ "s": 57,
63
+ "t": 58,
64
+ "u": 59,
65
+ "v": 60,
66
+ "w": 61,
67
+ "x": 62,
68
+ "y": 63,
69
+ "z": 64
70
+ },
71
+ "idx_to_char": {
72
+ "0": "\n",
73
+ "1": " ",
74
+ "2": "!",
75
+ "3": "$",
76
+ "4": "&",
77
+ "5": "'",
78
+ "6": ",",
79
+ "7": "-",
80
+ "8": ".",
81
+ "9": "3",
82
+ "10": ":",
83
+ "11": ";",
84
+ "12": "?",
85
+ "13": "A",
86
+ "14": "B",
87
+ "15": "C",
88
+ "16": "D",
89
+ "17": "E",
90
+ "18": "F",
91
+ "19": "G",
92
+ "20": "H",
93
+ "21": "I",
94
+ "22": "J",
95
+ "23": "K",
96
+ "24": "L",
97
+ "25": "M",
98
+ "26": "N",
99
+ "27": "O",
100
+ "28": "P",
101
+ "29": "Q",
102
+ "30": "R",
103
+ "31": "S",
104
+ "32": "T",
105
+ "33": "U",
106
+ "34": "V",
107
+ "35": "W",
108
+ "36": "X",
109
+ "37": "Y",
110
+ "38": "Z",
111
+ "39": "a",
112
+ "40": "b",
113
+ "41": "c",
114
+ "42": "d",
115
+ "43": "e",
116
+ "44": "f",
117
+ "45": "g",
118
+ "46": "h",
119
+ "47": "i",
120
+ "48": "j",
121
+ "49": "k",
122
+ "50": "l",
123
+ "51": "m",
124
+ "52": "n",
125
+ "53": "o",
126
+ "54": "p",
127
+ "55": "q",
128
+ "56": "r",
129
+ "57": "s",
130
+ "58": "t",
131
+ "59": "u",
132
+ "60": "v",
133
+ "61": "w",
134
+ "62": "x",
135
+ "63": "y",
136
+ "64": "z"
137
+ }
138
+ }
models/__init__.py ADDED
File without changes
models/s1_model.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transformer Language Model Architecture
3
+ Modern architecture (GPT-style) scalable from tiny to large
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import json
10
+ import os
11
+ import math
12
+
13
+
14
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention mechanism with Flash Attention support"""

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.dropout_p = dropout

        # Q, K, V projections fused into a single linear layer
        self.qkv = nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        # Check if Flash Attention is available (PyTorch 2.0+)
        self.use_flash = hasattr(F, 'scaled_dot_product_attention')

        # Fallback dropout for non-flash path
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # x: (batch, seq, embed_dim). NOTE: the flash path below always applies
        # a causal mask (is_causal=True) and ignores `mask`; `mask` is only
        # honored on the manual fallback path.
        batch_size, seq_len, embed_dim = x.shape

        # Compute Q, K, V in one matmul, then split per head
        qkv = self.qkv(x)  # (batch, seq, 3*embed_dim)
        qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, batch, heads, seq, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        if self.use_flash:
            # Use PyTorch's scaled_dot_product_attention (Flash Attention when available)
            # This is 1.5-2x faster and more memory efficient
            # Dropout only during training, matching the fallback path.
            dropout_p = self.dropout_p if self.training else 0.0
            out = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,  # We use is_causal instead
                dropout_p=dropout_p,
                is_causal=True  # Causal mask for autoregressive generation
            )
        else:
            # Fallback to manual attention for older PyTorch versions
            scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

            # Apply causal mask (for autoregressive generation)
            if mask is not None:
                scores = scores.masked_fill(mask == 0, float('-inf'))

            # Attention weights
            attn = F.softmax(scores, dim=-1)
            attn = self.dropout(attn)

            # Apply attention to values
            out = torch.matmul(attn, v)

        # Reshape: (batch, heads, seq, head_dim) -> (batch, seq, embed_dim)
        out = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, embed_dim)

        # Output projection
        out = self.out_proj(out)
        return out
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear → GELU → Dropout → Linear."""

    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ff_dim)
        self.fc2 = nn.Linear(ff_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Expand to ff_dim, apply the non-linearity, then project back.
        hidden = self.dropout(F.gelu(self.fc1(x)))
        return self.fc2(hidden)
class TransformerBlock(nn.Module):
    """One pre-norm Transformer layer: self-attention then feed-forward."""

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.feed_forward = FeedForward(embed_dim, ff_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Pre-norm residual: x + Dropout(Attention(LN(x)))
        x = x + self.dropout(self.attention(self.norm1(x), mask))
        # Pre-norm residual: x + Dropout(FeedForward(LN(x)))
        return x + self.dropout(self.feed_forward(self.norm2(x)))
class TransformerLanguageModel(nn.Module):
    """
    GPT-style Transformer Language Model
    Scalable from tiny (CPU) to large (GPU cluster)
    """

    def __init__(self, vocab_size, embed_dim=256, num_heads=4, num_layers=4,
                 ff_dim=None, max_seq_len=256, dropout=0.1):
        """
        Initialize Transformer model

        Args:
            vocab_size: Number of tokens in vocabulary
            embed_dim: Embedding dimension (must be divisible by num_heads)
            num_heads: Number of attention heads
            num_layers: Number of Transformer blocks
            ff_dim: Feed-forward dimension (default: 4 * embed_dim)
            max_seq_len: Maximum sequence length
            dropout: Dropout probability
        """
        super().__init__()

        if ff_dim is None:
            ff_dim = 4 * embed_dim

        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.ff_dim = ff_dim
        self.max_seq_len = max_seq_len
        self.dropout = dropout  # stored as a float for get_config()

        # Token embeddings
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)

        # Positional embeddings (learned)
        self.positional_embedding = nn.Embedding(max_seq_len, embed_dim)

        # Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])

        # Final layer norm
        self.ln_f = nn.LayerNorm(embed_dim)

        # Output projection (independent of token_embedding — no weight tying)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)

        # Dropout applied to the combined embeddings in forward()
        self.dropout_layer = nn.Dropout(dropout)

        # Initialize weights
        self._init_weights()

        # Create causal mask. NOTE(review): register_buffer is persistent by
        # default, so the mask is stored in state_dict/checkpoints.
        self.register_buffer("causal_mask", self._create_causal_mask(max_seq_len))

    def _init_weights(self):
        """Initialize weights (normal(0, 0.02) for linear/embedding, zero bias)"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    torch.nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _create_causal_mask(self, seq_len):
        """Create causal mask for autoregressive generation"""
        # Lower-triangular ones: position i may attend to positions <= i.
        mask = torch.tril(torch.ones(seq_len, seq_len))
        mask = mask.view(1, 1, seq_len, seq_len)
        return mask

    def forward(self, x):
        """
        Forward pass

        Args:
            x: Input tensor of shape (batch_size, seq_len)

        Returns:
            logits: Output logits of shape (batch_size, seq_len, vocab_size)
        """
        batch_size, seq_len = x.shape
        device = x.device

        # Token embeddings
        token_emb = self.token_embedding(x)  # (batch, seq_len, embed_dim)

        # Positional embeddings
        positions = torch.arange(seq_len, device=device).unsqueeze(0)
        pos_emb = self.positional_embedding(positions)  # (1, seq_len, embed_dim)

        # Combine embeddings (pos_emb broadcasts over the batch dimension)
        x = self.dropout_layer(token_emb + pos_emb)

        # Get causal mask for this sequence length (only used by the
        # non-flash attention fallback inside the blocks)
        mask = self.causal_mask[:, :, :seq_len, :seq_len]

        # Apply Transformer blocks
        for block in self.blocks:
            x = block(x, mask)

        # Final layer norm
        x = self.ln_f(x)

        # Output logits
        logits = self.head(x)  # (batch, seq_len, vocab_size)

        return logits

    def count_parameters(self):
        """Count trainable parameters"""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def get_config(self):
        """Get model configuration"""
        return {
            'model_type': 'Transformer',
            'architecture': 'GPT-style (decoder-only)',
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'num_layers': self.num_layers,
            'ff_dim': self.ff_dim,
            'max_seq_len': self.max_seq_len,
            'dropout': self.dropout,
            'total_parameters': self.count_parameters()
        }

    def save_config(self, filepath='models/model_config.json'):
        """Save model configuration as JSON; returns the path written."""
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        config = self.get_config()
        with open(filepath, 'w') as f:
            json.dump(config, f, indent=2)

        print(f"Model config saved to: {filepath}")
        return filepath
def create_tiny_transformer(vocab_size):
    """Create a tiny Transformer (fastest on CPU)"""
    preset = dict(embed_dim=128, num_heads=4, num_layers=2,
                  max_seq_len=128, dropout=0.1)
    return TransformerLanguageModel(vocab_size=vocab_size, **preset)
def create_small_transformer(vocab_size):
    """Create a small Transformer (recommended for first run)"""
    preset = dict(embed_dim=256, num_heads=4, num_layers=4,
                  max_seq_len=256, dropout=0.1)
    return TransformerLanguageModel(vocab_size=vocab_size, **preset)
def create_medium_transformer(vocab_size):
    """Create a medium Transformer (GPU recommended)"""
    preset = dict(embed_dim=512, num_heads=8, num_layers=6,
                  max_seq_len=512, dropout=0.1)
    return TransformerLanguageModel(vocab_size=vocab_size, **preset)
def create_large_transformer(vocab_size):
    """Create a large Transformer (GPU cluster)"""
    preset = dict(embed_dim=1024, num_heads=16, num_layers=12,
                  max_seq_len=1024, dropout=0.1)
    return TransformerLanguageModel(vocab_size=vocab_size, **preset)
def main():
    """Test model creation"""
    # Self-test / demo entry point: builds the preset model sizes, prints
    # their stats, smoke-tests a forward pass, and saves the config.
    print("\n" + "="*80)
    print("TRANSFORMER MODEL ARCHITECTURE")
    print("="*80)

    # Load tokenizer to get vocab size
    tokenizer_path = 'models/tokenizer.json'
    if not os.path.exists(tokenizer_path):
        print(f"\nError: Tokenizer not found at {tokenizer_path}")
        print("Please run tokenizer.py first.")
        return

    with open(tokenizer_path, 'r') as f:
        tokenizer_data = json.load(f)
    vocab_size = tokenizer_data['vocab_size']

    print(f"\nVocabulary size: {vocab_size}")
    print("Architecture: GPT-style Transformer (decoder-only)")

    # Create models of different sizes
    print("\n" + "-"*80)
    print("TINY TRANSFORMER (fastest on CPU)")
    print("-"*80)
    tiny_model = create_tiny_transformer(vocab_size)
    print(f"Parameters: {tiny_model.count_parameters():,}")
    print(f"Embed dim: {tiny_model.embed_dim}")
    print(f"Attention heads: {tiny_model.num_heads}")
    print(f"Layers: {tiny_model.num_layers}")
    print(f"Context length: {tiny_model.max_seq_len}")

    print("\n" + "-"*80)
    print("SMALL TRANSFORMER (recommended for first run)")
    print("-"*80)
    small_model = create_small_transformer(vocab_size)
    print(f"Parameters: {small_model.count_parameters():,}")
    print(f"Embed dim: {small_model.embed_dim}")
    print(f"Attention heads: {small_model.num_heads}")
    print(f"Layers: {small_model.num_layers}")
    print(f"Context length: {small_model.max_seq_len}")

    print("\n" + "-"*80)
    print("MEDIUM TRANSFORMER (GPU recommended)")
    print("-"*80)
    medium_model = create_medium_transformer(vocab_size)
    print(f"Parameters: {medium_model.count_parameters():,}")
    print(f"Embed dim: {medium_model.embed_dim}")
    print(f"Attention heads: {medium_model.num_heads}")
    print(f"Layers: {medium_model.num_layers}")
    print(f"Context length: {medium_model.max_seq_len}")

    # Use small model for our tiny LM
    print("\n" + "="*80)
    print("SELECTED MODEL: SMALL TRANSFORMER")
    print("="*80)
    print("Good balance for CPU training with modern architecture")
    model = small_model

    # Test forward pass with random token ids to verify the output shape
    print("\nTesting forward pass...")
    batch_size = 4
    seq_len = 32
    dummy_input = torch.randint(0, vocab_size, (batch_size, seq_len))

    with torch.no_grad():
        logits = model(dummy_input)

    print(f"Input shape: {dummy_input.shape}")
    print(f"Output shape: {logits.shape}")
    print(f"Expected: (batch={batch_size}, seq_len={seq_len}, vocab={vocab_size})")
    assert logits.shape == (batch_size, seq_len, vocab_size), "Shape mismatch!"
    print("Forward pass test passed!")

    # Save configuration
    model.save_config()

    print("\n" + "="*80)
    print("MODEL CREATION COMPLETE")
    print("="*80)
    print(f"\nModel ready for training!")
    print(f"Architecture: {model.get_config()['model_type']}")
    print(f"Total parameters: {model.count_parameters():,}")
    print(f"Configuration saved to: models/model_config.json")
    print(f"\nNext step: Implement the training loop")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()
models/s1_tokenizer_bpe.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BPE Tokenizer Wrapper
3
+ =====================
4
+ Wraps HuggingFace `tokenizers` library to provide the same interface
5
+ as CharacterTokenizer. Uses byte-level BPE (GPT-2 style).
6
+
7
+ Requires: pip install tokenizers
8
+ """
9
+
10
+ import json
11
+ import os
12
+
13
+ from tokenizers import Tokenizer
14
+ from tokenizers.models import BPE
15
+ from tokenizers.trainers import BpeTrainer
16
+ from tokenizers.pre_tokenizers import ByteLevel
17
+ from tokenizers.decoders import ByteLevel as ByteLevelDecoder
18
+
19
+
20
class BPETokenizer:
    """Byte-level BPE tokenizer compatible with CharacterTokenizer interface."""

    def __init__(self):
        # Wrapped HuggingFace tokenizer; stays None until trained or loaded.
        self.tokenizer = None
        self._vocab_size = 0

    def build_vocab_from_file(self, filepath, vocab_size=32000,
                              min_frequency=2, chunk_size=None):
        """Train BPE tokenizer on a text file.

        Args:
            filepath: Path to text file
            vocab_size: Target vocabulary size (default: 32000)
            min_frequency: Minimum token frequency (default: 2)
            chunk_size: Unused, kept for interface compatibility

        Returns:
            The final vocabulary size (may differ from the target).
        """
        # Byte-level BPE in the GPT-2 style: pre-tokenize into bytes,
        # decode back through the matching byte-level decoder.
        bpe = Tokenizer(BPE(unk_token="<|unk|>"))
        bpe.pre_tokenizer = ByteLevel(add_prefix_space=False)
        bpe.decoder = ByteLevelDecoder()

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=["<|endoftext|>", "<|pad|>", "<|unk|>"],
            show_progress=True
        )

        file_size = os.path.getsize(filepath)
        print(f"\nTraining BPE tokenizer on: {filepath}")
        print(f"File size: {file_size / (1024**3):.2f} GB")
        print(f"Target vocab size: {vocab_size:,}")
        print(f"Min frequency: {min_frequency}")

        bpe.train(files=[filepath], trainer=trainer)

        self.tokenizer = bpe
        self._vocab_size = bpe.get_vocab_size()

        print(f"\nBPE vocabulary built: {self._vocab_size:,} tokens")
        # Preview the first 20 entries in token-id order.
        by_id = sorted(bpe.get_vocab().items(), key=lambda kv: kv[1])
        sample_str = ', '.join(f"'{tok}'" for tok, _ in by_id[:20])
        print(f"Sample tokens: {sample_str}")

        return self._vocab_size

    def encode(self, text):
        """Encode text to list of token IDs.

        Args:
            text: Input string

        Returns:
            List of integer token IDs

        Raises:
            ValueError: If the tokenizer has not been trained or loaded.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized. "
                             "Call build_vocab_from_file() or load() first.")
        return self.tokenizer.encode(text).ids

    def decode(self, tokens):
        """Decode token IDs back to text.

        Args:
            tokens: List of integer token IDs

        Returns:
            Decoded string

        Raises:
            ValueError: If the tokenizer has not been trained or loaded.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized.")
        return self.tokenizer.decode(tokens)

    @property
    def vocab_size(self):
        """Number of tokens in vocabulary."""
        return self._vocab_size

    def save(self, filepath):
        """Save tokenizer to a JSON file.

        Args:
            filepath: Path to save tokenizer (e.g. 'bpe_tokenizer.json')

        Raises:
            ValueError: If the tokenizer has not been trained or loaded.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not initialized.")

        self.tokenizer.save(filepath)
        print(f"\nBPE tokenizer saved to: {filepath}")

    def load(self, filepath):
        """Load tokenizer from a JSON file.

        Args:
            filepath: Path to tokenizer JSON file

        Returns:
            self, to allow ``BPETokenizer().load(path)`` chaining.

        Raises:
            FileNotFoundError: If *filepath* does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Tokenizer file not found: {filepath}")

        self.tokenizer = Tokenizer.from_file(filepath)
        self._vocab_size = self.tokenizer.get_vocab_size()

        print(f"BPE tokenizer loaded: {self._vocab_size:,} tokens")
        return self
models/s1_tokenizer_char.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tokenizer for Language Model
3
+ Converts text to numbers (tokens) and back
4
+ """
5
+
6
+ import json
7
+ import os
8
+
9
+
10
class CharacterTokenizer:
    """Simple character-level tokenizer for tiny language models.

    Maps each unique character to an integer token ID and back. Unknown
    characters are silently dropped by encode()/decode() (intentional:
    keeps generation robust rather than raising on stray input).
    """

    def __init__(self):
        """Initialize an empty tokenizer (build or load a vocab before use)."""
        self.char_to_idx = {}   # char -> token id
        self.idx_to_char = {}   # token id -> char
        self.vocab_size = 0

    def build_vocab(self, text):
        """Build vocabulary from an in-memory string.

        Returns:
            The vocabulary size (number of unique characters).
        """
        print("\nBuilding character vocabulary...")

        # Get unique characters and sort them for a deterministic mapping
        chars = sorted(set(text))
        self.vocab_size = len(chars)

        # Create mappings
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}

        print(f"Vocabulary size: {self.vocab_size} characters")
        print(f"Characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))

        return self.vocab_size

    def build_vocab_from_file(self, filepath, chunk_size=100*1024*1024):
        """Build vocabulary from a large file using streaming (memory-efficient).

        Args:
            filepath: Path to text file
            chunk_size: Size of chunks to read (default: 100MB)

        Returns:
            The vocabulary size (number of unique characters).
        """
        print(f"\nBuilding character vocabulary from file: {filepath}")
        print(f"Chunk size: {chunk_size / (1024*1024):.0f}MB")

        # Get file size
        file_size = os.path.getsize(filepath)
        file_size_gb = file_size / (1024**3)
        print(f"File size: {file_size_gb:.2f} GB")

        # Collect unique characters by reading file in chunks
        unique_chars = set()
        total_read = 0

        with open(filepath, 'r', encoding='utf-8') as f:
            while chunk := f.read(chunk_size):
                # Add unique characters from this chunk
                unique_chars.update(chunk)
                total_read += len(chunk)

                # Progress update. NOTE: total_read counts characters while
                # file_size is bytes; the /1.5 factor is a rough bytes-per-char
                # heuristic, so the percentage is approximate by design.
                progress_pct = (total_read / (file_size / 1.5)) * 100
                if progress_pct <= 100:
                    print(f"  Progress: {progress_pct:.1f}% | Unique chars found: {len(unique_chars)}", end='\r')

        print()  # New line after progress

        # Sort characters and build mappings (sorted => deterministic ids)
        chars = sorted(unique_chars)
        self.vocab_size = len(chars)

        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}

        print(f"\nVocabulary size: {self.vocab_size} characters")
        print(f"Sample characters: {''.join(chars[:50])}" + ("..." if len(chars) > 50 else ""))

        return self.vocab_size

    def encode(self, text):
        """Convert text to a list of token IDs (unknown chars are dropped)."""
        return [self.char_to_idx[ch] for ch in text if ch in self.char_to_idx]

    def decode(self, tokens):
        """Convert a list of token IDs back to text (unknown IDs are dropped)."""
        return ''.join([self.idx_to_char[idx] for idx in tokens if idx in self.idx_to_char])

    def save(self, filepath='models/tokenizer.json'):
        """Save tokenizer to a JSON file, creating parent directories as needed.

        Returns:
            The filepath the tokenizer was written to.
        """
        # BUGFIX: os.path.dirname() returns '' for a bare filename like
        # 'tokenizer.json', and os.makedirs('') raises FileNotFoundError.
        # Only create the directory when there actually is one.
        dirname = os.path.dirname(filepath)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        tokenizer_data = {
            'type': 'character',
            'vocab_size': self.vocab_size,
            'char_to_idx': self.char_to_idx,
            # JSON object keys must be strings, so int ids are stringified
            # here and converted back in load().
            'idx_to_char': {str(k): v for k, v in self.idx_to_char.items()}
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tokenizer_data, f, indent=2, ensure_ascii=False)

        print(f"\nTokenizer saved to: {filepath}")
        return filepath

    def load(self, filepath='models/tokenizer.json'):
        """Load tokenizer from a JSON file.

        Returns:
            self, to allow ``CharacterTokenizer().load(path)`` chaining.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)

        self.vocab_size = tokenizer_data['vocab_size']
        self.char_to_idx = tokenizer_data['char_to_idx']
        # Restore integer keys that were stringified for JSON
        self.idx_to_char = {int(k): v for k, v in tokenizer_data['idx_to_char'].items()}

        print(f"\nTokenizer loaded from: {filepath}")
        print(f"Vocabulary size: {self.vocab_size}")
        return self

    def get_stats(self):
        """Print tokenizer statistics."""
        print("\n" + "="*80)
        print("TOKENIZER STATISTICS")
        print("="*80)
        print(f"Type: Character-level")
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Sample characters: {list(self.char_to_idx.keys())[:20]}")
        print("="*80)
+
133
+
134
def main():
    """Build a character tokenizer from the Shakespeare dataset, run
    round-trip sanity checks, and save it to models/tokenizer.json."""
    print("\n" + "="*80)
    print("TOKENIZER BUILDER")
    print("="*80)

    # Load dataset; bail out with guidance rather than raising if it is missing
    dataset_file = 'data/tiny_shakespeare.txt'
    if not os.path.exists(dataset_file):
        print(f"\nError: Dataset not found at {dataset_file}")
        print("Please run dataset_loader.py first.")
        return

    print(f"\nLoading text from: {dataset_file}")
    with open(dataset_file, 'r', encoding='utf-8') as f:
        text = f.read()

    print(f"Loaded {len(text):,} characters")

    # Build tokenizer over the full corpus (character-level, so this is cheap)
    tokenizer = CharacterTokenizer()
    tokenizer.build_vocab(text)

    # Test tokenizer
    print("\n" + "="*80)
    print("TESTING TOKENIZER")
    print("="*80)

    test_text = "Hello, World!"
    print(f"\nOriginal text: {test_text}")

    encoded = tokenizer.encode(test_text)
    print(f"Encoded: {encoded}")

    decoded = tokenizer.decode(encoded)
    print(f"Decoded: {decoded}")

    # NOTE: this round-trip only holds if every char of test_text appears in
    # the corpus; encode() silently drops out-of-vocabulary characters.
    if test_text == decoded:
        print("Test passed!")
    else:
        print("Test failed!")

    # Test with Shakespeare sample — drawn from the corpus itself, so the
    # round-trip must be exact; a mismatch indicates a tokenizer bug
    shakespeare_sample = text[:100]
    print(f"\nShakespeare sample: {shakespeare_sample}")
    encoded_sample = tokenizer.encode(shakespeare_sample)
    print(f"Encoded (first 20 tokens): {encoded_sample[:20]}")
    decoded_sample = tokenizer.decode(encoded_sample)
    assert shakespeare_sample == decoded_sample, "Encoding/decoding mismatch!"
    print("Shakespeare encoding test passed!")

    # Show statistics
    tokenizer.get_stats()

    # Save tokenizer (default path: models/tokenizer.json)
    tokenizer.save()

    print("\n" + "="*80)
    print("TOKENIZER BUILD COMPLETE")
    print("="*80)
    print(f"\nTokenizer ready for model training!")
    print(f"Vocabulary size: {tokenizer.vocab_size}")
    print(f"Saved to: models/tokenizer.json")
    print(f"\nNext step: Build the model architecture")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()
models/s2_model.py ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Llama-Style Transformer Model
3
+ =============================
4
+ Modern transformer architecture with all Tier 1 and Tier 2 optimizations:
5
+
6
+ Architecture (Tier 1):
7
+ - RMSNorm (faster than LayerNorm, no mean calculation)
8
+ - RoPE (Rotary Position Embedding, better length generalization)
9
+ - SwiGLU activation (gated FFN, consistently outperforms GELU)
10
+ - Pre-norm (apply norm before attention/FFN, more stable training)
11
+
12
+ Optimizations (Tier 2):
13
+ - GQA (Grouped Query Attention, fewer KV heads = faster + less memory)
14
+ - Weight tying (share embedding and output projection)
15
+ - Flash Attention via F.scaled_dot_product_attention
16
+ - Gradient checkpointing support (trade compute for memory)
17
+
18
+ Compatible with:
19
+ - liger-kernel (fused RMSNorm, SwiGLU, RoPE, cross-entropy)
20
+ - bf16/fp16 mixed precision training
21
+ - torch.compile for additional speedups
22
+
23
+ Model Sizes:
24
+ - tiny: ~15M params (for testing)
25
+ - small: ~125M params
26
+ - medium: ~350M params
27
+ - large: ~760M params
28
+ - 1B: ~1.1B params (Llama 3.2 1B style)
29
+ """
30
+
31
+ import math
32
+ from dataclasses import dataclass
33
+ from typing import Optional, Tuple
34
+
35
+ import torch
36
+ import torch.nn as nn
37
+ import torch.nn.functional as F
38
+
39
+
40
+ # ============================================================================
41
+ # Model Configuration
42
+ # ============================================================================
43
+
44
@dataclass
class ModelConfig:
    """Configuration for Llama-style transformer model.

    Derived fields set by __post_init__: d_ff (when left as None),
    n_kv_groups, and head_dim — do not set those two directly.
    """

    # Model architecture
    vocab_size: int = 32000
    d_model: int = 2048          # Hidden dimension
    n_layers: int = 16           # Number of transformer blocks
    n_heads: int = 32            # Number of attention heads
    n_kv_heads: int = 8          # Number of KV heads (for GQA)
    d_ff: Optional[int] = None   # FFN intermediate dim (default: 8/3 * d_model)

    # Sequence
    max_seq_len: int = 2048      # Maximum sequence length

    # RoPE
    rope_theta: float = 500000.0  # RoPE base frequency

    # Regularization
    dropout: float = 0.0          # Dropout (0 for pretraining)

    # Options
    tie_weights: bool = True      # Tie embedding and output weights
    use_flash_attn: bool = True   # Use Flash Attention (SDPA)

    def __post_init__(self):
        # SwiGLU uses 8/3 * d_model for FFN, rounded UP to a multiple of 256
        # (multiples of 256 keep matmul shapes hardware-friendly)
        if self.d_ff is None:
            self.d_ff = int(8 / 3 * self.d_model)
            self.d_ff = ((self.d_ff + 255) // 256) * 256

        # Validate GQA configuration: each KV head serves an equal group of
        # query heads, so the head counts must divide evenly
        assert self.n_heads % self.n_kv_heads == 0, \
            f"n_heads ({self.n_heads}) must be divisible by n_kv_heads ({self.n_kv_heads})"

        # Derived values: query heads per KV head, and per-head width
        self.n_kv_groups = self.n_heads // self.n_kv_heads
        self.head_dim = self.d_model // self.n_heads
81
+
82
+
83
+ # Predefined model configurations
84
+ MODEL_CONFIGS = {
85
+ "tiny": ModelConfig(
86
+ d_model=256,
87
+ n_layers=6,
88
+ n_heads=8,
89
+ n_kv_heads=4,
90
+ max_seq_len=1024,
91
+ ),
92
+ "small": ModelConfig(
93
+ d_model=768,
94
+ n_layers=12,
95
+ n_heads=12,
96
+ n_kv_heads=4,
97
+ max_seq_len=2048,
98
+ ),
99
+ "medium": ModelConfig(
100
+ d_model=1024,
101
+ n_layers=16,
102
+ n_heads=16,
103
+ n_kv_heads=4,
104
+ max_seq_len=2048,
105
+ ),
106
+ "large": ModelConfig(
107
+ d_model=1536,
108
+ n_layers=20,
109
+ n_heads=24,
110
+ n_kv_heads=8,
111
+ max_seq_len=2048,
112
+ ),
113
+ "1B": ModelConfig(
114
+ d_model=2048,
115
+ n_layers=16,
116
+ n_heads=32,
117
+ n_kv_heads=8,
118
+ d_ff=8192, # Llama 3.2 1B uses 4x hidden, not 8/3x
119
+ max_seq_len=2048,
120
+ ),
121
+ }
122
+
123
+
124
def get_model_config(size: str, **overrides) -> ModelConfig:
    """Get a predefined model configuration with optional overrides.

    Args:
        size: Preset name — one of the keys of MODEL_CONFIGS.
        **overrides: ModelConfig attribute values to override.

    Returns:
        A fresh ModelConfig instance with overrides applied and derived
        fields (d_ff, n_kv_groups, head_dim) recomputed.

    Raises:
        ValueError: If size is unknown or an override names a
            non-existent config attribute.
    """
    import copy

    if size not in MODEL_CONFIGS:
        raise ValueError(f"Unknown model size: {size}. Choose from: {list(MODEL_CONFIGS.keys())}")

    # BUGFIX: copy the preset instead of mutating it. The original code did
    # setattr() on the shared MODEL_CONFIGS entry, so overrides from one call
    # permanently corrupted the preset for every later caller. A shallow copy
    # is sufficient because all fields are immutable scalars.
    config = copy.copy(MODEL_CONFIGS[size])

    # Apply overrides, rejecting unknown attribute names
    for key, value in overrides.items():
        if hasattr(config, key):
            setattr(config, key, value)
        else:
            raise ValueError(f"Unknown config parameter: {key}")

    # Recompute derived values (n_kv_groups, head_dim; d_ff only if None)
    config.__post_init__()
    return config
141
+
142
+
143
+ # ============================================================================
144
+ # RMSNorm (Tier 1)
145
+ # ============================================================================
146
+
147
class RMSNorm(nn.Module):
    """Root Mean Square layer normalization (Llama / Mistral style).

    Scales the input by the reciprocal root-mean-square of its last
    dimension — no mean subtraction, unlike LayerNorm — then applies a
    learned per-channel gain. The normalization is computed in float32
    for stability and cast back to the input dtype.

    Can be swapped for liger_kernel.transformers.LigerRMSNorm to get a
    fused-kernel speedup with identical semantics.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        # eps keeps rsqrt finite for all-zero rows
        self.eps = eps
        # Learned gain, initialized to identity
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        # x / sqrt(mean(x^2) + eps), via rsqrt for a single fused op
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in fp32, then return to the caller's dtype before scaling
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight
169
+
170
+
171
+ # ============================================================================
172
+ # Rotary Position Embedding (RoPE) (Tier 1)
173
+ # ============================================================================
174
+
175
def precompute_rope_freqs(
    dim: int,
    max_seq_len: int,
    theta: float = 10000.0,
    device: torch.device = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Precompute the cos and sin tables for rotary position embedding.

    Each even channel index 2i gets angular frequency theta^(-2i/dim);
    position p then rotates by angle p * freq. Values are duplicated
    across each (2i, 2i+1) channel pair so the tables broadcast directly
    against the interleaved rotation in apply_rotary_emb.

    Args:
        dim: Head dimension (d_model // n_heads)
        max_seq_len: Maximum sequence length
        theta: Base frequency (Llama 3 uses 500000)
        device: Target device

    Returns:
        cos, sin tensors of shape (max_seq_len, dim)
    """
    # Per-pair inverse frequencies: theta^(-2i/dim) for i = 0 .. dim/2 - 1
    exponents = torch.arange(0, dim, 2, device=device).float() / dim
    inv_freq = theta ** -exponents

    # Angle table: position p, pair i -> p * inv_freq[i], shape (seq_len, dim/2)
    positions = torch.arange(max_seq_len, device=device)
    angles = torch.outer(positions, inv_freq)

    # Duplicate each pair's angle across both of its channels -> (seq_len, dim)
    cos_table = angles.cos().repeat_interleave(2, dim=-1)
    sin_table = angles.sin().repeat_interleave(2, dim=-1)

    return cos_table, sin_table
207
+
208
+
209
def apply_rotary_emb(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    """
    Apply rotary position embedding to input tensor.

    Treats each adjacent channel pair (x0, x1) as a 2-D vector and rotates
    it by the position-dependent angle encoded in cos/sin:
    out = x * cos + rotate90(x) * sin, where rotate90 maps
    [x0, x1, x2, x3, ...] -> [-x1, x0, -x3, x2, ...].

    Args:
        x: Input tensor of shape (batch, n_heads, seq_len, head_dim)
        cos: Cosine frequencies of shape (seq_len, head_dim)
        sin: Sine frequencies of shape (seq_len, head_dim)

    Returns:
        Tensor with rotary embedding applied (same shape as x)
    """
    # Trim tables to the actual sequence length, then add leading
    # singleton dims so they broadcast over (batch, heads)
    seq_len = x.shape[2]
    cos_b = cos[:seq_len][None, None, :, :]
    sin_b = sin[:seq_len][None, None, :, :]

    # 90-degree rotation of each channel pair: interleave (-odd, even)
    even = x[..., 0::2]
    odd = x[..., 1::2]
    rotated = torch.stack((-odd, even), dim=-1).reshape(x.shape)

    return x * cos_b + rotated * sin_b
240
+
241
+
242
+ # ============================================================================
243
+ # Grouped Query Attention (GQA) with Flash Attention (Tier 1 + Tier 2)
244
+ # ============================================================================
245
+
246
class Attention(nn.Module):
    """
    Multi-head attention with Grouped Query Attention (GQA) and Flash Attention.

    GQA uses fewer key-value heads than query heads, reducing memory and
    compute while maintaining quality. For example, with 32 query heads and
    8 KV heads, each KV head is shared by 4 query heads.

    Flash Attention is used via PyTorch's scaled_dot_product_attention,
    which provides O(N) memory complexity instead of O(N^2).
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        # Cached config values used in forward()
        self.n_heads = config.n_heads
        self.n_kv_heads = config.n_kv_heads
        self.n_kv_groups = config.n_kv_groups
        self.head_dim = config.head_dim

        # Query projection: full heads
        self.wq = nn.Linear(config.d_model, config.n_heads * config.head_dim, bias=False)

        # Key and Value projections: fewer heads for GQA
        self.wk = nn.Linear(config.d_model, config.n_kv_heads * config.head_dim, bias=False)
        self.wv = nn.Linear(config.d_model, config.n_kv_heads * config.head_dim, bias=False)

        # Output projection
        self.wo = nn.Linear(config.n_heads * config.head_dim, config.d_model, bias=False)

        self.dropout = nn.Dropout(config.dropout)
        self.use_flash_attn = config.use_flash_attn

    def forward(
        self,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Apply GQA self-attention with RoPE.

        Args:
            x: Input of shape (batch, seq_len, d_model).
            cos, sin: Precomputed RoPE tables of shape (seq_len, head_dim).
            mask: Optional attention mask. When None, causal masking is
                applied. In the manual path the mask is ADDED to the logits
                (additive mask); in the SDPA path it is passed as attn_mask
                — callers should supply a mask compatible with both.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        batch_size, seq_len, _ = x.shape

        # Project to Q, K, V
        q = self.wq(x)  # (B, T, n_heads * head_dim)
        k = self.wk(x)  # (B, T, n_kv_heads * head_dim)
        v = self.wv(x)  # (B, T, n_kv_heads * head_dim)

        # Reshape to (B, n_heads, T, head_dim)
        q = q.view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)

        # Apply RoPE to Q and K (positions encoded by rotation, not added)
        q = apply_rotary_emb(q, cos, sin)
        k = apply_rotary_emb(k, cos, sin)

        # Expand KV heads for GQA: (B, n_kv_heads, T, head_dim) -> (B, n_heads, T, head_dim)
        # NOTE: materializes the repeated heads; SDPA's enable_gqa could avoid
        # this copy on newer PyTorch versions.
        if self.n_kv_groups > 1:
            k = k.repeat_interleave(self.n_kv_groups, dim=1)
            v = v.repeat_interleave(self.n_kv_groups, dim=1)

        # Attention
        if self.use_flash_attn:
            # Use PyTorch's optimized SDPA (Flash Attention when available)
            attn_out = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=mask,
                dropout_p=self.dropout.p if self.training else 0.0,
                is_causal=mask is None,  # Use causal mask if no explicit mask
            )
        else:
            # Manual attention (for debugging or when SDPA unavailable)
            scale = 1.0 / math.sqrt(self.head_dim)
            attn_weights = torch.matmul(q, k.transpose(-2, -1)) * scale

            if mask is not None:
                attn_weights = attn_weights + mask
            else:
                # Causal mask: -inf above the diagonal blocks future positions
                causal_mask = torch.triu(
                    torch.full((seq_len, seq_len), float('-inf'), device=x.device),
                    diagonal=1
                )
                attn_weights = attn_weights + causal_mask

            attn_weights = F.softmax(attn_weights, dim=-1)
            attn_weights = self.dropout(attn_weights)
            attn_out = torch.matmul(attn_weights, v)

        # Reshape back: (B, n_heads, T, head_dim) -> (B, T, d_model)
        attn_out = attn_out.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)

        return self.wo(attn_out)
340
+
341
+
342
+ # ============================================================================
343
+ # SwiGLU Feed-Forward Network (Tier 1)
344
+ # ============================================================================
345
+
346
class FeedForward(nn.Module):
    """SwiGLU feed-forward network.

    Gated MLP used in Llama-family models: the hidden activation is the
    elementwise product of a SiLU-gated projection and a linear "up"
    projection, followed by a "down" projection back to d_model:

        out = W_down( SiLU(W_gate(x)) * W_up(x) )

    Three bias-free weight matrices replace the usual two of a GELU FFN.
    liger_kernel.transformers.LigerSwiGLUMLP offers a fused drop-in.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()

        inner = config.d_ff

        # Gate and up projections (candidates for kernel fusion)
        self.w_gate = nn.Linear(config.d_model, inner, bias=False)
        self.w_up = nn.Linear(config.d_model, inner, bias=False)

        # Down projection back to the model width
        self.w_down = nn.Linear(inner, config.d_model, bias=False)

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = F.silu(self.w_gate(x)) * self.w_up(x)
        return self.dropout(self.w_down(gated))
376
+
377
+
378
+ # ============================================================================
379
+ # Transformer Block (Pre-norm)
380
+ # ============================================================================
381
+
382
class TransformerBlock(nn.Module):
    """
    Single pre-norm transformer block.

    Normalization is applied BEFORE each sub-layer (attention, FFN)
    rather than after, which keeps gradients well-behaved at depth:

        x = x + Attention(RMSNorm(x))
        x = x + FFN(RMSNorm(x))
    """

    def __init__(self, config: ModelConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx

        # Pre-norm layers (attribute order preserved for state_dict layout)
        self.attn_norm = RMSNorm(config.d_model)
        self.ffn_norm = RMSNorm(config.d_model)

        # Sub-layers
        self.attn = Attention(config)
        self.ffn = FeedForward(config)

    def forward(
        self,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Attention sub-layer: normalize, attend, add residual
        attn_out = self.attn(self.attn_norm(x), cos, sin, mask)
        x = x + attn_out

        # FFN sub-layer: normalize, transform, add residual
        ffn_out = self.ffn(self.ffn_norm(x))
        return x + ffn_out
420
+
421
+
422
+ # ============================================================================
423
+ # Complete Llama Model
424
+ # ============================================================================
425
+
426
+ class LlamaModel(nn.Module):
427
+ """
428
+ Complete Llama-style transformer model for language modeling.
429
+
430
+ Features:
431
+ - RMSNorm, RoPE, SwiGLU, GQA (Tier 1)
432
+ - Weight tying, Flash Attention (Tier 2)
433
+ - Gradient checkpointing support
434
+ - Compatible with liger-kernel fused ops
435
+
436
+ Usage:
437
+ config = get_model_config("1B", vocab_size=32000)
438
+ model = LlamaModel(config)
439
+
440
+ # Enable gradient checkpointing for memory savings
441
+ model.gradient_checkpointing_enable()
442
+
443
+ # Forward pass
444
+ logits = model(input_ids)
445
+ loss = model(input_ids, targets=targets)
446
+ """
447
+
448
+ def __init__(self, config: ModelConfig):
449
+ super().__init__()
450
+ self.config = config
451
+
452
+ # Token embedding
453
+ self.tok_emb = nn.Embedding(config.vocab_size, config.d_model)
454
+
455
+ # Transformer blocks
456
+ self.layers = nn.ModuleList([
457
+ TransformerBlock(config, layer_idx=i)
458
+ for i in range(config.n_layers)
459
+ ])
460
+
461
+ # Final normalization
462
+ self.norm = RMSNorm(config.d_model)
463
+
464
+ # Output projection (language model head)
465
+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
466
+
467
+ # Weight tying: share embedding and output weights
468
+ if config.tie_weights:
469
+ self.lm_head.weight = self.tok_emb.weight
470
+
471
+ # Precompute RoPE frequencies
472
+ self.register_buffer(
473
+ "rope_cos",
474
+ torch.zeros(config.max_seq_len, config.head_dim),
475
+ persistent=False
476
+ )
477
+ self.register_buffer(
478
+ "rope_sin",
479
+ torch.zeros(config.max_seq_len, config.head_dim),
480
+ persistent=False
481
+ )
482
+
483
+ # Gradient checkpointing flag
484
+ self._gradient_checkpointing = False
485
+
486
+ # Initialize weights
487
+ self.apply(self._init_weights)
488
+
489
+ # Apply special initialization for output projection
490
+ self._init_output_weights()
491
+
492
+ def _init_weights(self, module: nn.Module):
493
+ """Initialize weights using Llama-style initialization."""
494
+ if isinstance(module, nn.Linear):
495
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
496
+ if module.bias is not None:
497
+ torch.nn.init.zeros_(module.bias)
498
+ elif isinstance(module, nn.Embedding):
499
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
500
+
501
+ def _init_output_weights(self):
502
+ """Apply scaled initialization to output projections for stability."""
503
+ # Scale down residual projections by 1/sqrt(2*n_layers)
504
+ scale = (2 * self.config.n_layers) ** -0.5
505
+ for layer in self.layers:
506
+ torch.nn.init.normal_(layer.attn.wo.weight, mean=0.0, std=0.02 * scale)
507
+ torch.nn.init.normal_(layer.ffn.w_down.weight, mean=0.0, std=0.02 * scale)
508
+
509
+ def _init_rope(self, device: torch.device):
510
+ """Initialize RoPE frequencies on the correct device."""
511
+ cos, sin = precompute_rope_freqs(
512
+ dim=self.config.head_dim,
513
+ max_seq_len=self.config.max_seq_len,
514
+ theta=self.config.rope_theta,
515
+ device=device,
516
+ )
517
+ self.rope_cos = cos
518
+ self.rope_sin = sin
519
+
520
+ def gradient_checkpointing_enable(self):
521
+ """Enable gradient checkpointing for memory-efficient training."""
522
+ self._gradient_checkpointing = True
523
+
524
+ def gradient_checkpointing_disable(self):
525
+ """Disable gradient checkpointing."""
526
+ self._gradient_checkpointing = False
527
+
528
def forward(
    self,
    input_ids: torch.Tensor,
    targets: Optional[torch.Tensor] = None,
    mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Forward pass.

    Args:
        input_ids: Token IDs of shape (batch_size, seq_len)
        targets: Optional target IDs for loss computation; positions
            holding -100 are excluded from the loss (padding).
        mask: Optional attention mask, passed unchanged to every layer.

    Returns:
        If targets provided: scalar loss
        Otherwise: logits of shape (batch_size, seq_len, vocab_size)
    """
    batch_size, seq_len = input_ids.shape
    device = input_ids.device

    # Initialize RoPE on first forward pass (ensures correct device).
    # NOTE(review): the `.sum() == 0` probe re-builds the tables whenever the
    # buffer is still all-zero, and forces a host-device sync on every step
    # when running on GPU — presumably cheap relative to the model, but verify.
    if self.rope_cos.device != device or self.rope_cos.sum() == 0:
        self._init_rope(device)

    # Token embeddings: (batch, seq) -> (batch, seq, d_model)
    x = self.tok_emb(input_ids)

    # Slice the precomputed RoPE tables to this sequence length.
    cos = self.rope_cos[:seq_len]
    sin = self.rope_sin[:seq_len]

    # Transformer blocks. When checkpointing is enabled (and only in
    # training mode), activations are recomputed on backward to save memory.
    for layer in self.layers:
        if self._gradient_checkpointing and self.training:
            x = torch.utils.checkpoint.checkpoint(
                layer, x, cos, sin, mask,
                use_reentrant=False
            )
        else:
            x = layer(x, cos, sin, mask)

    # Final norm
    x = self.norm(x)

    # Compute logits
    logits = self.lm_head(x)

    # Compute loss if targets provided
    if targets is not None:
        # NOTE: No shift here — the DataLoader already provides
        # pre-shifted targets (x = tokens[:-1], y = tokens[1:]),
        # so logits[k] should predict targets[k] directly.
        loss = F.cross_entropy(
            logits.view(-1, self.config.vocab_size),
            targets.view(-1),
            ignore_index=-100,  # Ignore padding
        )
        return loss

    return logits
590
@torch.no_grad()
def generate(
    self,
    input_ids: torch.Tensor,
    max_new_tokens: int = 100,
    temperature: float = 1.0,
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
) -> torch.Tensor:
    """
    Generate tokens autoregressively.

    Args:
        input_ids: Starting token IDs (batch_size, seq_len)
        max_new_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature (1.0 = neutral). Logits are
            divided by this value, so it must be > 0.
        top_k: If set, only sample from top k tokens
        top_p: If set, use nucleus sampling with this probability mass

    Returns:
        Generated token IDs (batch_size, seq_len + max_new_tokens)

    NOTE(review): this switches the model into eval mode as a side effect
    and does not restore training mode — callers that resume training must
    call ``model.train()`` themselves.
    """
    self.eval()

    for _ in range(max_new_tokens):
        # Keep only the last max_seq_len tokens as context (no KV cache,
        # so the full window is re-run each step).
        idx_cond = input_ids if input_ids.size(1) <= self.config.max_seq_len else \
                   input_ids[:, -self.config.max_seq_len:]

        # Forward pass
        logits = self(idx_cond)

        # Get logits for last position, scaled by temperature
        logits = logits[:, -1, :] / temperature

        # Apply top-k filtering: mask everything below the k-th logit.
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = float('-inf')

        # Apply top-p (nucleus) filtering
        if top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above threshold.
            # The right-shift below guarantees the first token that crosses
            # top_p is kept, so at least one token always survives.
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            # Scatter the mask back to vocabulary order before applying it.
            indices_to_remove = sorted_indices_to_remove.scatter(
                1, sorted_indices, sorted_indices_to_remove
            )
            logits[indices_to_remove] = float('-inf')

        # Sample one token per batch row from the filtered distribution.
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        # Append
        input_ids = torch.cat([input_ids, next_token], dim=1)

    return input_ids
653
+
654
def count_parameters(self, trainable_only: bool = True) -> int:
    """Return the number of model parameters.

    Args:
        trainable_only: When True (default), count only parameters with
            ``requires_grad`` set; otherwise count every parameter.
    """
    if trainable_only:
        params = (p for p in self.parameters() if p.requires_grad)
    else:
        params = self.parameters()
    return sum(p.numel() for p in params)
659
+
660
def estimate_flops(self, seq_len: int, batch_size: int = 1) -> int:
    """Estimate FLOPs for a single forward pass.

    Uses the standard approximation FLOPs ≈ 2 * params * tokens, where a
    fused multiply-add counts as two operations. All parameters (trainable
    or not) are included.

    Args:
        seq_len: Sequence length of the forward pass.
        batch_size: Number of sequences in the batch.

    Returns:
        Approximate FLOP count as an integer.
    """
    total_params = self.count_parameters(trainable_only=False)
    return 2 * total_params * batch_size * seq_len
670
+
671
+
672
+ # ============================================================================
673
+ # Utility Functions
674
+ # ============================================================================
675
+
676
def create_model(
    size: str = "1B",
    vocab_size: int = 32000,
    max_seq_len: int = 2048,
    **kwargs
) -> LlamaModel:
    """Build a LlamaModel from one of the named size presets.

    Args:
        size: Preset name ("tiny", "small", "medium", "large", "1B").
        vocab_size: Tokenizer vocabulary size.
        max_seq_len: Maximum sequence length the model supports.
        **kwargs: Extra overrides forwarded verbatim to the config factory.

    Returns:
        A freshly initialized LlamaModel.
    """
    cfg = get_model_config(size, vocab_size=vocab_size, max_seq_len=max_seq_len, **kwargs)
    return LlamaModel(cfg)
701
+
702
+
703
def print_model_summary(model: LlamaModel):
    """Print a human-readable summary of the model's architecture,
    optimizations, parameter count, and estimated weight memory."""
    config = model.config
    # NOTE(review): default is trainable-only; with weight tying this usually
    # equals the total, but the memory figures below assume it does — verify.
    params = model.count_parameters()

    print("\n" + "=" * 60)
    print("LLAMA MODEL SUMMARY")
    print("=" * 60)
    print(f"\nArchitecture:")
    print(f" Hidden dim: {config.d_model}")
    print(f" Layers: {config.n_layers}")
    print(f" Attention heads: {config.n_heads}")
    print(f" KV heads (GQA): {config.n_kv_heads}")
    print(f" Head dim: {config.head_dim}")
    print(f" FFN dim: {config.d_ff}")
    print(f" Vocab size: {config.vocab_size}")
    print(f" Max seq len: {config.max_seq_len}")

    print(f"\nOptimizations:")
    print(f" RMSNorm: Yes")
    print(f" RoPE: Yes (theta={config.rope_theta})")
    print(f" SwiGLU: Yes")
    print(f" GQA: Yes ({config.n_heads}/{config.n_kv_heads} = {config.n_kv_groups}x)")
    print(f" Weight tying: {config.tie_weights}")
    print(f" Flash Attention: {config.use_flash_attn}")

    print(f"\nParameters:")
    print(f" Total: {params:,}")
    # Report in billions above 1e9 params, otherwise in millions.
    print(f" Size: ~{params / 1e9:.2f}B" if params > 1e9 else f" Size: ~{params / 1e6:.0f}M")

    # Estimate weight memory only (no activations / optimizer state).
    param_bytes = params * 4  # fp32
    print(f" FP32 memory: ~{param_bytes / 1e9:.2f} GB")
    print(f" BF16 memory: ~{param_bytes / 2 / 1e9:.2f} GB")

    print("=" * 60 + "\n")
739
+
740
+
741
+ # ============================================================================
742
+ # Main (for testing)
743
+ # ============================================================================
744
+
745
if __name__ == "__main__":
    # Smoke test: instantiate every preset and report parameter counts.
    print("Testing Llama model creation...\n")

    for size in ["tiny", "small", "medium", "large", "1B"]:
        model = create_model(size)
        params = model.count_parameters()
        print(f"{size:8s}: {params:>12,} parameters ({params/1e6:>7.1f}M)")

    print("\n" + "-" * 60)

    # Detailed summary for 1B
    model = create_model("1B")
    print_model_summary(model)

    # Forward-pass smoke test on GPU when available, else CPU.
    print("Testing forward pass...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    batch_size = 2
    seq_len = 128
    input_ids = torch.randint(0, 32000, (batch_size, seq_len), device=device)

    # Forward without targets (returns logits)
    logits = model(input_ids)
    print(f"Logits shape: {logits.shape}")

    # Forward with targets (returns loss)
    targets = torch.randint(0, 32000, (batch_size, seq_len), device=device)
    loss = model(input_ids, targets=targets)
    print(f"Loss: {loss.item():.4f}")

    # Verify a full forward + backward works with checkpointing enabled.
    print("\nTesting gradient checkpointing...")
    model.gradient_checkpointing_enable()
    loss = model(input_ids, targets=targets)
    loss.backward()
    print(f"Gradient checkpointing loss: {loss.item():.4f}")

    print("\nAll tests passed!")
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch>=2.0.0
2
+ tokenizers>=0.13.0
3
+ gradio>=4.0.0
4
+ numpy
tokenizer/bpe_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff