Upload nanogpt_slm_tinystories_instruct_inference.py with huggingface_hub
nanogpt_slm_tinystories_instruct_inference.py
ADDED
@@ -0,0 +1,190 @@
"""
Prepared by: Dr. Nishant Upadhyay

nanoGPT SLM TinyStories Instruct -- Standalone Inference
========================================================
A 124M-parameter instruction-tuned Small Language Model.
Pretrained on TinyStories (2.1M children's stories) -> SFT on 300K multi-source instructions.

Dataset: 300K instruction dataset (Alpaca + Dolly + UltraChat + OpenAssistant + FLAN)
Format:  unified Task / Question / Answer prompt format

Install: pip install torch tiktoken huggingface_hub
Run:     python nanogpt_slm_tinystories_instruct_inference.py
Import:  from nanogpt_slm_tinystories_instruct_inference import ask
"""

import math
from dataclasses import dataclass

import tiktoken
import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import hf_hub_download

# ==============================================================
# ARCHITECTURE
# ==============================================================

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

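# Multi-head causal self-attention. Uses the fused
# F.scaled_dot_product_attention kernel when available (PyTorch >= 2.0);
# otherwise falls back to an explicit softmax(Q K^T / sqrt(d_k)) V with a
# lower-triangular mask.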
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head, self.n_embd = config.n_head, config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                 .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
                dropout_p=self.attn_dropout.p if self.training else 0.0, is_causal=True)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(y))

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

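# Pre-norm transformer block: x + Attn(LN(x)) followed by x + MLP(LN(x)).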
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1, self.attn = LayerNorm(config.n_embd, config.bias), CausalSelfAttention(config)
        self.ln2, self.mlp = LayerNorm(config.n_embd, config.bias), MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))

@dataclass
class GPTConfig:
    block_size: int = 512
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True

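# Back-of-envelope size with these defaults and tied embeddings (see GPT
# below): wte 50257*768 ~= 38.6M, wpe 512*768 ~= 0.4M, and 12 blocks at
# ~7.1M each (attention ~2.4M + MLP ~4.7M) ~= 85M -- about 124M in total.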
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
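        # Weight tying: the token embedding and the output head share one
        # 50257 x 768 matrix, which saves ~38.6M parameters.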

    def forward(self, idx, targets=None):
        b, t = idx.size()
        pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
        x = self.transformer.drop(self.transformer.wte(idx) + self.transformer.wpe(pos))
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        if targets is not None:
            logits = self.lm_head(x)
            return logits, F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            return self.lm_head(x[:, [-1], :]), None

# ==============================================================
# GENERATION + PROMPT FORMATTING
# ==============================================================

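# Decoding loop: crop the context to block_size, keep only the top_k logits,
# then sample from the temperature-scaled softmax (greedy argmax when
# temperature == 0), stopping early at the end-of-text token.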
def generate(model, idx, max_new_tokens, context_size, temperature=0.7, top_k=40, eos_id=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits, _ = model(idx_cond)
        logits = logits[:, -1, :]
        if top_k is not None:
            v, _ = torch.topk(logits, top_k)
            logits = torch.where(logits < v[:, [-1]], torch.tensor(float("-inf")).to(logits.device), logits)
        if temperature > 0.0:
            probs = torch.softmax(logits / temperature, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
        if eos_id is not None and idx_next.item() == eos_id:
            break
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

def format_input(entry):
    parts = [f"Task: {entry['instruction']}"]
    if entry.get('input', '').strip():
        parts.append(f"Question:\n{entry['input']}")
    return '\n\n'.join(parts)
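
# For example, format_input({"instruction": "Summarize the following text.",
#                            "input": "Machine learning enables ..."}) yields:
#
#   Task: Summarize the following text.
#
#   Question:
#   Machine learning enables ...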

def ask(instruction, input_text="", max_tokens=256, temperature=0.7, top_k=40):
    """Ask the instruction-tuned model and get a response."""
    prompt = format_input({"instruction": instruction, "input": input_text})
    idx = torch.tensor(tokenizer.encode(prompt, allowed_special={'<|endoftext|>'})
                       ).unsqueeze(0).to(device)
    out = generate(model, idx, max_tokens, config.block_size, temperature, top_k, eos_id=50256)
    return tokenizer.decode(out.squeeze(0).tolist())[len(prompt):].replace("Answer:", "").strip()
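
# Typical use once the model is loaded below (see __main__ for more):
#   ask("What is the capital of France?")
#   ask("Summarize the following text.", input_text="Machine learning ...")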

# ==============================================================
# LOAD MODEL
# ==============================================================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = GPTConfig()
tokenizer = tiktoken.get_encoding("gpt2")

weights_path = hf_hub_download(repo_id="nishantup/nanogpt-slm-tinystories-instruct",
                               filename="nanogpt_slm_tinystories_instruct.pth")
model = GPT(config)
model.load_state_dict(torch.load(weights_path, map_location=device))
model.to(device)
model.eval()
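
# Note: on PyTorch >= 1.13, passing weights_only=True to torch.load is a
# safer way to load a plain state dict like this one.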

print(f"nanoGPT SLM TinyStories Instruct loaded: {sum(p.numel() for p in model.parameters()):,} params on {device}")
print(f"Config: {config.n_layer}L / {config.n_head}H / {config.n_embd}D / ctx={config.block_size}")
print("Format: Task / Question / Answer\n")

# ==============================================================
# EXAMPLES
# ==============================================================

if __name__ == "__main__":
    examples = [
        ("What is the capital of France?", ""),
        ("Explain gravity in simple terms.", ""),
        ("Summarize the following text.",
         "Machine learning enables systems to learn from data rather than being explicitly programmed."),
        ("List three benefits of reading books.", ""),
        ("Write a short poem about the stars.", ""),
    ]

    for instruction, inp in examples:
        response = ask(instruction, inp)
        print(f"Instruction: {instruction}")
        if inp:
            print(f"Input: {inp[:80]}{'...' if len(inp) > 80 else ''}")
        print(f"Response: {response}")
        print(f"{'-' * 60}\n")