Nithinvalluri nishantup committed on
Commit
ffdd7af
·
0 Parent(s):

Duplicate from nishantup/nanogpt-slm-instruct

Browse files

Co-authored-by: Dr. NISHANT UPADHYAY <nishantup@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - pytorch
5
+ - nanogpt
6
+ - instruction-tuning
7
+ - sft
8
+ - slm
9
+ - from-scratch
10
+ ---
11
+
12
+ # nanoGPT SLM Instruct -- 123.849984 Million Parameters
13
+
14
+ Instruction fine-tuned Small Language Model, trained from scratch -> pretrained on 133 classic English fiction books -> SFT on Alpaca-format instructions.
15
+
16
+ ## Quick Start
17
+
18
+ ### Option 1: Run directly (downloads model + runs 5 examples)
19
+ ```bash
20
+ pip install torch tiktoken huggingface_hub
21
+ python nanogpt_slm_instruct_inference.py
22
+ ```
23
+
24
+ ### Option 2: Import and use `ask()` in your own code
25
+ ```python
26
+ # Import loads the model automatically (one-time download from HuggingFace)
27
+ from nanogpt_slm_instruct_inference import ask
28
+
29
+ # NOTE: first-time execution will output 5 predefined examples with model responses
30
+ # Simple question
31
+ print(ask("What is the capital of France?"))
32
+ print()
33
+ # With input context
34
+ print(ask(
35
+ instruction="Summarize the following text.",
36
+ input_text="Machine learning enables systems to learn from data rather than being explicitly programmed."
37
+ ))
38
+ print()
39
+ # Control generation
40
+ print(ask(
41
+ "Write a short poem about the ocean.",
42
+ temperature=1.0, # higher = more creative
43
+ top_k=100, # wider sampling pool
44
+ max_tokens=150 # longer output
45
+ ))
46
+ print()
47
+ ```
48
+
49
+ ### Option 3: Load weights manually
50
+ ```python
51
+ from huggingface_hub import hf_hub_download
52
+ import torch, tiktoken
53
+
54
+ repo_id= "nishantup/nanogpt-slm-instruct"
55
+ filename = "nanogpt_slm_instruct.pth"
56
+
57
+ model_path = hf_hub_download(repo_id=repo_id, filename=filename)
58
+
59
+ # Build model (full architecture in nanogpt_slm_instruct_inference.py)
60
+ from nanogpt_slm_instruct_inference import GPT, GPTConfig, generate, format_input
61
+
62
+ config = GPTConfig()
63
+ model = GPT(config)
64
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
65
+ model.eval()
66
+ ```
67
+
68
+ ## Model Details
69
+
70
+ | Attribute | Value |
71
+ |:---|:---|
72
+ | Parameters | 123.849984 million (~124M) |
73
+ | Architecture | nanoGPT (12 layers, 12 heads, 768 dim) |
74
+ | Context length | 256 tokens |
75
+ | Tokenizer | tiktoken GPT-2 BPE (50,257 tokens) |
76
+ | Fine-tuning | Supervised (Alpaca format) |
77
+ | Framework | PyTorch |
78
+
79
+ ## Prompt Format
80
+
81
+ ```
82
+ Below is an instruction that describes a task.
83
+
84
+ ### Instruction:
85
+ {instruction}
86
+
87
+ ### Response:
88
+ ```
89
+
90
+ With optional input:
91
+ ```
92
+ Below is an instruction that describes a task, paired with further context.
93
+
94
+ ### Instruction:
95
+ {instruction}
96
+
97
+ ### Input:
98
+ {input}
99
+
100
+ ### Response:
101
+ ```
102
+
103
+ ## Files
104
+
105
+ | File | Description |
106
+ |:---|:---|
107
+ | `nanogpt_slm_instruct.pth` | SFT fine-tuned weights |
108
+ | `nanogpt_slm_instruct_inference.py` | Standalone inference script -- import and call `ask()` |
109
+ | `config.json` | Model configuration |
110
+
111
+ ## `ask()` API Reference
112
+
113
+ ```python
114
+ ask(instruction, input_text="", max_tokens=256, temperature=0.7, top_k=40)
115
+ ```
116
+
117
+ | Parameter | Default | Description |
118
+ |:---|:---|:---|
119
+ | `instruction` | (required) | The task instruction |
120
+ | `input_text` | `""` | Optional additional context |
121
+ | `max_tokens` | `256` | Maximum tokens to generate |
122
+ | `temperature` | `0.7` | 0.0 = greedy, 0.7 = balanced, 1.5 = creative |
123
+ | `top_k` | `40` | Top-k filtering (None = no filtering) |
config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "nanoGPT (custom, trained from scratch)",
3
+ "model_type": "instruction-tuned (SFT)",
4
+ "model_config": {
5
+ "block_size": 256,
6
+ "vocab_size": 50257,
7
+ "n_layer": 12,
8
+ "n_head": 12,
9
+ "n_embd": 768,
10
+ "dropout": 0.0,
11
+ "bias": true
12
+ },
13
+ "total_parameters": 123.849984,
14
+ "tokenizer": "tiktoken gpt2 (50,257 BPE tokens)",
15
+ "framework": "PyTorch",
16
+ "prompt_format": "Alpaca (### Instruction / ### Input / ### Response)"
17
+ }
nanogpt_slm_instruct.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55dab5d2c3f943476c6b7f2d68580d8a348b48e2d41342d82711b1ebd5e822ab
3
+ size 495457705
nanogpt_slm_instruct_inference.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prepared by: Dr. Nishant Upadhyay
3
+
4
+ nanoGPT SLM Instruct -- Standalone Inference
5
+ =============================================
6
+ 124M parameter instruction-tuned Small Language Model.
7
+ Trained from scratch -> Pretrained on 133 English fiction books -> SFT on Alpaca-format instructions.
8
+
9
+ Install: pip install torch tiktoken huggingface_hub
10
+ Run: python nanogpt_slm_instruct_inference.py
11
+ """
12
+
13
+ import torch, torch.nn as nn, torch.nn.functional as F, math, tiktoken
14
+ from dataclasses import dataclass
15
+ from huggingface_hub import hf_hub_download
16
+
17
+ # ==============================================================
18
+ # ARCHITECTURE
19
+ # ==============================================================
20
+
21
class LayerNorm(nn.Module):
    """Layer normalization with an optional bias.

    PyTorch's nn.LayerNorm always allocates a bias; F.layer_norm accepts
    bias=None, which this wrapper exploits when ``bias`` is False.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, x):
        # Normalize over the last ``ndim`` features with eps = 1e-5.
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
28
+
29
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused qkv projection.

    Uses F.scaled_dot_product_attention (flash kernels) when available;
    otherwise falls back to an explicit masked softmax over a precomputed
    lower-triangular mask buffer.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # Single linear producing q, k and v stacked along the feature dim.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head, self.n_embd = config.n_head, config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            # Causal mask, shaped for broadcasting over (B, n_head, T, T).
            mask = torch.tril(torch.ones(config.block_size, config.block_size))
            self.register_buffer(
                "bias", mask.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        head_dim = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)
        if self.flash:
            drop_p = self.attn_dropout.p if self.training else 0.0
            y = F.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            scores = scores.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v
        # Re-assemble heads: (B, n_head, T, head_dim) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(y))
57
+
58
class MLP(nn.Module):
    """Position-wise feed-forward block: linear -> GELU -> linear -> dropout.

    Hidden width is the conventional 4x expansion of the embedding dim.
    """

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return self.dropout(x)
67
+
68
class Block(nn.Module):
    """One transformer block: pre-norm attention then pre-norm MLP, both residual."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embd, config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embd, config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        # Pre-norm residual connections, as in GPT-2/nanoGPT.
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
76
+
77
@dataclass
class GPTConfig:
    """Model hyperparameters; defaults match the published 124M checkpoint."""
    block_size: int = 256    # maximum context length in tokens
    vocab_size: int = 50257  # tiktoken GPT-2 BPE vocabulary size
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True
82
+
83
class GPT(nn.Module):
    """nanoGPT-style decoder-only transformer with tied input/output embeddings."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: token embedding shares its matrix with the output head.
        self.transformer.wte.weight = self.lm_head.weight

    def forward(self, idx, targets=None):
        """Return (logits, loss).

        With ``targets`` given, logits cover every position and loss is the
        cross-entropy over all of them (label -1 is ignored). Without targets,
        only the last position is projected (inference shortcut) and loss is None.
        """
        _, t = idx.size()
        pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        if targets is None:
            return self.lm_head(x[:, [-1], :]), None
        logits = self.lm_head(x)
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss
109
+
110
+ # ==============================================================
111
+ # GENERATION + PROMPT FORMATTING
112
+ # ==============================================================
113
+
114
def generate(model, idx, max_new_tokens, context_size, temperature=0.7, top_k=40, eos_id=None):
    """Autoregressively sample up to ``max_new_tokens`` tokens from ``model``.

    Args:
        model: callable returning ``(logits, loss)``; only the last position's
            logits are consumed each step.
        idx: (B, T) LongTensor of prompt token ids.
        context_size: number of trailing tokens fed to the model each step.
        temperature: 0.0 = greedy argmax; > 0.0 scales logits before sampling.
        top_k: keep only the k most likely tokens (None disables filtering).
        eos_id: stop early once every sequence in the batch samples this id;
            the eos token itself is not appended (matches prior behavior).

    Returns:
        (B, T + n_generated) tensor of token ids.
    """
    with torch.no_grad():  # hoisted out of the loop: no step needs gradients
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]
            logits, _ = model(idx_cond)
            logits = logits[:, -1, :]
            if top_k is not None:
                # Clamp so a top_k larger than the vocab cannot raise in topk().
                k = min(top_k, logits.size(-1))
                # Fix: keep the threshold shaped (B, 1); the original (B,) form
                # only broadcast correctly for batch size 1.
                kth = torch.topk(logits, k).values[:, [-1]]
                # masked_fill avoids allocating a fresh -inf tensor every step.
                logits = logits.masked_fill(logits < kth, float("-inf"))
            if temperature > 0.0:
                probs = torch.softmax(logits / temperature, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            # Fix: original `idx_next == eos_id` crashed on bool() for batch > 1.
            if eos_id is not None and bool((idx_next == eos_id).all()):
                break
            idx = torch.cat((idx, idx_next), dim=1)
    return idx
132
+
133
def format_input(entry):
    """Build the Alpaca-style prompt for an ``{"instruction": ..., "input": ...}`` dict.

    The "### Input:" section is appended only when entry["input"] is present
    and non-empty.
    """
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    extra = entry.get("input")
    if extra:
        prompt += f"\n\n### Input:\n{extra}"
    return prompt
140
+
141
def ask(instruction, input_text="", max_tokens=256, temperature=0.7, top_k=40):
    """Ask the instruction-tuned model and get a response.

    Args:
        instruction: the task instruction.
        input_text: optional extra context (adds an "### Input:" section).
        max_tokens: maximum number of tokens to generate.
        temperature: 0.0 = greedy, higher = more random sampling.
        top_k: top-k logit filtering (None disables it).

    Returns:
        The decoded model output with the prompt and the "### Response:"
        marker stripped.
    """
    prompt = format_input({"instruction": instruction, "input": input_text})
    token_ids = tokenizer.encode(prompt, allowed_special={'<|endoftext|>'})
    idx = torch.tensor(token_ids).unsqueeze(0).to(device)
    out = generate(model, idx, max_tokens, config.block_size, temperature, top_k, eos_id=50256)
    decoded = tokenizer.decode(out.squeeze(0).tolist())
    # tiktoken round-trips the prompt text exactly, so slicing by len(prompt)
    # removes it from the decoded output.
    return decoded[len(prompt):].replace("### Response:", "").strip()
148
+
149
+ # ==============================================================
150
+ # LOAD MODEL (auto-downloads from HuggingFace Hub)
151
+ # ==============================================================
152
+
153
# Prefer GPU when available; everything below (and ask()) uses this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = tiktoken.get_encoding("gpt2")
config = GPTConfig()

# One-time checkpoint download from the HuggingFace Hub (cached afterwards).
weights_path = hf_hub_download(repo_id="nishantup/nanogpt-slm-instruct",
                               filename="nanogpt_slm_instruct.pth")
model = GPT(config)
model.load_state_dict(torch.load(weights_path, map_location=device))
model.to(device).eval()

print(f"nanoGPT SLM Instruct loaded: {sum(p.numel() for p in model.parameters()):,} params on {device}")
print(f"Config: {config.n_layer}L / {config.n_head}H / {config.n_embd}D / ctx={config.block_size}\n")
166
+
167
+ # ==============================================================
168
+ # EXAMPLES
169
+ # ==============================================================
170
+
171
# Demo: run a handful of canned (instruction, optional input) pairs.
examples = [
    ("What is the capital of France?", ""),
    ("Explain gravity in simple terms.", ""),
    ("Summarize the following text.",
     "Machine learning enables systems to learn from data rather than being explicitly programmed."),
    ("List three benefits of reading books.", ""),
    ("Write a short poem about the stars.", ""),
]

for instruction, inp in examples:
    answer = ask(instruction, inp)
    print(f"Instruction: {instruction}")
    if inp:
        # Truncate long context for display only.
        print(f"Input: {inp[:80]}...")
    print(f"Response: {answer}")
    print(f"{'-' * 60}\n")