Nithinvalluri nishantup committed on
Commit
ffdd7af
·
0 Parent(s):

Duplicate from nishantup/nanogpt-slm-instruct

Browse files

Co-authored-by: Dr. NISHANT UPADHYAY <nishantup@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - pytorch
5
+ - nanogpt
6
+ - instruction-tuning
7
+ - sft
8
+ - slm
9
+ - from-scratch
10
+ ---
11
+
12
+ # nanoGPT SLM Instruct -- 123.849984 Million Parameters
13
+
14
+ Instruction fine-tuned Small Language Model, trained from scratch -> pretrained on 133 classic English fiction books -> SFT on Alpaca-format instructions.
15
+
16
+ ## Quick Start
17
+
18
+ ### Option 1: Run directly (downloads model + runs 5 examples)
19
+ ```bash
20
+ pip install torch tiktoken huggingface_hub
21
+ python nanogpt_slm_instruct_inference.py
22
+ ```
23
+
24
+ ### Option 2: Import and use `ask()` in your own code
25
+ ```python
26
+ # Import loads the model automatically (one-time download from HuggingFace)
27
+ from nanogpt_slm_instruct_inference import ask
28
+
29
+ # NOTE: first-time execution will output 5 predefined examples with model responses
30
+ # Simple question
31
+ print(ask("What is the capital of France?"))
32
+ print()
33
+ # With input context
34
+ print(ask(
35
+ instruction="Summarize the following text.",
36
+ input_text="Machine learning enables systems to learn from data rather than being explicitly programmed."
37
+ ))
38
+ print()
39
+ # Control generation
40
+ print(ask(
41
+ "Write a short poem about the ocean.",
42
+ temperature=1.0, # higher = more creative
43
+ top_k=100, # wider sampling pool
44
+ max_tokens=150 # longer output
45
+ ))
46
+ print()
47
+ ```
48
+
49
+ ### Option 3: Load weights manually
50
+ ```python
51
+ from huggingface_hub import hf_hub_download
52
+ import torch, tiktoken
53
+
54
+ repo_id= "nishantup/nanogpt-slm-instruct"
55
+ filename = "nanogpt_slm_instruct.pth"
56
+
57
+ model_path = hf_hub_download(repo_id=repo_id, filename=filename)
58
+
59
+ # Build model (full architecture in nanogpt_slm_instruct_inference.py)
60
+ from nanogpt_slm_instruct_inference import GPT, GPTConfig, generate, format_input
61
+
62
+ config = GPTConfig()
63
+ model = GPT(config)
64
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
65
+ model.eval()
66
+ ```
67
+
68
+ ## Model Details
69
+
70
+ | Attribute | Value |
71
+ |:---|:---|
72
+ | Parameters | 123.849984 million (~124M) |
73
+ | Architecture | nanoGPT (12 layers, 12 heads, 768 dim) |
74
+ | Context length | 256 tokens |
75
+ | Tokenizer | tiktoken GPT-2 BPE (50,257 tokens) |
76
+ | Fine-tuning | Supervised (Alpaca format) |
77
+ | Framework | PyTorch |
78
+
79
+ ## Prompt Format
80
+
81
+ ```
82
+ Below is an instruction that describes a task.
83
+
84
+ ### Instruction:
85
+ {instruction}
86
+
87
+ ### Response:
88
+ ```
89
+
90
+ With optional input:
91
+ ```
92
+ Below is an instruction that describes a task, paired with further context.
93
+
94
+ ### Instruction:
95
+ {instruction}
96
+
97
+ ### Input:
98
+ {input}
99
+
100
+ ### Response:
101
+ ```
102
+
103
+ ## Files
104
+
105
+ | File | Description |
106
+ |:---|:---|
107
+ | `nanogpt_slm_instruct.pth` | SFT fine-tuned weights |
108
+ | `nanogpt_slm_instruct_inference.py` | Standalone inference script -- import and call `ask()` |
109
+ | `config.json` | Model configuration |
110
+
111
+ ## `ask()` API Reference
112
+
113
+ ```python
114
+ ask(instruction, input_text="", max_tokens=256, temperature=0.7, top_k=40)
115
+ ```
116
+
117
+ | Parameter | Default | Description |
118
+ |:---|:---|:---|
119
+ | `instruction` | (required) | The task instruction |
120
+ | `input_text` | `""` | Optional additional context |
121
+ | `max_tokens` | `256` | Maximum tokens to generate |
122
+ | `temperature` | `0.7` | 0.0 = greedy, 0.7 = balanced, 1.5 = creative |
123
+ | `top_k` | `40` | Top-k filtering (None = no filtering) |
config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "nanoGPT (custom, trained from scratch)",
3
+ "model_type": "instruction-tuned (SFT)",
4
+ "model_config": {
5
+ "block_size": 256,
6
+ "vocab_size": 50257,
7
+ "n_layer": 12,
8
+ "n_head": 12,
9
+ "n_embd": 768,
10
+ "dropout": 0.0,
11
+ "bias": true
12
+ },
13
+ "total_parameters": 123.849984,
14
+ "tokenizer": "tiktoken gpt2 (50,257 BPE tokens)",
15
+ "framework": "PyTorch",
16
+ "prompt_format": "Alpaca (### Instruction / ### Input / ### Response)"
17
+ }
nanogpt_slm_instruct.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55dab5d2c3f943476c6b7f2d68580d8a348b48e2d41342d82711b1ebd5e822ab
3
+ size 495457705
nanogpt_slm_instruct_inference.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prepared by: Dr. Nishant Upadhyay
3
+
4
+ nanoGPT SLM Instruct -- Standalone Inference
5
+ =============================================
6
+ 124M parameter instruction-tuned Small Language Model.
7
+ Trained from scratch -> Pretrained on 133 English fiction books -> SFT on Alpaca-format instructions.
8
+
9
+ Install: pip install torch tiktoken huggingface_hub
10
+ Run: python nanogpt_slm_instruct_inference.py
11
+ """
12
+
13
+ import torch, torch.nn as nn, torch.nn.functional as F, math, tiktoken
14
+ from dataclasses import dataclass
15
+ from huggingface_hub import hf_hub_download
16
+
17
+ # ==============================================================
18
+ # ARCHITECTURE
19
+ # ==============================================================
20
+
21
class LayerNorm(nn.Module):
    """Layer normalization with an optional bias.

    PyTorch's nn.LayerNorm always allocates a bias; F.layer_norm accepts
    bias=None, which this wrapper exploits when ``bias`` is False.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, x):
        # Normalize over the last ``ndim`` features with eps = 1e-5.
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
28
+
29
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused qkv projection.

    Uses F.scaled_dot_product_attention (flash kernels) when available;
    otherwise falls back to an explicit masked softmax over a precomputed
    lower-triangular mask buffer.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # Single linear producing q, k and v stacked along the feature dim.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head, self.n_embd = config.n_head, config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            # Causal mask, shaped for broadcasting over (B, n_head, T, T).
            mask = torch.tril(torch.ones(config.block_size, config.block_size))
            self.register_buffer(
                "bias", mask.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        head_dim = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)
        if self.flash:
            drop_p = self.attn_dropout.p if self.training else 0.0
            y = F.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            scores = scores.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v
        # Re-assemble heads: (B, n_head, T, head_dim) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(y))
57
+
58
class MLP(nn.Module):
    """Position-wise feed-forward block: linear -> GELU -> linear -> dropout.

    Hidden width is the conventional 4x expansion of the embedding dim.
    """

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return self.dropout(x)
67
+
68
class Block(nn.Module):
    """One transformer block: pre-norm attention then pre-norm MLP, both residual."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embd, config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embd, config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        # Pre-norm residual connections, as in GPT-2/nanoGPT.
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
76
+
77
@dataclass
class GPTConfig:
    """Model hyperparameters; defaults match the published 124M checkpoint."""
    block_size: int = 256    # maximum context length in tokens
    vocab_size: int = 50257  # tiktoken GPT-2 BPE vocabulary size
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True
82
+
83
class GPT(nn.Module):
    """nanoGPT-style decoder-only transformer with tied input/output embeddings."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: token embedding shares its matrix with the output head.
        self.transformer.wte.weight = self.lm_head.weight

    def forward(self, idx, targets=None):
        """Return (logits, loss).

        With ``targets`` given, logits cover every position and loss is the
        cross-entropy over all of them (label -1 is ignored). Without targets,
        only the last position is projected (inference shortcut) and loss is None.
        """
        _, t = idx.size()
        pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        if targets is None:
            return self.lm_head(x[:, [-1], :]), None
        logits = self.lm_head(x)
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss
109
+
110
+ # ==============================================================
111
+ # GENERATION + PROMPT FORMATTING
112
+ # ==============================================================
113
+
114
def generate(model, idx, max_new_tokens, context_size, temperature=0.7, top_k=40, eos_id=None):
    """Autoregressively sample up to ``max_new_tokens`` tokens from ``model``.

    Args:
        model: callable returning ``(logits, loss)``; only the last position's
            logits are consumed each step.
        idx: (B, T) LongTensor of prompt token ids.
        context_size: number of trailing tokens fed to the model each step.
        temperature: 0.0 = greedy argmax; > 0.0 scales logits before sampling.
        top_k: keep only the k most likely tokens (None disables filtering).
        eos_id: stop early once every sequence in the batch samples this id;
            the eos token itself is not appended (matches prior behavior).

    Returns:
        (B, T + n_generated) tensor of token ids.
    """
    with torch.no_grad():  # hoisted out of the loop: no step needs gradients
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]
            logits, _ = model(idx_cond)
            logits = logits[:, -1, :]
            if top_k is not None:
                # Clamp so a top_k larger than the vocab cannot raise in topk().
                k = min(top_k, logits.size(-1))
                # Fix: keep the threshold shaped (B, 1); the original (B,) form
                # only broadcast correctly for batch size 1.
                kth = torch.topk(logits, k).values[:, [-1]]
                # masked_fill avoids allocating a fresh -inf tensor every step.
                logits = logits.masked_fill(logits < kth, float("-inf"))
            if temperature > 0.0:
                probs = torch.softmax(logits / temperature, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            # Fix: original `idx_next == eos_id` crashed on bool() for batch > 1.
            if eos_id is not None and bool((idx_next == eos_id).all()):
                break
            idx = torch.cat((idx, idx_next), dim=1)
    return idx
132
+
133
def format_input(entry):
    """Build the Alpaca-style prompt for an ``{"instruction": ..., "input": ...}`` dict.

    The "### Input:" section is appended only when entry["input"] is present
    and non-empty.
    """
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    extra = entry.get("input")
    if extra:
        prompt += f"\n\n### Input:\n{extra}"
    return prompt
140
+
141
def ask(instruction, input_text="", max_tokens=256, temperature=0.7, top_k=40):
    """Ask the instruction-tuned model and get a response.

    Args:
        instruction: the task instruction.
        input_text: optional extra context (adds an "### Input:" section).
        max_tokens: maximum number of tokens to generate.
        temperature: 0.0 = greedy, higher = more random sampling.
        top_k: top-k logit filtering (None disables it).

    Returns:
        The decoded model output with the prompt and the "### Response:"
        marker stripped.
    """
    prompt = format_input({"instruction": instruction, "input": input_text})
    token_ids = tokenizer.encode(prompt, allowed_special={'<|endoftext|>'})
    idx = torch.tensor(token_ids).unsqueeze(0).to(device)
    out = generate(model, idx, max_tokens, config.block_size, temperature, top_k, eos_id=50256)
    decoded = tokenizer.decode(out.squeeze(0).tolist())
    # tiktoken round-trips the prompt text exactly, so slicing by len(prompt)
    # removes it from the decoded output.
    return decoded[len(prompt):].replace("### Response:", "").strip()
148
+
149
+ # ==============================================================
150
+ # LOAD MODEL (auto-downloads from HuggingFace Hub)
151
+ # ==============================================================
152
+
153
# Prefer GPU when available; everything below (and ask()) uses this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = tiktoken.get_encoding("gpt2")
config = GPTConfig()

# One-time checkpoint download from the HuggingFace Hub (cached afterwards).
weights_path = hf_hub_download(repo_id="nishantup/nanogpt-slm-instruct",
                               filename="nanogpt_slm_instruct.pth")
model = GPT(config)
model.load_state_dict(torch.load(weights_path, map_location=device))
model.to(device).eval()

print(f"nanoGPT SLM Instruct loaded: {sum(p.numel() for p in model.parameters()):,} params on {device}")
print(f"Config: {config.n_layer}L / {config.n_head}H / {config.n_embd}D / ctx={config.block_size}\n")
166
+
167
+ # ==============================================================
168
+ # EXAMPLES
169
+ # ==============================================================
170
+
171
# Demo: run a handful of canned (instruction, optional input) pairs.
examples = [
    ("What is the capital of France?", ""),
    ("Explain gravity in simple terms.", ""),
    ("Summarize the following text.",
     "Machine learning enables systems to learn from data rather than being explicitly programmed."),
    ("List three benefits of reading books.", ""),
    ("Write a short poem about the stars.", ""),
]

for instruction, inp in examples:
    answer = ask(instruction, inp)
    print(f"Instruction: {instruction}")
    if inp:
        # Truncate long context for display only.
        print(f"Input: {inp[:80]}...")
    print(f"Response: {answer}")
    print(f"{'-' * 60}\n")