Arko007 commited on
Commit
9bb1019
·
verified ·
1 Parent(s): b902ad5

Upload Nano-GPT 42M (99,000 iterations, val_loss=3.0801)

Browse files
Files changed (7) hide show
  1. LICENSE +1 -0
  2. README.md +3 -0
  3. best_model.pt +3 -0
  4. config.json +9 -0
  5. config.py +73 -0
  6. model.py +171 -0
  7. tokenizer.json +0 -0
LICENSE ADDED
@@ -0,0 +1 @@
 
 
1
+ LICENSE placeholder.
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Model Files Only
2
+
3
+ README content will be added manually.
best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e8051f16d9e53421d857f5d8939eec72f9d014f21f8ecb6cbdbdfc07f43265
3
+ size 510910761
config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 32000,
3
+ "n_layers": 8,
4
+ "n_heads": 8,
5
+ "n_embd": 512,
6
+ "block_size": 512,
7
+ "dropout": 0.1,
8
+ "bias": true
9
+ }
config.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Nano-GPT Configuration
3
+ L4-SAFE: Reduced memory usage
4
+ """
5
+
6
+ import torch
7
+ from dataclasses import dataclass
8
+
9
@dataclass
class NanoGPTConfig:
    """Hyperparameters for the Nano-GPT model and its training loop.

    Defaults are tuned to fit a small GPU ("L4-SAFE"): reduced layer count,
    embedding width, context length, and batch size relative to GPT-2 small.
    """

    # Model Architecture
    vocab_size: int = 32000
    n_layers: int = 8     # REDUCED from 12
    n_heads: int = 8      # REDUCED from 12
    n_embd: int = 512     # REDUCED from 768
    block_size: int = 512 # REDUCED from 1024 (KEY!)
    dropout: float = 0.1
    bias: bool = True

    # Training Hyperparameters
    batch_size: int = 16                  # REDUCED from 32 (KEY!)
    gradient_accumulation_steps: int = 8  # INCREASED from 4
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Learning Rate Scheduling
    decay_lr: bool = True
    warmup_iters: int = 2000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5

    # Evaluation & Logging
    eval_interval: int = 1000
    eval_iters: int = 100  # REDUCED from 200
    log_interval: int = 100

    # Checkpointing
    save_interval: int = 5000
    checkpoint_dir: str = "checkpoints"

    # Data
    # FIX: `dataset_mix: dict = None` lied about the type and relied on a
    # None sentinel.  Use a default_factory so each instance gets its own
    # fresh dict; __post_init__ below still accepts an explicit None for
    # backward compatibility.
    dataset_mix: dict = field(default_factory=lambda: {'fineweb': 1.0})

    # Hardware
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype: str = 'bfloat16'
    compile: bool = False  # DISABLED torch.compile (uses more memory!)

    # Reproducibility
    seed: int = 42

    def __post_init__(self):
        # Backward compatibility: callers passing dataset_mix=None still get
        # the default single-corpus mix.
        if self.dataset_mix is None:
            self.dataset_mix = {
                'fineweb': 1.0
            }

    @property
    def n_params(self):
        """Rough parameter-count estimate in millions.

        Counts the two embedding tables plus ~12*d^2 weights per layer
        (attention + MLP); it is an approximation, not an exact count.
        """
        return (2 * self.vocab_size * self.n_embd +
                12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
66
+
67
# Shared module-level configuration instance imported by the rest of the project.
config = NanoGPTConfig()


def _print_summary(cfg):
    """Print a short human-readable summary of *cfg*."""
    print(f"Model size: ~{cfg.n_params:.1f}M parameters")
    print(f"Sequence length: {cfg.block_size}")
    print(f"Batch size: {cfg.batch_size}")
    print(f"Effective batch: {cfg.batch_size * cfg.gradient_accumulation_steps}")


if __name__ == "__main__":
    _print_summary(config)
model.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Nano-GPT: GPT-2 style decoder-only transformer
3
+ From scratch implementation
4
+ """
5
+
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+ from config import config
11
+
12
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with an explicit triangular mask.

    One fused linear layer produces Q, K, V for all heads; a lower-triangular
    buffer blocks attention to future positions before the softmax.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_heads == 0

        # Fused projection yielding query, key and value for every head.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Projects the concatenated heads back to the residual stream.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)

        # Dropout on the attention weights and on the residual output.
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.n_heads = config.n_heads
        self.n_embd = config.n_embd
        self.dropout = config.dropout

        # Lower-triangular mask: position i may only attend to positions <= i.
        # (Named "bias" to match the GPT-2 checkpoint layout.)
        mask = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("bias", mask.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        batch, seq_len, width = x.size()  # batch, sequence, embedding
        head_dim = width // self.n_heads

        # Split the fused projection into per-head Q, K, V: (B, H, T, hd).
        q, k, v = (t.view(batch, seq_len, self.n_heads, head_dim).transpose(1, 2)
                   for t in self.c_attn(x).split(self.n_embd, dim=2))

        # Scaled dot-product scores; future positions are pushed to -inf.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        scores = scores.masked_fill(self.bias[:, :, :seq_len, :seq_len] == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values; heads re-concatenated along channels.
        out = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.resid_dropout(self.c_proj(out))
58
+
59
class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back."""

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        # Expand -> nonlinearity -> contract -> dropout.
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
75
+
76
class Block(nn.Module):
    """One transformer layer: pre-norm attention then pre-norm MLP, each
    wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        # Residual around attention, then residual around the feed-forward.
        x = x + self.attn(self.ln_1(x))
        return x + self.mlp(self.ln_2(x))
90
+
91
class NanoGPT(nn.Module):
    """Nano-GPT Model.

    GPT-2 style decoder-only transformer: learned token + positional
    embeddings, a stack of pre-norm blocks, a final LayerNorm, and a
    language-model head whose weights are tied to the token embedding.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),  # learned positional embeddings
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying: token embedding and output projection share one matrix.
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize weights.  Order matters: the generic init runs first,
        # then residual projections (c_proj) are re-drawn with a
        # depth-scaled std (GPT-2 trick to keep residual variance bounded).
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layers))

        print(f"Number of parameters: {self.get_num_params()/1e6:.2f}M")

    def get_num_params(self):
        """Total parameter count; tied tensors are counted once."""
        return sum(p.numel() for p in self.parameters())

    def _init_weights(self, module):
        # GPT-2 style init: N(0, 0.02) for linear/embedding weights, zero biases.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the transformer.

        Args:
            idx: (B, T) token indices, T <= config.block_size.
            targets: optional (B, T) indices for the loss; positions equal
                to -1 are ignored by the cross-entropy.

        Returns:
            (logits, loss).  With targets, logits cover every position and
            loss is the mean cross-entropy; without targets, logits are
            computed for the last position only (shape (B, 1, vocab)) and
            loss is None.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size

        # Embeddings
        pos = torch.arange(0, t, dtype=torch.long, device=device)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)

        # Transformer blocks
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        # Language model head
        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # Inference shortcut: only the last position's logits are needed.
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Generate text.

        Autoregressively samples max_new_tokens tokens, cropping the context
        to the last block_size tokens each step.  `temperature` scales the
        logits before the softmax; `top_k`, if given, restricts sampling to
        the k most likely tokens.  Returns idx with new tokens appended.
        """
        for _ in range(max_new_tokens):
            # Crop context so positions stay within the learned pos-emb range.
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # Mask out everything below the k-th largest logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff