faizack committed on
Commit
8794ddf
·
verified ·
1 Parent(s): 0100f41

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +97 -0
  2. config.json +13 -0
  3. modeling_gpt2_custom.py +323 -0
  4. pytorch_model.bin +3 -0
  5. tokenizer_config.json +9 -0
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # bayes_mini
2
+
3
+ `bayes_mini` is a custom GPT-2 (124M) language model trained from scratch on ~20 GB of English Wikipedia data.
4
+
5
+ ## Architecture
6
+
7
+ - Based on GPT-2 small (124M parameters)
8
+ - 12 layers, 12 attention heads
9
+ - Hidden size: 768
10
+ - Context length: 1024
11
+ - Vocabulary size: 50257
12
+ - Dropout: 0.1
13
+
14
+ ## Training Configuration
15
+
16
+ - Dataset: Cleaned English Wikipedia (~20 GB)
17
+ - Architecture: GPT-2 Small (124M parameters)
18
+ - Optimizer settings: `Foundation better_quality`
19
+ - Hardware: NVIDIA GeForce RTX 4060 (8 GB VRAM)
20
+ - Epochs: 50
21
+ - Batch size: 4 (gradient accumulation steps: 8 -> effective batch size: 32)
22
+ - Learning rate: 2e-4
23
+ - Warmup steps: 2000
24
+ - Weight decay: 0.01
25
+
26
+
27
+ ## Install required packages
28
+ ```bash
29
+ pip install torch transformers tiktoken huggingface_hub
30
+ ```
31
+
32
+ ## Example Usage
33
+
34
+ ```python
35
+ import os
36
+ import torch
37
+ import json
38
+ import tiktoken
39
+ import importlib.util
40
+ from huggingface_hub import hf_hub_download
41
+
42
+ # === CONFIG ===
43
+ REPO_ID = "faizack/bayes_mini_custom"
44
+
45
+ # === Step 1: Download necessary files ===
46
+ config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
47
+ model_path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin")
48
+ modeling_path = hf_hub_download(repo_id=REPO_ID, filename="modeling_gpt2_custom.py")
49
+
50
+ # === Step 2: Dynamically import modeling_gpt2_custom.py ===
51
+ spec = importlib.util.spec_from_file_location("modeling_gpt2_custom", modeling_path)
52
+ mod = importlib.util.module_from_spec(spec)
53
+ spec.loader.exec_module(mod)
54
+ GPTModel = mod.GPTModel # Now you can use GPTModel
55
+
56
+ # === Step 3: Load config ===
57
+ with open(config_path, "r") as f:
58
+ config = json.load(f)
59
+
60
+ model_config = {
61
+ "vocab_size": config["vocab_size"],
62
+ "context_length": config["n_positions"],
63
+ "emb_dim": config["n_embd"],
64
+ "n_heads": config["n_head"],
65
+ "n_layers": config["n_layer"],
66
+ "drop_rate": config["dropout"],
67
+ "qkv_bias": config["qkv_bias"],
68
+ }
69
+
70
+ # === Step 4: Load tokenizer ===
71
+ tokenizer = tiktoken.get_encoding("gpt2")
72
+ prompt = "The rise of artificial intelligence"
73
+ input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
74
+
75
+ # === Step 5: Load model ===
76
+ model = GPTModel(model_config)
77
+ model.load_state_dict(torch.load(model_path, map_location="cpu"))
78
+ model.eval()
79
+
80
+
81
+ # === Step 6: Generate ===
82
+ def generate(model, idx, max_new_tokens=50):
83
+ for _ in range(max_new_tokens):
84
+ idx_cond = idx[:, -model_config["context_length"] :]
85
+ with torch.no_grad():
86
+ logits = model(idx_cond)
87
+ logits = logits[:, -1, :]
88
+ probs = torch.softmax(logits, dim=-1)
89
+ next_token = torch.multinomial(probs, num_samples=1)
90
+ idx = torch.cat([idx, next_token], dim=1)
91
+ return idx
92
+
93
+
94
+ output = generate(model, input_ids)
95
+ print(tokenizer.decode(output[0].tolist()))
96
+
97
+ ```
config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GPTModel"
4
+ ],
5
+ "model_type": "gpt2",
6
+ "vocab_size": 50257,
7
+ "n_positions": 1024,
8
+ "n_embd": 768,
9
+ "n_layer": 12,
10
+ "n_head": 12,
11
+ "dropout": 0.1,
12
+ "qkv_bias": true
13
+ }
modeling_gpt2_custom.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.data import Dataset, DataLoader
5
+
6
+
7
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset over tokenized text.

    Accepts either a full text string or an iterable of string chunks
    (streaming mode). Each sample is an (input_ids, target_ids) pair of
    length ``max_length`` where the targets are the inputs shifted right
    by one token.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # A non-string iterable means streaming mode (e.g. a generator of chunks).
        streaming = hasattr(txt, "__iter__") and not isinstance(txt, (str, bytes))
        if streaming:
            # Accumulate tokens chunk by chunk and slice off windows as soon
            # as enough tokens are buffered for one input/target pair.
            buffer = []
            for piece in txt:
                if isinstance(piece, str):
                    buffer.extend(
                        tokenizer.encode(piece, allowed_special={"<|endoftext|>"})
                    )

                while len(buffer) >= max_length + 1:
                    self.input_ids.append(torch.tensor(buffer[:max_length]))
                    self.target_ids.append(torch.tensor(buffer[1 : max_length + 1]))
                    # Advance the sliding window by `stride` tokens.
                    buffer = buffer[stride:]
        else:
            # Non-streaming: tokenize everything up front, then slide a window.
            token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

            for start in range(0, len(token_ids) - max_length, stride):
                self.input_ids.append(
                    torch.tensor(token_ids[start : start + max_length])
                )
                self.target_ids.append(
                    torch.tensor(token_ids[start + 1 : start + max_length + 1])
                )

    def __len__(self):
        """Number of (input, target) pairs in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input_ids, target_ids) pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
48
+
49
+
50
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of GPT-2-tokenized sliding-window samples.

    ``txt`` may be a full string or an iterable of string chunks
    (streaming); see GPTDatasetV1 for the windowing semantics.
    """
    # GPT-2 BPE tokenizer from tiktoken.
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
75
+
76
+
77
class MultiHeadAttention(nn.Module):
    """Causal (masked) multi-head self-attention.

    Projects the input into per-head queries, keys and values, applies
    scaled dot-product attention with an upper-triangular causal mask,
    and merges the heads back through a final linear projection.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Per-head width; the heads jointly span the full output dimension.
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Strict upper triangle marks "future" positions to be masked out.
        causal = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal)

    def forward(self, x):
        """Attend over ``x`` of shape (b, tokens, d_in); returns (b, tokens, d_out)."""
        b, num_tokens, _ = x.shape

        queries = self.W_query(x)  # (b, tokens, d_out)
        keys = self.W_key(x)
        values = self.W_value(x)

        # (b, T, d_out) -> (b, heads, T, head_dim)
        def split_heads(t):
            return t.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        queries = split_heads(queries)
        keys = split_heads(keys)
        values = split_heads(values)

        # Raw attention logits for every head: (b, heads, T, T).
        attn_scores = queries @ keys.transpose(2, 3)

        # Mask future positions with -inf so softmax assigns them zero weight.
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, heads, T, head_dim) -> (b, T, heads, head_dim) -> (b, T, d_out)
        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        return self.out_proj(context_vec)
135
+
136
+
137
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine."""

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant keeps the division numerically stable.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` feature-wise, then apply scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divide by N), matching GPT-2's layer norm.
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        return self.scale * ((x - mu) / torch.sqrt(sigma2 + self.eps)) + self.shift
149
+
150
+
151
class GELU(nn.Module):
    """GELU activation using the tanh approximation used by GPT-2."""

    def forward(self, x):
        """Apply 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))) elementwise."""
        inner = torch.sqrt(torch.tensor(2.0 / torch.pi)) * (
            x + 0.044715 * torch.pow(x, 3)
        )
        return 0.5 * x * (1 + torch.tanh(inner))
167
+
168
+
169
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x emb_dim, GELU, project back down."""

    def __init__(self, cfg):
        super().__init__()
        hidden = 4 * cfg["emb_dim"]
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], hidden),
            GELU(),
            nn.Linear(hidden, cfg["emb_dim"]),
        )

    def forward(self, x):
        """Apply the MLP independently at every token position."""
        return self.layers(x)
180
+
181
+
182
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual."""

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Run norm -> sublayer -> dropout -> residual-add, twice."""
        # Attention sub-layer with residual connection.
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x))) + residual

        # Feed-forward sub-layer with residual connection.
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x))) + residual

        return x
214
+
215
+
216
class GPTModel(nn.Module):
    """GPT-2-style decoder-only language model.

    Token plus learned positional embeddings, a stack of transformer
    blocks, a final layer norm, and an untied linear head producing
    vocabulary logits.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq) to logits (batch, seq, vocab)."""
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1 share the device of the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        return self.out_head(x)
240
+
241
+
242
import torch.nn.functional as F


def generate_text_simple(
    model,
    idx,
    max_new_tokens: int,
    context_size: int,
    temperature=1.0,
    stream=False,
    tokenizer=None,
):
    """Autoregressively sample ``max_new_tokens`` tokens from ``model``.

    Args:
        model: Callable mapping a (batch, seq) id tensor to (batch, seq, vocab)
            logits.
        idx: Starting token ids, shape (batch, seq).
        max_new_tokens: Number of new tokens to sample.
        context_size: Maximum context window; the running sequence is truncated
            to its last ``context_size`` tokens before each forward pass.
        temperature: Softmax temperature; higher values flatten the distribution.
        stream: If True, return a generator yielding each new token decoded to
            text as it is sampled. If False, run to completion and return the
            full (batch, seq + max_new_tokens) id tensor.
        tokenizer: Object with ``decode(list[int]) -> str``, used to decode
            sampled tokens.

    Raises:
        ValueError: If ``tokenizer`` is None.
    """
    if tokenizer is None:
        raise ValueError("Tokenizer must be provided for decoding.")

    def _gen():
        nonlocal idx
        for _ in range(max_new_tokens):
            # Keep only the most recent context_size tokens.
            idx_cond = idx[:, -context_size:]
            with torch.no_grad():
                logits = model(idx_cond)
            # Sample the next token from the temperature-scaled distribution
            # over the last position's logits.
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
            yield tokenizer.decode(idx_next[0].tolist())

    if stream:
        return _gen()

    # Non-streaming: exhaust the generator silently, then return the
    # accumulated ids. (The previous version imported the third-party
    # `loguru` package here just to emit a log line; removed to keep the
    # module dependent only on torch/tiktoken.)
    for _ in _gen():
        pass
    return idx
283
+
284
+
285
+ if __name__ == "__main__":
286
+
287
+ GPT_CONFIG_124M = {
288
+ "vocab_size": 50257, # Vocabulary size
289
+ "context_length": 1024, # Context length
290
+ "emb_dim": 768, # Embedding dimension
291
+ "n_heads": 12, # Number of attention heads
292
+ "n_layers": 12, # Number of layers
293
+ "drop_rate": 0.1, # Dropout rate
294
+ "qkv_bias": False, # Query-Key-Value bias
295
+ }
296
+
297
+ torch.manual_seed(123)
298
+ model = GPTModel(GPT_CONFIG_124M)
299
+ model.eval() # disable dropout
300
+
301
+ start_context = "Hello, I am"
302
+
303
+ tokenizer = tiktoken.get_encoding("gpt2")
304
+ encoded = tokenizer.encode(start_context)
305
+ encoded_tensor = torch.tensor(encoded).unsqueeze(0)
306
+
307
+ print(f"\n{50*'='}\n{22*' '}IN\n{50*'='}")
308
+ print("\nInput text:", start_context)
309
+ print("Encoded input text:", encoded)
310
+ print("encoded_tensor.shape:", encoded_tensor.shape)
311
+
312
+ out = generate_text_simple(
313
+ model=model,
314
+ idx=encoded_tensor,
315
+ max_new_tokens=10,
316
+ context_size=GPT_CONFIG_124M["context_length"],
317
+ )
318
+ decoded_text = tokenizer.decode(out.squeeze(0).tolist())
319
+
320
+ print(f"\n\n{50*'='}\n{22*' '}OUT\n{50*'='}")
321
+ print("\nOutput:", out)
322
+ print("Output length:", len(out[0]))
323
+ print("Output text:", decoded_text)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:384c50d29354dbc9ff3eac3795e8c5c9b0a4bffb4bf8de5184c38ac3f1cbe847
3
+ size 702554279
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "TiktokenTokenizer",
3
+ "auto_map": {
4
+ "AutoTokenizer": "tiktoken.TiktokenTokenizer"
5
+ },
6
+ "tiktoken_encoding": "gpt2",
7
+ "add_bos_token": false,
8
+ "add_eos_token": false
9
+ }