# Leap0 Model

## Model Description

This is the Leap0 model, designed for text generation tasks. It uses the GPT-2 tokenizer and a GPT-2-style architecture, and is trained specifically on the Tiny Stories dataset.
## Model Architecture

- **Model Type**: GPT-2
- **Number of Layers**: 8
- **Number of Heads**: 8
- **Embedding Size**: 768
- **Block Size (context length)**: 768
- **Vocabulary Size**: 50257
- **Dropout Rate**: 0.1
- **Attention Mechanism**: Causal Self-Attention
- **Tokenizer**: GPT-2 tokenizer

These values correspond to the fields of the `config.json` consumed by the usage script below; see the sketch after this list.
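If you need to create that `config.json` yourself, here is a minimal sketch; it assumes the file simply mirrors the fields of the `GPTConfig` dataclass defined in the usage script (not confirmed by this repo, so adjust as needed):

```python
import json

# Hyperparameters from the list above, keyed by the GPTConfig field names
# used in the usage script below (assumed to match the repo's config.json).
config = {
    "block_size": 768,
    "vocab_size": 50257,
    "n_layer": 8,
    "n_head": 8,
    "n_embd": 768,
    "dropout": 0.1,
    "model_type": "custom_gpt",
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)
```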
## Training Details

- **Dataset**: Tiny Stories (see the loading sketch below)
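Tiny Stories is available on the Hugging Face Hub; a minimal loading sketch, assuming the `roneneldan/TinyStories` dataset id (the exact Hub path used for training is not stated in this repo):

```python
from datasets import load_dataset

# "roneneldan/TinyStories" is an assumed Hub id; adjust if the training data
# came from a different source. The dataset exposes a single "text" column.
tiny_stories = load_dataset("roneneldan/TinyStories", split="train")
print(tiny_stories[0]["text"])
```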
## How to Use

Change `input_text` in the script below to the prompt you want the model to complete.

```python
import json
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import GPT2Tokenizer
from safetensors.torch import load_file

# Define the CausalSelfAttention class
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y

# Define the MLP class
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

# Define the Block class
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

# Define the GPTConfig class
@dataclass
class GPTConfig:
    block_size: int = 768
    vocab_size: int = 50257
    n_layer: int = 8
    n_head: int = 8
    n_embd: int = 768
    dropout: float = 0.1
    model_type: str = "custom_gpt"

    def to_dict(self):
        return self.__dict__

    @classmethod
    def from_dict(cls, config_dict):
        return cls(**config_dict)

# Define the GPT class
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize parameters
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

# Manually specify the paths to the config and model files
config_path = "/home/nll-workstation/Desktop/config.json"
model_path = "/home/nll-workstation/Desktop/model.safetensors"

# Load the configuration from the specified JSON file
with open(config_path, "r") as f:
    config_dict = json.load(f)
config = GPTConfig.from_dict(config_dict)

# Load the model weights from the specified .safetensors file
tensors = load_file(model_path)

# Instantiate the model with the loaded config
model = GPT(config)

# Load the state dict (weights) into the model
model.load_state_dict(tensors, strict=False)

# Set the model to evaluation mode
model.eval()

# Load the tokenizer (same tokenizer used during training)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Prepare input text and tokenize it
input_text = "once upon a time in the village of "
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Run inference (forward pass) through the model
with torch.no_grad():
    logits, _ = model(input_ids)  # the model returns (logits, loss); loss is None here

# Take the argmax over the vocabulary at every position. This is the model's
# greedy next-token prediction for each position of the prompt, decoded in a
# single pass; it is not an autoregressive continuation of the prompt.
predicted_ids = torch.argmax(logits, dim=-1)

# Convert predicted token IDs to text
output_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

# Print input and output
print("Input Text:", input_text)
print("Output Text:", output_text)
```