Upload model and tokenizer
- config.json +1 -0
- modeling_gpt.py +17 -9
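GPTLanguageModel mixes in PyTorchModelHubMixin (see modeling_gpt.py below), which is what makes uploading straight from the model object possible. A minimal sketch of the push/pull round trip; the repo id is hypothetical:

    from modeling_gpt import GPTLanguageModel

    model = GPTLanguageModel()  # defaults from the new signature: vocab_size=32000, n_embd=768, ...

    # The mixin provides save_pretrained / push_to_hub / from_pretrained.
    model.push_to_hub("your-username/gpt-from-scratch")
    reloaded = GPTLanguageModel.from_pretrained("your-username/gpt-from-scratch")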
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+    "model_type": "llama",
     "block_size": 1024,
     "n_embd": 768,
     "n_head": 12,
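The added model_type key is presumably there so Hub tooling and transformers' Auto classes, which dispatch on this field, can identify the checkpoint; note it is set to "llama" even though the model is a custom GPT. A quick sketch of inspecting the updated config (plain json; the local path is hypothetical):

    import json

    with open("config.json") as f:
        config = json.load(f)

    print(config["model_type"])                    # "llama"
    print(config["block_size"], config["n_embd"])  # 1024 768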
modeling_gpt.py CHANGED
@@ -69,6 +69,7 @@ class FeedFoward(nn.Module): # this is Karpathy's MLP -> feed forward
         return x
     """ a simple linear layer followed by a non-linearity """
 
+
 class Block(nn.Module):
     """ Transformer block: communication followed by computation """
 
@@ -87,22 +88,22 @@ class Block(nn.Module):
 
 class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
 
-    def __init__(self, vocab_size=
+    def __init__(self, vocab_size = 32000, block_size=1024, n_embd=768, n_head=12, n_layer=12):
         super().__init__()
         print("This is vocab size:", vocab_size)
-        self.token_embedding_table = nn.Embedding(vocab_size,
-        self.position_embedding_table = nn.Embedding(
+        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
+        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
         self.blocks = nn.Sequential(
-            *[Block(
+            *[Block(N_EMBD, n_head=N_HEAD) for _ in range(N_LAYER)]
         )
-        self.ln_f = nn.LayerNorm(
-        self.lm_head = nn.Linear(
+        self.ln_f = nn.LayerNorm(N_EMBD)
+        self.lm_head = nn.Linear(N_EMBD, vocab_size)
 
         self.token_embedding_table.weight = self.lm_head.weight
 
         self.apply(self._init_weights)
-        self.config = {"BLOCK_SIZE":
-
+        self.config = {"BLOCK_SIZE": BLOCK_SIZE, "N_EMBD": N_EMBD, "N_HEAD":N_HEAD, "N_LAYER": N_LAYER}
+
 
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
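Two things stand out in the new __init__. First, the signature now takes block_size, n_embd, n_head, and n_layer, but the body still reads the module-level constants BLOCK_SIZE, N_EMBD, N_HEAD, N_LAYER, so the new arguments are effectively ignored. Second, the token embedding is tied to lm_head, the standard weight-tying trick; the later _init_weights pass re-initializes the shared tensor, but the tie survives because both attributes reference the same Parameter. A standalone sketch of the tying pattern (sizes hypothetical):

    import torch.nn as nn

    vocab_size, n_embd = 32000, 768
    tok_emb = nn.Embedding(vocab_size, n_embd)  # weight shape: (vocab_size, n_embd)
    lm_head = nn.Linear(n_embd, vocab_size)     # weight shape: (vocab_size, n_embd)

    # Share one parameter tensor between input embedding and output projection.
    tok_emb.weight = lm_head.weight
    assert tok_emb.weight.data_ptr() == lm_head.weight.data_ptr()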
@@ -115,6 +116,8 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
+
+
     def forward(self, idx, targets=None):
         B, T = idx.shape
         assert T <= BLOCK_SIZE, f"Cannot forward sequence of length {T}, block size is only {BLOCK_SIZE}"
@@ -175,11 +178,13 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         # Load the state dict
         state_dict = torch.load(path)["model"]
 
+        # Rename the keys to match the expected ones (remove "orig_mod." prefix)
         new_state_dict = {}
         for key, value in state_dict.items():
             new_key = key.replace('_orig_mod.', '')  # Remove 'orig_mod.' prefix
             new_state_dict[new_key] = value
 
+        # Load the renamed state dict into the model
         self.load_state_dict(new_state_dict)
 
 
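The '_orig_mod.' prefix is what torch.compile puts on state-dict keys when the compiled wrapper is saved, so stripping it lets the checkpoint load into the plain module. One caveat with str.replace: it substitutes the pattern anywhere in a key, not only at the start. A hedged alternative using removeprefix (Python 3.9+):

    # Strip the torch.compile wrapper prefix only where it actually occurs: at the start of a key.
    new_state_dict = {k.removeprefix("_orig_mod."): v for k, v in state_dict.items()}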
@@ -197,4 +202,7 @@ class GPTLanguageModel(nn.Module, PyTorchModelHubMixin):
         use_fused = fused_available and device == "cuda"
         optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused = use_fused)
         return optimizer
-
+
+    def load_optimizer(self, optimizer, path):
+        optimizer.load_state_dict(torch.load(path)["optimizer"])
+        return optimizer
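The new load_optimizer mirrors the model loader and expects the same checkpoint layout, a dict with "model" and "optimizer" entries; note also that the fused AdamW path is gated on CUDA availability. A hedged round-trip sketch (path hypothetical; assumes model and optimizer already exist):

    import torch

    # Save both halves into one checkpoint, matching the layout the two loaders expect.
    torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict()}, "ckpt.pt")

    # Later: restore. load_state_dict mutates the optimizer in place; returning it makes the call chainable.
    optimizer = model.load_optimizer(optimizer, "ckpt.pt")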