refactor: Update pyproject.toml to include pytorch-gpu dependency and its source URL
- bad_gpt.py +32 -18
- dataset.py +2 -1
- poetry.lock +0 -0
- pyproject.toml +7 -1
bad_gpt.py
CHANGED
@@ -55,37 +55,43 @@ class BadGPTModel(nn.Module):
 
     # Given a 2d matrix of dimensions token and sentence
     # generate new tokens in the next sentence
-    def generate(self,
-        for
+    def generate(self, ctx: torch.Tensor, max_new_tokens: int):
+        for index in range(max_new_tokens):
             # Log progress so I don't go insane
-            if
-            logger.debug(f'Iteration {
+            if index % 16 == 0:
+                logger.debug(f'Iteration {index} of {max_new_tokens}')
             # Crop out the last block_size tokens
-
-            logits = self(
+            cropped_ctx = ctx[:, -self.block_size:]
+            logits = self(cropped_ctx)
             # Logits has dimensions token, sentence, token_list
             # We want to make a new sentence, so only look at the last sentence
             logits = logits[:, -1, :]
             # Get possible next tokens and select one
             probabilities = F.softmax(logits, dim=-1)
-
+            ctx_next = torch.multinomial(probabilities, num_samples=1)
             # Add the new token to the end of the tensor
-
-        return
+            ctx = torch.cat((ctx, ctx_next), dim=1)
+        return ctx
 
 
 @torch.no_grad()
-def estimate_loss(
+def estimate_loss(gpt: BadGPTModel, batcher: Batcher, eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
     out = {}
+    gpt.eval()
     for split in ['train', 'val']:
         losses = torch.zeros(eval_interval)
-        for
-
-        logits
-
+        for epoch in range(eval_interval):
+            train, answer = batcher.get_batch(split='train')
+            logits = gpt.forward(train)
+            # Reformat prediction and answer so each entry can be compared
+            batch, block, vocab = logits.shape
+            logits = logits.view(batch * block, vocab)
+            answer = answer.view(batch * block)
+            # Compare entropy of predicted tokens to actual
+            loss = F.cross_entropy(logits, answer).item()
+            losses[epoch] = loss
         out[split] = losses.mean()
-
+    gpt.train()
     return out
 
 

@@ -123,8 +129,14 @@ class BadGPTTrainer():
                 logger.debug(
                     f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
             context_stack, answer_stack = self.batcher.get_batch(split='train')
-
+            logits = self.model(context_stack.to(
                 self.device), answer_stack.to(self.device))
+            batch, block, vocab = logits.shape
+            # Reformat logits and val so each entry can be compared
+            logits = logits.view(batch * block, vocab).to(self.device)
+            answer_stack = answer_stack.view(batch * block).to(self.device)
+            # Compare predicted tokens to actual
+            loss = F.cross_entropy(logits, answer_stack)
             self.optimizer.zero_grad(set_to_none=True)
             loss.backward()
             self.optimizer.step()

@@ -167,6 +179,8 @@ class BadGPT():
             learning_rate=lr
         )
         self._trainer.train()
+        # set to eval phase since we're only taking user input from here on
+        self._model.eval()
 
     def generate(self, prompt: str, response_size: int):
         start_ids = encode(prompt)

@@ -174,5 +188,5 @@ class BadGPT():
         # add batch dimension. it's just 1 batch, but we still need it cuz tensors
         context = context[None, ...]
         encoded = self._model.generate(
-
+            ctx=context, max_new_tokens=response_size)[0]
         return decode(encoded.tolist())
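For reference, a minimal sketch of the new call path, mirroring the BadGPT.generate call site in this diff; encode, decode, and a trained model (a BadGPTModel instance) are assumed to come from the surrounding project:

import torch

# Hypothetical prompt; encode/decode are the project's tokenizer helpers
start_ids = encode("To be, or not to be")
context = torch.tensor(start_ids, dtype=torch.long)
context = context[None, ...]  # add the batch dimension, as the diff notes
encoded = model.generate(ctx=context, max_new_tokens=32)[0]
print(decode(encoded.tolist()))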
dataset.py
CHANGED
@@ -27,6 +27,7 @@ class Batcher():
     def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
         self.device = device
         self.batch_size = batch_size
+        self.block_size = block_size
         from dataset import make_dataset
         train_data = make_dataset('train')
         val_data = make_dataset('validation')

@@ -41,5 +42,5 @@ class Batcher():
         context_stack = torch.stack(
             [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
         answer_stack = torch.stack(
-            [data[i+1:i+self.block_size+1] for i in random_indexes])
+            [data[i+1:i+self.block_size+1] for i in random_indexes]).to(self.device)
         return context_stack, answer_stack
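The .to(self.device) added to answer_stack is a real fix: F.cross_entropy requires its input and target tensors to live on the same device. A minimal sketch of the failure mode, using hypothetical shapes and assuming a CUDA device is available:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 65, device='cuda')  # predictions already on the GPU
answers = torch.randint(0, 65, (4,))        # targets accidentally left on the CPU
# F.cross_entropy(logits, answers)          # raises a device-mismatch RuntimeError
loss = F.cross_entropy(logits, answers.to('cuda'))  # same device, as this commit ensures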
poetry.lock
CHANGED
The diff for this file is too large to render.
pyproject.toml
CHANGED
@@ -8,7 +8,7 @@ package-mode = false
 
 [tool.poetry.dependencies]
 python = "^3.10"
-torch = "^2.3.0"
+torch = { version = "^2.3.0", source = "pytorch-gpu" }
 numpy = "^1.26.4"
 datasets = "^2.19.0"
 tiktoken = "^0.6.0"

@@ -17,6 +17,12 @@ tiktoken = "^0.6.0"
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.4"
 
+
+[[tool.poetry.source]]
+name = "pytorch-gpu"
+url = "https://download.pytorch.org/whl/cu118"
+priority = "supplemental"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
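With the supplemental pytorch-gpu source in place, a quick sanity check after poetry install (assuming the cu118 wheel resolved as intended):

import torch

print(torch.__version__)          # a CUDA build reports something like "2.3.0+cu118"
print(torch.cuda.is_available())  # True once the GPU wheel and local driver line up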