Adam Beedell committed
Commit · b4ee00a
Parent(s): a1093cf

please oh god work this time

Browse files:
- ABModel.py +177 -0
- pytorch_model.bin +3 -0
ABModel.py ADDED
@@ -0,0 +1,177 @@
## Adam's model.py

### add dependencies
import os
import bz2
import csv
import torch
print(torch.__version__)
print(torch.cuda.is_available()) ## looking for True
import torch.nn as NN
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
import itertools

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")


### hyperparameters
windowsize = 2 # words either side of the target word
windowsize = windowsize * 2 + 1 # full window length including the target word: 2*2 + 1 = 5
split_ratio = 0.8 # 80% for training, 20% for testing
embed_dim = 111
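
### one-off compression helper (an illustrative sketch, not part of the original
### script): the read below expects a .bz2 file, but the note there says the data
### is not actually compressed yet. 'wikipedia_data.txt' is an assumed name for
### the uncompressed source; this is also the only use of the os import.
if not os.path.exists('wikipedia_data.txt.bz2') and os.path.exists('wikipedia_data.txt'):
    with open('wikipedia_data.txt', 'rb') as src, bz2.open('wikipedia_data.txt.bz2', 'wb') as dst:
        dst.write(src.read()) # read and compress in one go; fine for text8-sized files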

### Goal - Import text8

with bz2.open('wikipedia_data.txt.bz2', 'rt') as f: # Read the text8 dataset from a bz2-compressed file #### Not actually .bz2 at the moment, but this is how it will be in the future
    text8 = f.read()
text8 = text8.split() # Split the text into words
text8.append('<unk>') # Add an unknown token so it ends up in the vocabulary

#>>> len(text8)
#17005207
#>>> len(set(text8))
#253854
#
# print(f"Number of words in text8: {len(text8)}") # Uncomment to see the number of words in the dataset
# print(f"First 10 words in text8: {text8[:10]}") # Uncomment to see the first 10 words in the dataset
# print(f"Distinct words in text8: {len(set(text8))}") # Uncomment to see the number of distinct words in the dataset

### tokenize text8

vocablist = set(text8) ## dedupe into a vocabulary of unique words
vocabsize = len(vocablist) # Number of unique words in the vocabulary
word2idx = {w: i for i, w in enumerate(sorted(vocablist))} ## i is the index, w is the word

unk_idx = word2idx['<unk>'] # Index for the unknown token
idx2word = {i: w for w, i in word2idx.items()}

windows = list(zip(*[iter(text8)]*windowsize)) # Group words into non-overlapping windows of windowsize (5) words

#>>> len(windows)
#3401041

split = int(len(windows) * split_ratio) # Split the dataset into training and testing sets
train_windows = windows[:split]
test_windows = windows[split:]
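
## quick sanity check of the grouping idiom above (an illustrative sketch, not
## used for training): zip(*[iter(seq)]*5) advances one shared iterator five
## steps per window, so windows do NOT overlap -- each word lands in exactly
## one window, and only every fifth word ever becomes a prediction target
demo = list(zip(*[iter("the quick brown fox jumps over the lazy dog again".split())] * windowsize))
print(demo) # [('the', 'quick', 'brown', 'fox', 'jumps'), ('over', 'the', 'lazy', 'dog', 'again')]

## sliding-window alternative (a sketch -- my guess at where the unused
## deque/itertools imports were headed, not something this script uses):
## overlapping windows recover the ~5x training pairs the grouping above skips
def sliding_windows(words, size=windowsize):
    it = iter(words)
    win = deque(itertools.islice(it, size), maxlen=size) # prime the first window
    if len(win) == size:
        yield tuple(win)
    for w in it:
        win.append(w) # maxlen=size drops the oldest word automatically
        yield tuple(win)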

#train_dataset = text8[:int(len(text8)*0.8)] # 80% for training (slice indices must be ints)
#test_dataset = text8[int(len(text8)*0.8):] # 20% for testing

#def train_generator(windows, word2idx, unk_idx):
#    """Generator function to yield context and target pairs for training."""
#    for w1, w2, w3, w4, w5 in windows:
#        ctx = [word2idx.get(w, unk_idx) for w in (w1, w2, w4, w5)]
#        tgt = word2idx.get(w3, unk_idx) # must be computed before the yield, not after
#        yield torch.tensor(ctx), tgt

#traintensors = train_generator(train_dataset, word2idx, unk_idx)
#testtensors = train_generator(test_dataset, word2idx, unk_idx)

class MaskedCBOWDataset(torch.utils.data.IterableDataset):
    def __init__(self, windows, word2idx, unk_idx):
        self.windows = windows
        self.word2idx = word2idx
        self.unk_idx = unk_idx

    def __iter__(self):
        for w1, w2, w3, w4, w5 in self.windows:
            ctx = [self.word2idx.get(w, self.unk_idx) for w in (w1, w2, w4, w5)]
            tgt = self.word2idx.get(w3, self.unk_idx)
            yield torch.tensor(ctx), tgt


#train_dataset = MaskedCBOWDataset(train_windows, word2idx, unk_idx)
#train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128)

# Example usage of the dataset

#for context, target in train_dataset:
#    print(context, target)

#/eg
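
## pull one sample straight from the dataset (a quick live check, a sketch):
## iter() on an IterableDataset calls __iter__ directly, no DataLoader needed
sample_ctx, sample_tgt = next(iter(MaskedCBOWDataset(train_windows, word2idx, unk_idx)))
print(sample_ctx, sample_tgt) # a tensor of 4 context ids and one target id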

### create dataloaders

# Create DataLoaders
train_loader = torch.utils.data.DataLoader(
    MaskedCBOWDataset(train_windows, word2idx, unk_idx),
    batch_size=64,
    #shuffle=True # DataLoader rejects shuffle for an IterableDataset; shuffle the windows list beforehand instead
)
test_loader = torch.utils.data.DataLoader(
    MaskedCBOWDataset(test_windows, word2idx, unk_idx),
    batch_size=64,
    #shuffle=False
)
#print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")

for i, (context, target) in enumerate(train_loader):
    print(f"Batch {i}:")
    print(f"  Context shape: {context.shape}") # expect [batch_size, 4]
    print(f"  Target shape: {target.shape}") # expect [batch_size]
    print(f"  First row: {context[0].tolist()} → {target[0].item()}")
    if i == 2: break # only show a few batches

### create model

class word2vec(NN.Module): ### a class for our specific network, inheriting from PyTorch's Module
    def __init__(self):
        super().__init__() ## super goes up one level to torch's Module and initialises it
        self.emb = NN.Embedding(vocabsize, embed_dim) # 111 to be different
        self.out = NN.Linear(embed_dim, vocabsize) # predict the vocab word from the averaged context

    def forward(self, x): # x: [batch, 4]
        x = self.emb(x) # → [batch, 4, embed_dim]
        x = x.mean(dim=1) # → [batch, embed_dim] ← averaging the context vectors
        x = F.relu(x) # optional, but can help
        x = self.out(x) # → [batch, vocab_size]
        return x # raw logits

loss_function = NN.CrossEntropyLoss() # built-in loss; it applies log-softmax to the raw logits internally
model = word2vec().to(device) ## create the model as described above
optimizer = optim.Adam(model.parameters(), lr=0.001) ### lr = learning rate; 0.001 is a common default. Adam is the optimizer, also a fairly standard choice
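
## one-batch shape check (an illustrative sketch): push a dummy context batch
## through the untrained model and confirm the logits come out per-vocab-word
dummy = torch.randint(0, vocabsize, (2, 4), device=device) # [batch=2, context=4]
print(model(dummy).shape) # expect torch.Size([2, vocabsize])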

##### do training

num_epochs = 1 ## passes through the dataset

for epoch in range(num_epochs):
    for context, target in train_loader: # note: uses the batches defined earlier

        context = context.to(device) # move data to the selected device
        target = target.to(device)
        optimizer.zero_grad() ### reset gradients each step

        outputs = model(context) # forward pass
        loss = loss_function(outputs, target)

        loss.backward() ## backprop via autograd, using the graph recorded during the forward pass -- very convenient
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}") # loss of the last batch in the epoch
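
## held-out check (a sketch -- test_loader is built above but never read; this
## is one plausible way it was meant to be used):
model.eval()
with torch.no_grad():
    total_loss, batches = 0.0, 0
    for context, target in test_loader:
        context, target = context.to(device), target.to(device)
        total_loss += loss_function(model(context), target).item()
        batches += 1
print(f"Mean test loss: {total_loss / max(batches, 1)}")
model.train()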

### output weights

torch.save(model.state_dict(), "ABembeddingsweights.pth")
torch.save(model, "ABembeddingsfullmodel.pth") # full pickled model; reloading it needs this class definition in scope


### / training


### train model
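
## nearest-neighbour sanity check on the learned embeddings (a sketch; 'king'
## is an arbitrary probe -- any word in the vocabulary works):
emb = F.normalize(model.emb.weight.detach(), dim=1) # unit-length rows, so a dot product is cosine similarity
probe = emb[word2idx.get('king', unk_idx)]
top = (emb @ probe).topk(6).indices.tolist() # the best match is the probe word itself
print([idx2word[i] for i in top])

## reload sketch: round-trip the saved state_dict into a fresh instance
model2 = word2vec().to(device)
model2.load_state_dict(torch.load("ABembeddingsweights.pth", map_location=device))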
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f3131deb251e881efb6f4d60f5c894b3334eab20d3ce828db674ad96d67b9062
size 226442143