## Adam's model.py
### add dependencies
import os
import bz2
import csv
import torch
print(torch.__version__)
print(torch.cuda.is_available()) ## looking for True
import torch.nn as NN
import torch.nn.functional as F
import torch.optim as optim
from collections import deque
import itertools
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
### hyperparameters
windowsize = 2 # context words on either side of the target word
windowsize = windowsize * 2 + 1 # total window length: context on both sides plus the target word (5)
split_ratio = 0.8 # 80% for training, 20% for testing
embed_dim = 111
### Goal - Import text8
text8 = bz2.open('wikipedia_data.txt.bz2', 'rt').read() # Read the text8 dataset from a bz2 compressed file #### Not actually .bz2 at the moment, but this is how it will be in the future
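# The note above says the file is not actually bz2-compressed yet. A hedged sketch of a
# fallback (assuming the plain-text copy would be called 'wikipedia_data.txt', which is an
# assumption, not something defined in this script):
#try:
#    text8 = bz2.open('wikipedia_data.txt.bz2', 'rt').read()
#except OSError:
#    text8 = open('wikipedia_data.txt', 'rt').read()  # fall back to the uncompressed file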
text8 = text8.split() # Split the text into words
text8.append('<unk>') # Add an unknown token to the vocabulary
#>>> len(text8)
#17005207
#>>> len(set(text8))
#253854
#
# print(f"Number of words in text8: {len(text8)}") # Uncomment to see the number of words in the dataset
# print(f"First 10 words in text8: {text8[:10]}") # Uncomment to see the first 10 words in the dataset
# print(f"Distinct words in text8: {len(set(text8))}") # Uncomment to see the number of distinct words in the dataset
### tokenize text8
vocablist = set(text8) ## deduping, not sure this is required
vocabsize = len(vocablist) # Number of unique words in the vocabulary
word2idx = {w: i for i, w in enumerate(sorted(vocablist))} ## i sets an index, w is the word
unk_idx = word2idx['<unk>'] # Index for the unknown token
idx2word = {i: w for w, i in word2idx.items()}
windows = list(zip(*[iter(text8)]*windowsize)) # Group words into consecutive, non-overlapping windows of windowsize tokens
#3401041
split = int(len(windows) * split_ratio) # Split the dataset into training and testing sets
train_windows = windows[:split]
test_windows = windows[split:]
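# The zip(*[iter(text8)]*windowsize) idiom above cuts the corpus into consecutive,
# non-overlapping windows of windowsize (= 5) tokens. A small illustration with a
# made-up sentence (not from the dataset):
#demo = "the quick brown fox jumps over the lazy dog today".split()
#print(list(zip(*[iter(demo)]*5)))
## -> [('the', 'quick', 'brown', 'fox', 'jumps'), ('over', 'the', 'lazy', 'dog', 'today')]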
#train_dataset = text8[:len(text8)*0.8] # 80% for training
#test_dataset = text8[len(text8)*0.8:] # 20% for testing
#def train_generator(windows, word2idx, unk_idx):
# """Generator function to yield context and target pairs for training."""
# for w1, w2, w3, w4, w5 in windows:
# ctx = [word2idx.get(w, unk_idx) for w in (w1, w2, w4, w5)]
#        tgt = word2idx.get(w3, unk_idx)
#        yield torch.tensor(ctx), tgt
#traintensors = train_generator(train_dataset, word2idx, unk_idx)
#testtensors = train_generator(test_dataset, word2idx, unk_idx)
class MaskedCBOWDataset(torch.utils.data.IterableDataset):
    """Streams (context, target) pairs from the pre-built 5-word windows."""
    def __init__(self, windows, word2idx, unk_idx):
        self.windows = windows      # list of 5-word tuples
        self.word2idx = word2idx    # word -> index lookup
        self.unk_idx = unk_idx      # fallback index for out-of-vocabulary words
    def __iter__(self):
        for w1, w2, w3, w4, w5 in self.windows:
            ctx = [self.word2idx.get(w, self.unk_idx) for w in (w1, w2, w4, w5)]  # the four surrounding words
            tgt = self.word2idx.get(w3, self.unk_idx)                             # the masked middle word
            yield torch.tensor(ctx), tgt
#train_dataset = MaskedCBOWDataset(train_windows, word2idx, unk_idx)
#train_loader = DataLoader(train_dataset, batch_size=128)
# Example usage of the generator
#for context, target in gen:
# print(context, target)
#/eg
### create DataLoaders
train_loader = torch.utils.data.DataLoader(
MaskedCBOWDataset(train_windows, word2idx, unk_idx),
batch_size=64,
#shuffle=True
)
test_loader = torch.utils.data.DataLoader(
MaskedCBOWDataset(test_windows, word2idx, unk_idx),
batch_size=64,
#shuffle=False
)
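# Note: DataLoader refuses shuffle=True for an IterableDataset, which is presumably why the
# shuffle arguments above are commented out. A simple workaround (a sketch, not part of the
# original flow) is to shuffle the list of windows itself before wrapping it in the dataset:
#import random
#random.shuffle(train_windows)  # in-place shuffle of the 5-word window tuples
#train_loader = torch.utils.data.DataLoader(
#    MaskedCBOWDataset(train_windows, word2idx, unk_idx),
#    batch_size=64,
#)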
#print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
for i, (context, target) in enumerate(train_loader):
    print(f"Batch {i}:")
    print(f" Context shape: {context.shape}") # expect [batch_size, 4]
    print(f" Target shape: {target.shape}") # expect [batch_size]
    print(f" First row: {context[0].tolist()} → {target[0].item()}")
    if i == 2: break # only show a few batches
### create model
class word2vec(NN.Module): ### This creates a class for our specific NN, inheriting from the pytorch equivalent
    def __init__(self):
        super().__init__() ## super goes up one level to the torch NN module, and initializes the net
        self.emb = NN.Embedding(vocabsize, embed_dim) # 111 to be different
        self.out = NN.Linear(embed_dim, vocabsize) # predict vocab word from averaged context
    def forward(self, x): # x: [batch, 4]
        x = self.emb(x)    # → [batch, 4, embed_dim]
        x = x.mean(dim=1)  # → [batch, embed_dim] ← averaging context vectors
        x = F.relu(x)      # optional, but can help
        x = self.out(x)    # → [batch, vocab_size]
        return x           # raw logits
loss_function = NN.CrossEntropyLoss() # using built-in loss function
model = word2vec().to(device) ##create the model as described above
optimizer = optim.Adam(model.parameters(), lr=0.001) ### lr = learning rate, 0.001 is apparently a "normal" value. Adam is the optimizer chosen, also fairly default
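# Quick sanity check (optional sketch): run one batch through the untrained model to confirm
# the output shape is [batch_size, vocabsize] before starting the full training pass:
#context, target = next(iter(train_loader))
#with torch.no_grad():
#    print(model(context.to(device)).shape)  # expect torch.Size([64, vocabsize])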
##### do training
num_epochs = 1 ## passes through the dataset
for epoch in range(num_epochs):
    for context, target in train_loader: # note: uses the batches defined earlier
        context = context.to(device) # move data to the selected device
        target = target.to(device)
        optimizer.zero_grad() ### reset gradients each time
        outputs = model(context) # forward pass
        loss = loss_function(outputs, target)
        loss.backward() ## backprop provided by pytorch autograd, very convenient
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
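### evaluate on the test split (sketch)
# test_loader is built above but never used; a minimal, hedged evaluation sketch that
# reports the average cross-entropy loss on the held-out windows:
#model.eval()
#total_loss, batches = 0.0, 0
#with torch.no_grad():
#    for context, target in test_loader:
#        context, target = context.to(device), target.to(device)
#        total_loss += loss_function(model(context), target).item()
#        batches += 1
#print(f"Test loss: {total_loss / batches:.4f}")
#model.train()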
### output weights
torch.save(model.state_dict(), "ABembeddingsweights.pth")
torch.save(model, "ABembeddingsfullmodel.pth")
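# A hedged sketch of how the saved weights could be reloaded later and queried for nearest
# neighbours by cosine similarity; 'king' is just an example query word:
#reloaded = word2vec().to(device)
#reloaded.load_state_dict(torch.load("ABembeddingsweights.pth", map_location=device))
#emb = F.normalize(reloaded.emb.weight.detach(), dim=1)   # unit-length rows -> dot product = cosine
#query = word2idx.get('king', unk_idx)
#sims = emb @ emb[query]                                   # similarity of every word to the query
#top = torch.topk(sims, 6).indices.tolist()                # the query word itself ranks first
#print([idx2word[i] for i in top])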
### / training
### train model