Adam Beedell committed on
Commit
b4ee00a
·
1 Parent(s): a1093cf

please oh god work this time

Files changed (2)
  1. ABModel.py +177 -0
  2. pytorch_model.bin +3 -0
ABModel.py ADDED
@@ -0,0 +1,177 @@
+ ## Adam's model.py
+
+ ### add dependencies
+ import os
+ import bz2
+ import csv
+ import torch
+ print(torch.__version__)
+ print(torch.cuda.is_available())  ## looking for True
+ import torch.nn as NN
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from collections import deque
+ import itertools
+
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+ elif torch.backends.mps.is_available():
+     device = torch.device("mps")
+ else:
+     device = torch.device("cpu")
+
+
+ ### hyperparameters
+ windowsize = 2  # words either side of the target word
+ windowsize = windowsize * 2 + 1  # full window: context on both sides plus the target word itself
+ split_ratio = 0.8  # 80% for training, 20% for testing
+ embed_dim = 111
+
+
+ ### Goal - Import text8
+
+ text8 = bz2.open('wikipedia_data.txt.bz2', 'rt').read()  # Read the text8 dataset from a bz2-compressed file #### Not actually .bz2 at the moment, but this is how it will be in the future
+ text8 = text8.split()  # Split the text into words
+ text8.append('<unk>')  # Add an unknown token to the vocabulary
+
+ #>>> len(text8)
+ #17005207
+ #>>> len(set(text8))
+ #253854
+ #
+ # print(f"Number of words in text8: {len(text8)}")  # Uncomment to see the number of words in the dataset
+ # print(f"First 10 words in text8: {text8[:10]}")  # Uncomment to see the first 10 words in the dataset
+ # print(f"Distinct words in text8: {len(set(text8))}")  # Uncomment to see the number of distinct words in the dataset
+
+
+ ### tokenize text8
+
+ vocablist = set(text8)  ## dedupe the tokens to get the vocabulary
+ vocabsize = len(vocablist)  # Number of unique words in the vocabulary
+ word2idx = {w: i for i, w in enumerate(sorted(vocablist))}  ## i is the index, w is the word
+
+ unk_idx = word2idx['<unk>']  # Index for the unknown token
+ idx2word = {i: w for w, i in word2idx.items()}
+
+ windows = list(zip(*[iter(text8)]*windowsize))  # Group words into consecutive, non-overlapping windows of windowsize words
+
+ #3401041 windows for the full text8 corpus
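+ # Toy illustration (names here are only for this comment, not used elsewhere) of what
+ # the zip(*[iter(...)]*windowsize) trick produces: it chops the token list into
+ # consecutive, non-overlapping 5-word windows and silently drops any leftover tail.
+ #   toy = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'again', 'now']
+ #   list(zip(*[iter(toy)]*5))
+ #   # -> [('the', 'quick', 'brown', 'fox', 'jumps'), ('over', 'the', 'lazy', 'dog', 'again')]  # 'now' is dropped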
+
+ split = int(len(windows) * split_ratio)  # Split the dataset into training and testing sets
+ train_windows = windows[:split]
+ test_windows = windows[split:]
+
+
+ #train_dataset = text8[:int(len(text8) * split_ratio)]  # 80% for training
+ #test_dataset = text8[int(len(text8) * split_ratio):]  # 20% for testing
+
+ #def train_generator(windows, word2idx, unk_idx):
+ #    """Generator function to yield context and target pairs for training."""
+ #    for w1, w2, w3, w4, w5 in windows:
+ #        ctx = [word2idx.get(w, unk_idx) for w in (w1, w2, w4, w5)]
+ #        tgt = word2idx.get(w3, unk_idx)
+ #        yield torch.tensor(ctx), tgt
+
+
+ #traintensors = train_generator(train_windows, word2idx, unk_idx)
+ #testtensors = train_generator(test_windows, word2idx, unk_idx)
+
+
+
+ class MaskedCBOWDataset(torch.utils.data.IterableDataset):
+     def __init__(self, windows, word2idx, unk_idx):
+         self.windows = windows
+         self.word2idx = word2idx
+         self.unk_idx = unk_idx
+
+     def __iter__(self):
+         for w1, w2, w3, w4, w5 in self.windows:
+             ctx = [self.word2idx.get(w, self.unk_idx) for w in (w1, w2, w4, w5)]
+             tgt = self.word2idx.get(w3, self.unk_idx)
+             yield torch.tensor(ctx), tgt
+
+
+ #train_dataset = MaskedCBOWDataset(train_windows, word2idx, unk_idx)
+ #train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128)
+
+ # Example usage of the dataset
+
+ #for context, target in MaskedCBOWDataset(train_windows, word2idx, unk_idx):
+ #    print(context, target)
+ #    break  # just show the first pair
+
+ #/eg
+
+
+ ### create model architecture
+
+ # Create DataLoaders
+ train_loader = torch.utils.data.DataLoader(
+     MaskedCBOWDataset(train_windows, word2idx, unk_idx),
+     batch_size=64,
+     #shuffle=True  # shuffle is not supported for an IterableDataset
+ )
+ test_loader = torch.utils.data.DataLoader(
+     MaskedCBOWDataset(test_windows, word2idx, unk_idx),
+     batch_size=64,
+     #shuffle=False
+ )
+ #print(f"Train windows: {len(train_windows)}, Test windows: {len(test_windows)}")
+
+ for i, (context, target) in enumerate(train_loader):
+     print(f"Batch {i}:")
+     print(f"  Context shape: {context.shape}")  # expect [batch_size, 4]
+     print(f"  Target shape: {target.shape}")  # expect [batch_size]
+     print(f"  First row: {context[0].tolist()} → {target[0].item()}")
+     if i == 2: break  # only show a few batches
+
+
+ ### create model
+
+ class word2vec(NN.Module):  ### This creates a class for our specific NN, inheriting from the PyTorch Module base class
+     def __init__(self):
+         super().__init__()  ## super goes up one level to the torch NN module and initializes the net
+         self.emb = NN.Embedding(vocabsize, embed_dim)  # 111 to be different
+         self.out = NN.Linear(embed_dim, vocabsize)  # predict vocab word from averaged context
+
+     def forward(self, x):  # x: [batch, 4]
+         x = self.emb(x)  # → [batch, 4, embed_dim]
+         x = x.mean(dim=1)  # → [batch, embed_dim] ← averaging context vectors
+         x = F.relu(x)  # optional, but can help
+         x = self.out(x)  # → [batch, vocab_size]
+         return x  # raw logits
+
+ loss_function = NN.CrossEntropyLoss()  # using the built-in loss function
+ model = word2vec().to(device)  ## create the model as described above
+ optimizer = optim.Adam(model.parameters(), lr=0.001)  ### lr = learning rate, 0.001 is a common default; Adam is the optimizer chosen, also a fairly standard choice
+
+
+ ##### do training
+
+ num_epochs = 1  ## passes through the dataset
+
+ for epoch in range(num_epochs):
+     for context, target in train_loader:  # note: uses the batches defined earlier
+
+         context = context.to(device)  # move data to the selected device
+         target = target.to(device)  # move data to the selected device
+         optimizer.zero_grad()  ### reset gradients each time
+
+         outputs = model(context)  # forward pass
+         loss = loss_function(outputs, target)
+
+         loss.backward()  ## backprop, courtesy of PyTorch autograd, very convenient
+         optimizer.step()
+
+     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")  # loss of the last batch in this epoch
+
+
+
+ ### output weights
+
+ torch.save(model.state_dict(), "ABembeddingsweights.pth")  # weights only (state_dict)
+ torch.save(model, "ABembeddingsfullmodel.pth")  # full pickled model object
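+
+ # A minimal sketch of how the saved weights could be reloaded and queried later,
+ # assuming word2idx / idx2word are rebuilt (or saved) alongside the .pth file;
+ # 'king' is only a placeholder query word.
+ # reloaded = word2vec().to(device)
+ # reloaded.load_state_dict(torch.load("ABembeddingsweights.pth", map_location=device))
+ # embeddings = reloaded.emb.weight.detach()  # [vocabsize, embed_dim]
+ # query = embeddings[word2idx.get('king', unk_idx)]
+ # sims = F.cosine_similarity(query.unsqueeze(0), embeddings)  # cosine similarity to every vocab word
+ # print([idx2word[i.item()] for i in sims.topk(6).indices])  # the word itself plus 5 nearest neighbours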
+
+
+ ### / training
+
+
+ ### train model
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f3131deb251e881efb6f4d60f5c894b3334eab20d3ce828db674ad96d67b9062
+ size 226442143