edwjin committed on
Commit
03896ba
·
verified ·
1 Parent(s): bebd89b

Rename main.py to load_texts.py

Browse files
Files changed (2) hide show
  1. load_texts.py +17 -0
  2. main.py +0 -341
load_texts.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def load_texts(filename):
    """Yield each line of *filename*, stripped of surrounding whitespace.

    The text is used for "training" the tokenizer. Since our tokenizer is
    simple, no real training happens; the file is simply streamed lazily,
    one line at a time.

    Args:
        filename: Path to a UTF-8 text file, one sample per line.

    Yields:
        str: Each line with leading/trailing whitespace removed.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            yield line.strip()
main.py DELETED
@@ -1,341 +0,0 @@
1
- import argparse
2
- import torch
3
- from torch import nn
4
- from torch.utils.data import DataLoader
5
- from torch.nn.utils.rnn import pad_sequence
6
- from torch.nn import functional as F
7
-
8
- import os
9
-
10
- from utilities import Utilities
11
- from tokenizer import SimpleTokenizer
12
- from dataset import SpeechesClassificationDataset
13
-
14
- from constants import seed, batch_size, block_size, learning_rate, n_embd, n_head, n_layer, n_input, n_output, n_hidden, epochs_CLS
15
-
16
- from transformer import Classifier
17
-
18
# Use the GPU when one is available; data and models are moved here explicitly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

eval_interval = 100  # How often to evaluate train and test perplexity during training
max_iters = 500  # For language modeling, we can process all the batches for the entire dataset, but that takes a while, so we'll limit it to 500 iterations. For batch size of 16 and block size of 32, this is roughly, this is 500 * 16 * 32 = 256000 tokens, SOTA LMs are trained on trillions of tokens, so this is a very small dataset.
eval_iters = 200  # Number of iterations to evaluate perplexity on the test set
24
-
25
def load_texts(filename):
    """Lazily stream *filename* (UTF-8), yielding one whitespace-stripped line
    at a time. Used to feed raw text to the tokenizer."""
    with open(filename, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            yield raw_line.strip()
42
-
43
-
44
def collate_batch(batch):
    """Collate (sequence, label) pairs into a fixed-width padded tensor batch."""
    sequences, labels = zip(*batch)  # split the pairs into parallel tuples
    # Zero-pad the variable-length sequences, then clamp to block_size columns.
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    padded = padded[:, :block_size]
    # If every sequence was shorter than block_size, right-pad up to it.
    shortfall = max(0, block_size - padded.shape[1])
    padded = torch.nn.functional.pad(padded, (0, shortfall), "constant", 0)
    return padded, torch.stack(labels)
54
-
55
-
56
def compute_perplexity(decoderLMmodel: "Decoder", data_loader, eval_iters=100):
    """Compute the perplexity of decoderLMmodel on the data in data_loader.

    Averages the cross-entropy loss over up to ``eval_iters`` batches and
    returns exp(mean loss). The model is put in eval mode for the measurement
    and restored to train mode before returning.

    Args:
        decoderLMmodel: Decoder LM returning (logits, ...) for an input batch.
        data_loader: Iterable of (X, Y) token-id tensor batches.
        eval_iters: Maximum number of batches to evaluate.

    Returns:
        float: The perplexity, exp(mean cross-entropy loss).
    """
    decoderLMmodel.eval()
    losses = []
    # no_grad: evaluation only — avoid building autograd graphs and wasting memory.
    with torch.no_grad():
        for X, Y in data_loader:
            X, Y = X.to(device), Y.to(device)
            logits, _ = decoderLMmodel(X)
            B, T, C = logits.shape
            # Flatten (B, T, C) -> (B*T, C) so cross_entropy sees one row per token.
            loss = F.cross_entropy(logits.view(B * T, C), Y.view(B * T))
            losses.append(loss.item())
            if len(losses) >= eval_iters:
                break

    mean_loss = torch.tensor(losses).mean()
    perplexity = torch.exp(mean_loss).item()  # perplexity = exp(mean loss)

    decoderLMmodel.train()
    return perplexity
82
-
83
def compute_classifier_accuracy(classifier: Classifier, data_loader):
    """Return the classifier's accuracy (in percent) over data_loader.

    Switches the model to eval mode for the measurement and restores train
    mode before returning.
    """
    classifier.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs, _ = classifier(inputs)
            predictions = outputs.data.argmax(dim=1)
            correct += (predictions == targets).sum().item()
            seen += targets.size(0)
    classifier.train()
    return 100 * correct / seen
98
-
99
def train_epoch(data_loader, model, optimizer):
    """Train ``model`` for one epoch over ``data_loader``.

    Args:
        data_loader: Iterable of (X, Y) batches (inputs, integer class labels).
        model: Classifier returning (logits, ...) for an input batch.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        (accuracy, average_train_loss): accuracy in percent over the epoch and
        the mean cross-entropy loss across batches.
    """
    num_batches = len(data_loader)
    model.train()
    train_loss, total_correct, total_samples = 0, 0, 0

    for X, Y in data_loader:
        # Keep the batch on the same device as the model — matches the eval
        # helpers (compute_perplexity / compute_classifier_accuracy).
        X, Y = X.to(device), Y.to(device)

        # Compute prediction and track running accuracy.
        pred, _ = model(X)
        _, predicted = torch.max(pred.data, 1)
        total_correct += (predicted == Y).sum().item()
        total_samples += Y.size(0)

        loss = F.cross_entropy(input=pred, target=Y)
        train_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_train_loss = train_loss / num_batches
    accuracy = 100 * total_correct / total_samples
    return accuracy, average_train_loss
133
-
134
-
135
- # ------------------------------Classifier Code---------------------------------- #
136
def run_classifier():
    """Load the training texts, build the tokenizer, and report dataset stats.

    NOTE: the classifier training pipeline (datasets, loaders, model,
    optimizer, epoch loop, checkpoint save) is currently disabled.
    """
    print("Loading data and creating tokenizer ...")

    texts = list(load_texts('train.tsv'))

    # Build the tokenizer's vocabulary from the whole training corpus at once.
    tokenizer = SimpleTokenizer(' '.join(texts))
    print("Vocabulary size is", tokenizer.vocab_size)

    print(len(texts))
    print(texts[0])
171
-
172
-
173
- # ------------------------------Classifier Code---------------------------------- #
174
-
175
-
176
- # ------------------------------Decoder Code---------------------------------- #
177
def run_decoder():
    """Train a decoder LM for up to ``max_iters`` batches, report train/test
    perplexities, and run an attention sanity check on a fresh decoder."""
    print("Loading data and creating tokenizer ...")
    # NOTE(review): load_texts opens its argument as a single file, but
    # 'speechesdataset' looks like a directory — confirm the intended path.
    texts = load_texts('speechesdataset')
    tokenizer = SimpleTokenizer(' '.join(texts))  # create a tokenizer from the data
    print("Vocabulary size is", tokenizer.vocab_size)

    # Read the whole LM training corpus into memory for the dataset.
    inputfile = "speechesdataset/train_LM.txt"
    with open(inputfile, 'r', encoding='utf-8') as f:
        lmtrainText = f.read()
    train_LM_dataset = LanguageModelingDataset(tokenizer, lmtrainText, block_size)
    train_LM_loader = DataLoader(train_LM_dataset, batch_size=batch_size, shuffle=True)

    decoder = Decoder(tokenizer.vocab_size)

    total_params = sum(p.numel() for p in decoder.parameters())
    print("Total number of parameters:", total_params)

    optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

    # for the language modeling task, you will iterate over the training data for a fixed number of iterations like this:
    for i, (xb, yb) in enumerate(train_LM_loader):
        if i >= max_iters:  # stop after 500 batches
            break
        xb, yb = xb.to(device), yb.to(device)

        # Periodically report training-set perplexity.
        if (i+1)%100 == 0:
            print("Train Data Perplexity At Iteration ", (i+1), compute_perplexity(decoder, train_LM_loader))

        # evaluate the loss: flatten (B, T, C) logits to (B*T, C) for cross-entropy
        logits, _ = decoder(xb)
        B, T, C = logits.shape

        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = F.cross_entropy(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # calculate perplexity on the held-out speaker test sets
    # NOTE(review): the first file ends in .tsv while the others are .txt —
    # verify the filename is correct.
    files = ['speechesdataset/test_LM_hbush.tsv', 'speechesdataset/test_LM_obama.txt', 'speechesdataset/test_LM_wbush.txt']

    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data = f.read()
        test_LM_dataset = LanguageModelingDataset(tokenizer, data, block_size)
        test_LM_loader = DataLoader(test_LM_dataset, batch_size=batch_size, shuffle=True)

        print(file, compute_perplexity(decoder, test_LM_loader))

    print("------------------------SANITY CHECK DECODER------------------------")
    print("Vocabulary size is", tokenizer.vocab_size)

    # Fresh (untrained) decoder used only to visualize attention maps.
    ec = Decoder(tokenizer.vocab_size)
    u = Utilities(tokenizer, ec)

    u.sanity_check('With a simple oath, we affirm old traditions and make new beginnings.', block_size)
237
-
238
- # ------------------------------Decoder Code---------------------------------- #
239
-
240
-
241
def run_sanity_check_encoder():
    """Build a tiny tokenizer from a fixed sentence and sanity-check the
    classifier's attention on that same sentence."""
    print("Loading data and creating tokenizer ...")
    print("------------------------SANITY CHECK ENCODER------------------------")
    sample = 'The man who passes the sentence should swing the sword. If you would take a man\'s life'
    tokenizer = SimpleTokenizer(sample)  # vocabulary comes from the sample itself
    print("Vocabulary size is", tokenizer.vocab_size)

    encoder = Classifier(tokenizer.vocab_size)
    checker = Utilities(tokenizer, encoder)

    checker.sanity_check(sample, block_size)
251
-
252
-
253
def run_sanity_check_decoder():
    """Build the tokenizer from the speeches data and visualize the attention
    maps of an untrained decoder on a fixed sentence."""
    print("Loading data and creating tokenizer ...")
    # NOTE(review): load_texts opens its argument as a single file, but
    # 'speechesdataset' looks like a directory — confirm the intended path.
    texts = load_texts('speechesdataset')
    tokenizer = SimpleTokenizer(' '.join(texts))  # create a tokenizer from the data

    # Untrained decoder: sanity_check only inspects attention shapes/maps.
    ec = Decoder(tokenizer.vocab_size)
    u = Utilities(tokenizer, ec)

    u.sanity_check('With a simple oath, we affirm old traditions and make new beginnings.', block_size)
262
-
263
-
264
def run_ec_decoder():
    """Extra credit: train a DecoderEC language model, report train/test
    perplexities, and generate sample text from an empty context."""
    print("EXTRA CREDIT:")
    # NOTE(review): load_texts opens its argument as a single file, but
    # 'speechesdataset' looks like a directory — confirm the intended path.
    texts = load_texts('speechesdataset')
    tokenizer = SimpleTokenizer(' '.join(texts))  # create a tokenizer from the data
    print("Vocabulary size is", tokenizer.vocab_size)

    # Read the whole LM training corpus into memory for the dataset.
    inputfile = "speechesdataset/train_LM.txt"
    with open(inputfile, 'r', encoding='utf-8') as f:
        lmtrainText = f.read()
    train_LM_dataset = LanguageModelingDataset(tokenizer, lmtrainText, block_size)
    train_LM_loader = DataLoader(train_LM_dataset, batch_size=batch_size, shuffle=True)

    # Extra-credit decoder variant; the 4 and 6 are positional constructor
    # args — presumably heads/layers, TODO confirm against transformer.py.
    decoder = DecoderEC(tokenizer.vocab_size, 4, 6)

    total_params = sum(p.numel() for p in decoder.parameters())
    print("Total number of parameters:", total_params)

    optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

    # for the language modeling task, you will iterate over the training data for a fixed number of iterations like this:
    for i, (xb, yb) in enumerate(train_LM_loader):
        if i >= max_iters:  # stop after 500 batches
            break
        xb, yb = xb.to(device), yb.to(device)

        # Periodically report training-set perplexity.
        if (i+1)%100 == 0:
            print("Train Data Perplexity At Iteration ", (i+1), compute_perplexity(decoder, train_LM_loader))

        # evaluate the loss: flatten (B, T, C) logits to (B*T, C) for cross-entropy
        logits, _ = decoder(xb)
        B, T, C = logits.shape

        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = F.cross_entropy(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # calculate perplexity on the held-out speaker test sets
    # NOTE(review): the first file ends in .tsv while the others are .txt —
    # verify the filename is correct.
    files = ['speechesdataset/test_LM_hbush.tsv', 'speechesdataset/test_LM_obama.txt', 'speechesdataset/test_LM_wbush.txt']

    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data = f.read()
        test_LM_dataset = LanguageModelingDataset(tokenizer, data, block_size)
        test_LM_loader = DataLoader(test_LM_dataset, batch_size=batch_size, shuffle=True)

        print(file, compute_perplexity(decoder, test_LM_loader))

    # generate some text! Start from a single zero token as the context.
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    print(tokenizer.decode(decoder.generate(context, max_new_tokens=500)[0].tolist()))
320
-
321
- # ------------------------------MAIN---------------------------------- #
322
def main():
    """Parse the -mode flag and dispatch to the selected routine."""
    parser = argparse.ArgumentParser(description="Run classifier or decoder")
    parser.add_argument("-mode", choices=["c", "d", "sc", "sd", "ecd"], help="Choose mode: 'c' for classifier, 'd' for decoder, 'sc' for sanity checking classifier, 'sd' for sanity checking decoder, 'ecd' for decoder EC")
    args = parser.parse_args()

    # Dispatch table keeps the mode -> handler mapping in one place.
    handlers = {
        "c": run_classifier,
        "d": run_decoder,
        "sc": run_sanity_check_encoder,
        "sd": run_sanity_check_decoder,
        "ecd": run_ec_decoder,
    }
    handler = handlers.get(args.mode)
    if handler is None:
        # argparse already rejects unknown choices; this covers a missing -mode.
        print("Invalid mode.")
    else:
        handler()

if __name__ == "__main__":
    main()