edwjin committed on
Commit
03896ba
·
verified ·
1 Parent(s): bebd89b

Rename main.py to load_texts.py

Browse files
Files changed (2) hide show
  1. load_texts.py +17 -0
  2. main.py +0 -341
load_texts.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def load_texts(filename):
    """Yield each line of *filename*, stripped of surrounding whitespace.

    The text is used for "training" the tokenizer. Since our tokenizer is
    simple, no real training happens; the file is simply streamed lazily,
    one line at a time.

    Args:
        filename: Path to a UTF-8 text file, one sample per line.

    Yields:
        str: Each line with leading/trailing whitespace removed.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            yield line.strip()
main.py DELETED
@@ -1,341 +0,0 @@
1
- import argparse
2
- import torch
3
- from torch import nn
4
- from torch.utils.data import DataLoader
5
- from torch.nn.utils.rnn import pad_sequence
6
- from torch.nn import functional as F
7
-
8
- import os
9
-
10
- from utilities import Utilities
11
- from tokenizer import SimpleTokenizer
12
- from dataset import SpeechesClassificationDataset
13
-
14
- from constants import seed, batch_size, block_size, learning_rate, n_embd, n_head, n_layer, n_input, n_output, n_hidden, epochs_CLS
15
-
16
- from transformer import Classifier
17
-
18
# Use the GPU when one is available; data and models are moved here explicitly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

eval_interval = 100  # How often to evaluate train and test perplexity during training
max_iters = 500  # For language modeling, we can process all the batches for the entire dataset, but that takes a while, so we'll limit it to 500 iterations. For batch size of 16 and block size of 32, this is roughly, this is 500 * 16 * 32 = 256000 tokens, SOTA LMs are trained on trillions of tokens, so this is a very small dataset.
eval_iters = 200  # Number of iterations to evaluate perplexity on the test set
24
-
25
def load_texts(filename):
    """Lazily stream *filename* (UTF-8), yielding one whitespace-stripped line
    at a time. Used to feed raw text to the tokenizer."""
    with open(filename, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            yield raw_line.strip()
42
-
43
-
44
def collate_batch(batch):
    """Collate (sequence, label) pairs into a fixed-width padded tensor batch."""
    sequences, labels = zip(*batch)  # split the pairs into parallel tuples
    # Zero-pad the variable-length sequences, then clamp to block_size columns.
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    padded = padded[:, :block_size]
    # If every sequence was shorter than block_size, right-pad up to it.
    shortfall = max(0, block_size - padded.shape[1])
    padded = torch.nn.functional.pad(padded, (0, shortfall), "constant", 0)
    return padded, torch.stack(labels)
54
-
55
-
56
def compute_perplexity(decoderLMmodel: "Decoder", data_loader, eval_iters=100):
    """Compute the perplexity of decoderLMmodel on the data in data_loader.

    Averages the cross-entropy loss over up to ``eval_iters`` batches and
    returns exp(mean loss). The model is put in eval mode for the measurement
    and restored to train mode before returning.

    Args:
        decoderLMmodel: Decoder LM returning (logits, ...) for an input batch.
        data_loader: Iterable of (X, Y) token-id tensor batches.
        eval_iters: Maximum number of batches to evaluate.

    Returns:
        float: The perplexity, exp(mean cross-entropy loss).
    """
    decoderLMmodel.eval()
    losses = []
    # no_grad: evaluation only — avoid building autograd graphs and wasting memory.
    with torch.no_grad():
        for X, Y in data_loader:
            X, Y = X.to(device), Y.to(device)
            logits, _ = decoderLMmodel(X)
            B, T, C = logits.shape
            # Flatten (B, T, C) -> (B*T, C) so cross_entropy sees one row per token.
            loss = F.cross_entropy(logits.view(B * T, C), Y.view(B * T))
            losses.append(loss.item())
            if len(losses) >= eval_iters:
                break

    mean_loss = torch.tensor(losses).mean()
    perplexity = torch.exp(mean_loss).item()  # perplexity = exp(mean loss)

    decoderLMmodel.train()
    return perplexity
82
-
83
def compute_classifier_accuracy(classifier: Classifier, data_loader):
    """Return the classifier's accuracy (in percent) over data_loader.

    Switches the model to eval mode for the measurement and restores train
    mode before returning.
    """
    classifier.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs, _ = classifier(inputs)
            predictions = outputs.data.argmax(dim=1)
            correct += (predictions == targets).sum().item()
            seen += targets.size(0)
    classifier.train()
    return 100 * correct / seen
98
-
99
def train_epoch(data_loader, model, optimizer):
    """Train ``model`` for one epoch over ``data_loader``.

    Args:
        data_loader: Iterable of (X, Y) batches (inputs, integer class labels).
        model: Classifier returning (logits, ...) for an input batch.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        (accuracy, average_train_loss): accuracy in percent over the epoch and
        the mean cross-entropy loss across batches.
    """
    num_batches = len(data_loader)
    model.train()
    train_loss, total_correct, total_samples = 0, 0, 0

    for X, Y in data_loader:
        # Keep the batch on the same device as the model — matches the eval
        # helpers (compute_perplexity / compute_classifier_accuracy).
        X, Y = X.to(device), Y.to(device)

        # Compute prediction and track running accuracy.
        pred, _ = model(X)
        _, predicted = torch.max(pred.data, 1)
        total_correct += (predicted == Y).sum().item()
        total_samples += Y.size(0)

        loss = F.cross_entropy(input=pred, target=Y)
        train_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_train_loss = train_loss / num_batches
    accuracy = 100 * total_correct / total_samples
    return accuracy, average_train_loss
133
-
134
-
135
- # ------------------------------Classifier Code---------------------------------- #
136
def run_classifier():
    """Load the training texts, build the tokenizer, and report dataset stats.

    NOTE: the classifier training pipeline (datasets, loaders, model,
    optimizer, epoch loop, checkpoint save) is currently disabled.
    """
    print("Loading data and creating tokenizer ...")

    texts = list(load_texts('train.tsv'))

    # Build the tokenizer's vocabulary from the whole training corpus at once.
    tokenizer = SimpleTokenizer(' '.join(texts))
    print("Vocabulary size is", tokenizer.vocab_size)

    print(len(texts))
    print(texts[0])
171
-
172
-
173
- # ------------------------------Classifier Code---------------------------------- #
174
-
175
-
176
- # ------------------------------Decoder Code---------------------------------- #
177
def run_decoder():
    """Train a decoder LM for up to ``max_iters`` batches, report train/test
    perplexities, and run an attention sanity check on a fresh decoder."""
    print("Loading data and creating tokenizer ...")
    # NOTE(review): load_texts opens its argument as a single file, but
    # 'speechesdataset' looks like a directory — confirm the intended path.
    texts = load_texts('speechesdataset')
    tokenizer = SimpleTokenizer(' '.join(texts))  # create a tokenizer from the data
    print("Vocabulary size is", tokenizer.vocab_size)

    # Read the whole LM training corpus into memory for the dataset.
    inputfile = "speechesdataset/train_LM.txt"
    with open(inputfile, 'r', encoding='utf-8') as f:
        lmtrainText = f.read()
    train_LM_dataset = LanguageModelingDataset(tokenizer, lmtrainText, block_size)
    train_LM_loader = DataLoader(train_LM_dataset, batch_size=batch_size, shuffle=True)

    decoder = Decoder(tokenizer.vocab_size)

    total_params = sum(p.numel() for p in decoder.parameters())
    print("Total number of parameters:", total_params)

    optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

    # for the language modeling task, you will iterate over the training data for a fixed number of iterations like this:
    for i, (xb, yb) in enumerate(train_LM_loader):
        if i >= max_iters:  # stop after 500 batches
            break
        xb, yb = xb.to(device), yb.to(device)

        # Periodically report training-set perplexity.
        if (i+1)%100 == 0:
            print("Train Data Perplexity At Iteration ", (i+1), compute_perplexity(decoder, train_LM_loader))

        # evaluate the loss: flatten (B, T, C) logits to (B*T, C) for cross-entropy
        logits, _ = decoder(xb)
        B, T, C = logits.shape

        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = F.cross_entropy(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # calculate perplexity on the held-out speaker test sets
    # NOTE(review): the first file ends in .tsv while the others are .txt —
    # verify the filename is correct.
    files = ['speechesdataset/test_LM_hbush.tsv', 'speechesdataset/test_LM_obama.txt', 'speechesdataset/test_LM_wbush.txt']

    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data = f.read()
        test_LM_dataset = LanguageModelingDataset(tokenizer, data, block_size)
        test_LM_loader = DataLoader(test_LM_dataset, batch_size=batch_size, shuffle=True)

        print(file, compute_perplexity(decoder, test_LM_loader))

    print("------------------------SANITY CHECK DECODER------------------------")
    print("Vocabulary size is", tokenizer.vocab_size)

    # Fresh (untrained) decoder used only to visualize attention maps.
    ec = Decoder(tokenizer.vocab_size)
    u = Utilities(tokenizer, ec)

    u.sanity_check('With a simple oath, we affirm old traditions and make new beginnings.', block_size)
237
-
238
- # ------------------------------Decoder Code---------------------------------- #
239
-
240
-
241
def run_sanity_check_encoder():
    """Build a tiny tokenizer from a fixed sentence and sanity-check the
    classifier's attention on that same sentence."""
    print("Loading data and creating tokenizer ...")
    print("------------------------SANITY CHECK ENCODER------------------------")
    sample = 'The man who passes the sentence should swing the sword. If you would take a man\'s life'
    tokenizer = SimpleTokenizer(sample)  # vocabulary comes from the sample itself
    print("Vocabulary size is", tokenizer.vocab_size)

    encoder = Classifier(tokenizer.vocab_size)
    checker = Utilities(tokenizer, encoder)

    checker.sanity_check(sample, block_size)
251
-
252
-
253
def run_sanity_check_decoder():
    """Build the tokenizer from the speeches data and visualize the attention
    maps of an untrained decoder on a fixed sentence."""
    print("Loading data and creating tokenizer ...")
    # NOTE(review): load_texts opens its argument as a single file, but
    # 'speechesdataset' looks like a directory — confirm the intended path.
    texts = load_texts('speechesdataset')
    tokenizer = SimpleTokenizer(' '.join(texts))  # create a tokenizer from the data

    # Untrained decoder: sanity_check only inspects attention shapes/maps.
    ec = Decoder(tokenizer.vocab_size)
    u = Utilities(tokenizer, ec)

    u.sanity_check('With a simple oath, we affirm old traditions and make new beginnings.', block_size)
262
-
263
-
264
def run_ec_decoder():
    """Extra credit: train a DecoderEC language model, report train/test
    perplexities, and generate sample text from an empty context."""
    print("EXTRA CREDIT:")
    # NOTE(review): load_texts opens its argument as a single file, but
    # 'speechesdataset' looks like a directory — confirm the intended path.
    texts = load_texts('speechesdataset')
    tokenizer = SimpleTokenizer(' '.join(texts))  # create a tokenizer from the data
    print("Vocabulary size is", tokenizer.vocab_size)

    # Read the whole LM training corpus into memory for the dataset.
    inputfile = "speechesdataset/train_LM.txt"
    with open(inputfile, 'r', encoding='utf-8') as f:
        lmtrainText = f.read()
    train_LM_dataset = LanguageModelingDataset(tokenizer, lmtrainText, block_size)
    train_LM_loader = DataLoader(train_LM_dataset, batch_size=batch_size, shuffle=True)

    # Extra-credit decoder variant; the 4 and 6 are positional constructor
    # args — presumably heads/layers, TODO confirm against transformer.py.
    decoder = DecoderEC(tokenizer.vocab_size, 4, 6)

    total_params = sum(p.numel() for p in decoder.parameters())
    print("Total number of parameters:", total_params)

    optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

    # for the language modeling task, you will iterate over the training data for a fixed number of iterations like this:
    for i, (xb, yb) in enumerate(train_LM_loader):
        if i >= max_iters:  # stop after 500 batches
            break
        xb, yb = xb.to(device), yb.to(device)

        # Periodically report training-set perplexity.
        if (i+1)%100 == 0:
            print("Train Data Perplexity At Iteration ", (i+1), compute_perplexity(decoder, train_LM_loader))

        # evaluate the loss: flatten (B, T, C) logits to (B*T, C) for cross-entropy
        logits, _ = decoder(xb)
        B, T, C = logits.shape

        logits = logits.view(B*T, C)
        targets = yb.view(B*T)
        loss = F.cross_entropy(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # calculate perplexity on the held-out speaker test sets
    # NOTE(review): the first file ends in .tsv while the others are .txt —
    # verify the filename is correct.
    files = ['speechesdataset/test_LM_hbush.tsv', 'speechesdataset/test_LM_obama.txt', 'speechesdataset/test_LM_wbush.txt']

    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            data = f.read()
        test_LM_dataset = LanguageModelingDataset(tokenizer, data, block_size)
        test_LM_loader = DataLoader(test_LM_dataset, batch_size=batch_size, shuffle=True)

        print(file, compute_perplexity(decoder, test_LM_loader))

    # generate some text! Start from a single zero token as the context.
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    print(tokenizer.decode(decoder.generate(context, max_new_tokens=500)[0].tolist()))
320
-
321
- # ------------------------------MAIN---------------------------------- #
322
def main():
    """Parse the -mode flag and dispatch to the selected routine."""
    parser = argparse.ArgumentParser(description="Run classifier or decoder")
    parser.add_argument("-mode", choices=["c", "d", "sc", "sd", "ecd"], help="Choose mode: 'c' for classifier, 'd' for decoder, 'sc' for sanity checking classifier, 'sd' for sanity checking decoder, 'ecd' for decoder EC")
    args = parser.parse_args()

    # Dispatch table keeps the mode -> handler mapping in one place.
    handlers = {
        "c": run_classifier,
        "d": run_decoder,
        "sc": run_sanity_check_encoder,
        "sd": run_sanity_check_decoder,
        "ecd": run_ec_decoder,
    }
    handler = handlers.get(args.mode)
    if handler is None:
        # argparse already rejects unknown choices; this covers a missing -mode.
        print("Invalid mode.")
    else:
        handler()

if __name__ == "__main__":
    main()