Adam Beedell committed on
Commit
b4ee00a
·
1 Parent(s): a1093cf

please oh god work this time

Files changed (2)
  1. ABModel.py +177 -0
  2. pytorch_model.bin +3 -0
ABModel.py ADDED
@@ -0,0 +1,177 @@
+ ## Adam's model.py
+
+ ### add dependencies
+ import os
+ import bz2
+ import csv
+ import torch
+ print(torch.__version__)
+ print(torch.cuda.is_available())  ## looking for True
+ import torch.nn as NN
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from collections import deque
+ import itertools
+
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+ elif torch.backends.mps.is_available():
+     device = torch.device("mps")
+ else:
+     device = torch.device("cpu")
+
+
+ ### hyperparameters
+ windowsize = 2  # words either side of the target word
+ windowsize = windowsize * 2 + 1  # full window: context on both sides plus the target word itself
+ split_ratio = 0.8  # 80% for training, 20% for testing
+ embed_dim = 111
+
+
+ ### Goal - Import text8
+
+ text8 = bz2.open('wikipedia_data.txt.bz2', 'rt').read()  # Read the text8 dataset from a bz2-compressed file #### Not actually .bz2 at the moment, but this is how it will be in the future
+ text8 = text8.split()  # Split the text into words
+ text8.append('<unk>')  # Add an unknown token to the vocabulary
+
+ #>>> len(text8)
+ #17005207
+ #>>> len(set(text8))
+ #253854
+ #
+ # print(f"Number of words in text8: {len(text8)}")  # Uncomment to see the number of words in the dataset
+ # print(f"First 10 words in text8: {text8[:10]}")  # Uncomment to see the first 10 words in the dataset
+ # print(f"Distinct words in text8: {len(set(text8))}")  # Uncomment to see the number of distinct words in the dataset
+
+
+ ### tokenize text8
+
+ vocablist = set(text8)  ## dedupe the tokens to get the vocabulary
+ vocabsize = len(vocablist)  # Number of unique words in the vocabulary
+ word2idx = {w: i for i, w in enumerate(sorted(vocablist))}  ## i is the index, w is the word
+
+ unk_idx = word2idx['<unk>']  # Index for the unknown token
+ idx2word = {i: w for w, i in word2idx.items()}
+
+ windows = list(zip(*[iter(text8)]*windowsize))  # Group words into consecutive, non-overlapping windows of windowsize words
+
+ #3401041 windows for the full text8 corpus
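+ # Toy illustration (names here are only for this comment, not used elsewhere) of what
+ # the zip(*[iter(...)]*windowsize) trick produces: it chops the token list into
+ # consecutive, non-overlapping 5-word windows and silently drops any leftover tail.
+ #   toy = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'again', 'now']
+ #   list(zip(*[iter(toy)]*5))
+ #   # -> [('the', 'quick', 'brown', 'fox', 'jumps'), ('over', 'the', 'lazy', 'dog', 'again')]  # 'now' is dropped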
+
+ split = int(len(windows) * split_ratio)  # Split the dataset into training and testing sets
+ train_windows = windows[:split]
+ test_windows = windows[split:]
+
+
+ #train_dataset = text8[:int(len(text8) * split_ratio)]  # 80% for training
+ #test_dataset = text8[int(len(text8) * split_ratio):]  # 20% for testing
+
+ #def train_generator(windows, word2idx, unk_idx):
+ #    """Generator function to yield context and target pairs for training."""
+ #    for w1, w2, w3, w4, w5 in windows:
+ #        ctx = [word2idx.get(w, unk_idx) for w in (w1, w2, w4, w5)]
+ #        tgt = word2idx.get(w3, unk_idx)
+ #        yield torch.tensor(ctx), tgt
+
+
+ #traintensors = train_generator(train_windows, word2idx, unk_idx)
+ #testtensors = train_generator(test_windows, word2idx, unk_idx)
+
+
+
+ class MaskedCBOWDataset(torch.utils.data.IterableDataset):
+     def __init__(self, windows, word2idx, unk_idx):
+         self.windows = windows
+         self.word2idx = word2idx
+         self.unk_idx = unk_idx
+
+     def __iter__(self):
+         for w1, w2, w3, w4, w5 in self.windows:
+             ctx = [self.word2idx.get(w, self.unk_idx) for w in (w1, w2, w4, w5)]
+             tgt = self.word2idx.get(w3, self.unk_idx)
+             yield torch.tensor(ctx), tgt
+
+
+ #train_dataset = MaskedCBOWDataset(train_windows, word2idx, unk_idx)
+ #train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128)
+
+ # Example usage of the dataset
+
+ #for context, target in MaskedCBOWDataset(train_windows, word2idx, unk_idx):
+ #    print(context, target)
+ #    break  # just show the first pair
+
+ #/eg
+
+
+ ### create model architecture
+
+ # Create DataLoaders
+ train_loader = torch.utils.data.DataLoader(
+     MaskedCBOWDataset(train_windows, word2idx, unk_idx),
+     batch_size=64,
+     #shuffle=True  # shuffle is not supported for an IterableDataset
+ )
+ test_loader = torch.utils.data.DataLoader(
+     MaskedCBOWDataset(test_windows, word2idx, unk_idx),
+     batch_size=64,
+     #shuffle=False
+ )
+ #print(f"Train windows: {len(train_windows)}, Test windows: {len(test_windows)}")
+
+ for i, (context, target) in enumerate(train_loader):
+     print(f"Batch {i}:")
+     print(f"  Context shape: {context.shape}")  # expect [batch_size, 4]
+     print(f"  Target shape: {target.shape}")  # expect [batch_size]
+     print(f"  First row: {context[0].tolist()} → {target[0].item()}")
+     if i == 2: break  # only show a few batches
+
+
+ ### create model
+
+ class word2vec(NN.Module):  ### This creates a class for our specific NN, inheriting from the PyTorch Module base class
+     def __init__(self):
+         super().__init__()  ## super goes up one level to the torch NN module and initializes the net
+         self.emb = NN.Embedding(vocabsize, embed_dim)  # 111 to be different
+         self.out = NN.Linear(embed_dim, vocabsize)  # predict vocab word from averaged context
+
+     def forward(self, x):  # x: [batch, 4]
+         x = self.emb(x)  # → [batch, 4, embed_dim]
+         x = x.mean(dim=1)  # → [batch, embed_dim] ← averaging context vectors
+         x = F.relu(x)  # optional, but can help
+         x = self.out(x)  # → [batch, vocab_size]
+         return x  # raw logits
+
+ loss_function = NN.CrossEntropyLoss()  # using the built-in loss function
+ model = word2vec().to(device)  ## create the model as described above
+ optimizer = optim.Adam(model.parameters(), lr=0.001)  ### lr = learning rate, 0.001 is a common default; Adam is the optimizer chosen, also a fairly standard choice
+
+
+ ##### do training
+
+ num_epochs = 1  ## passes through the dataset
+
+ for epoch in range(num_epochs):
+     for context, target in train_loader:  # note: uses the batches defined earlier
+
+         context = context.to(device)  # move data to the selected device
+         target = target.to(device)  # move data to the selected device
+         optimizer.zero_grad()  ### reset gradients each time
+
+         outputs = model(context)  # forward pass
+         loss = loss_function(outputs, target)
+
+         loss.backward()  ## backprop, courtesy of PyTorch autograd, very convenient
+         optimizer.step()
+
+     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")  # loss of the last batch in this epoch
+
+
+
+ ### output weights
+
+ torch.save(model.state_dict(), "ABembeddingsweights.pth")  # weights only (state_dict)
+ torch.save(model, "ABembeddingsfullmodel.pth")  # full pickled model object
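+
+ # A minimal sketch of how the saved weights could be reloaded and queried later,
+ # assuming word2idx / idx2word are rebuilt (or saved) alongside the .pth file;
+ # 'king' is only a placeholder query word.
+ # reloaded = word2vec().to(device)
+ # reloaded.load_state_dict(torch.load("ABembeddingsweights.pth", map_location=device))
+ # embeddings = reloaded.emb.weight.detach()  # [vocabsize, embed_dim]
+ # query = embeddings[word2idx.get('king', unk_idx)]
+ # sims = F.cosine_similarity(query.unsqueeze(0), embeddings)  # cosine similarity to every vocab word
+ # print([idx2word[i.item()] for i in sims.topk(6).indices])  # the word itself plus 5 nearest neighbours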
+
+
+ ### / training
+
+
+ ### train model
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f3131deb251e881efb6f4d60f5c894b3334eab20d3ce828db674ad96d67b9062
+ size 226442143