import csv
import torch
from torchtext.vocab import build_vocab_from_iterator


class TextPreProcessor:
    def __init__(self, input_file):
        self.input_file = input_file
        self.context_size = 1

    def build_training_data(self):
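        """Build (context, target) training pairs from each CSV row.

        For a hypothetical row ["the", "rolling", "stones"] this returns
        [(["the", "stones"], "rolling"), (["stones", "the"], "rolling")]:
        each target word is paired with its one-word left/right context in
        both orderings. (Note: the CBOW model below sums the context
        embeddings, so the two orderings produce identical model inputs.)
        """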
        data = []
        for row in self._generate_rows():
            for i in range(self.context_size, len(row) - self.context_size):
                before = row[i - 1].lower()
                target = row[i].lower()
                after = row[i + 1].lower()

                context_one = [before, after]
                context_two = [after, before]
                data.append((context_one, target))
                data.append((context_two, target))

        return data

    def build_vocab(self):
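        """Build a torchtext Vocab over every token in the input file."""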
        rows_of_artists = self._generate_rows()
        our_vocab = build_vocab_from_iterator(
            rows_of_artists, specials=["<unk>"], min_freq=1
        )

        return our_vocab

    def _generate_rows(self):
        with open(self.input_file, encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                # lowercase here so the vocab agrees with the lowercased
                # (context, target) pairs built above
                yield [token.lower() for token in row]


class CBOW(torch.nn.Module):
    def __init__(self, vocab):
        super().__init__()
        self.context_size = 1  # 1 word to the left, 1 to the right
        self.embedding_dim = 100  # embedding vector size
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.vocab.get_stoi().keys())
        self.vocab_size = len(self.vocab)

        # Embedding table of shape vocab_size x embedding_dim; summing the
        # context lookups yields a 1 x embedding_dim input for linear1
        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()

        # out: 1 x vocab_size
        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
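        # inputs: 1-D tensor of context word indices; the lookup returns
        # (len(inputs), embedding_dim), which we sum into a single context
        # vector of shape (1, embedding_dim)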
        embeds = sum(self.embeddings(inputs)).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_embedding(self, word):
        word = torch.tensor([self.word_to_ix[word]])
        # Embeddings lookup of a single word,
        # once the Embeddings layer has been optimized
        return self.embeddings(word).view(1, -1)


def make_context_vector(context, word_to_ix):
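    """Map the context words to a 1-D LongTensor of vocabulary indices."""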
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"
    text = TextPreProcessor(artist_names)
    training_data = text.build_training_data()
    vocab = text.build_vocab()
    cbow = CBOW(vocab)

    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=cbow.learning_rate)

    # 50 epochs is an arbitrary starting point; there is no single correct number
    for epoch in range(50):
        # accumulate the loss over the epoch to see whether training improves
        total_loss = 0

        # iterate over the (context, target) pairs in the training data
        for context, target in training_data:
            context_vector = make_context_vector(context, cbow.word_to_ix)

            # forward pass: log-probabilities over the whole vocabulary
            log_probs = cbow(context_vector)

            # NLL loss between the predicted log-probabilities and the
            # index of the actual target word
            total_loss += loss_function(
                log_probs, torch.tensor([cbow.word_to_ix[target]])
            )

        # backpropagate the loss accumulated over the whole epoch and take a
        # single optimizer step (simple, but memory-hungry for large datasets)
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Log out some metrics to see if loss decreases
        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))
    torch.save(cbow.state_dict(), model_path)
    print("saved model!")