import csv
import torch
from torchtext.vocab import build_vocab_from_iterator


class TextPreProcessor:
    def __init__(self, input_file):
        self.input_file = input_file
        self.context_size = 1

    def build_training_data(self):
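        """Build (context, target) training pairs from each CSV row.

        For a hypothetical row ["the", "rolling", "stones"] this returns
        [(["the", "stones"], "rolling"), (["stones", "the"], "rolling")]:
        each target word is paired with its one-word left/right context in
        both orderings. (Note: the CBOW model below sums the context
        embeddings, so the two orderings produce identical model inputs.)
        """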
        data = []
        for row in self._generate_rows():
            for i in range(self.context_size, len(row) - self.context_size):
                before = row[i - 1].lower()
                target = row[i].lower()
                after = row[i + 1].lower()

                context_one = [before, after]
                context_two = [after, before]
                data.append((context_one, target))
                data.append((context_two, target))

        return data

    def build_vocab(self):
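        """Build a torchtext Vocab over every token in the input file."""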
        rows_of_artists = self._generate_rows()
        our_vocab = build_vocab_from_iterator(
            rows_of_artists, specials=["<unk>"], min_freq=1
        )

        return our_vocab

    def _generate_rows(self):
        with open(self.input_file, encoding="utf-8") as f:
            reader = csv.reader(f)
            for row in reader:
                # lowercase here so the vocab agrees with the lowercased
                # (context, target) pairs built above
                yield [token.lower() for token in row]


class CBOW(torch.nn.Module):
    def __init__(self, vocab):
        super().__init__()
        self.context_size = 1  # 1 word to the left, 1 to the right
        self.embedding_dim = 100  # embedding vector size
        self.learning_rate = 0.001
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.vocab.get_stoi().keys())
        self.vocab_size = len(self.vocab)

        # Embedding table of shape vocab_size x embedding_dim; summing the
        # context lookups yields a 1 x embedding_dim input for linear1
        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()

        # out: 1 x vocab_size
        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
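        # inputs: 1-D tensor of context word indices; the lookup returns
        # (len(inputs), embedding_dim), which we sum into a single context
        # vector of shape (1, embedding_dim)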
        embeds = sum(self.embeddings(inputs)).view(1, -1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_embedding(self, word):
        word = torch.tensor([self.word_to_ix[word]])
        # Embeddings lookup of a single word,
        # once the Embeddings layer has been optimized
        return self.embeddings(word).view(1, -1)


def make_context_vector(context, word_to_ix):
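    """Map the context words to a 1-D LongTensor of vocabulary indices."""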
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


if __name__ == "__main__":
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"
    text = TextPreProcessor(artist_names)
    training_data = text.build_training_data()
    vocab = text.build_vocab()
    cbow = CBOW(vocab)

    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=cbow.learning_rate)

    # 50 epochs is an arbitrary starting point; there is no single correct number
    for epoch in range(50):
        # accumulate the loss over the epoch to see whether training improves
        total_loss = 0

        # iterate over the (context, target) pairs in the training data
        for context, target in training_data:
            context_vector = make_context_vector(context, cbow.word_to_ix)

            # forward pass: log-probabilities over the whole vocabulary
            log_probs = cbow(context_vector)

            # NLL loss between the predicted log-probabilities and the
            # index of the actual target word
            total_loss += loss_function(
                log_probs, torch.tensor([cbow.word_to_ix[target]])
            )

        # backpropagate the loss accumulated over the whole epoch and take a
        # single optimizer step (simple, but memory-hungry for large datasets)
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Log out some metrics to see if loss decreases
        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))
    torch.save(cbow.state_dict(), model_path)
    print("saved model!")