| | import csv |
| | import torch |
| | from torchtext.vocab import build_vocab_from_iterator |
| |
|
| |
|
class TextPreProcessor:
    """Turn a CSV file of tokens into CBOW training pairs and a vocabulary.

    Each CSV row is treated as one ordered sequence of tokens. Tokens are
    lower-cased both when the training pairs are built and when the
    vocabulary is built, so every word that appears in the training data can
    be looked up in the vocabulary.
    """

    def __init__(self, input_file):
        """Store the CSV path and the default window of one token per side."""
        self.input_file = input_file
        self.context_size = 1

    def build_training_data(self):
        """Build ``(context, target)`` pairs from every row of the input file.

        For each position that has ``context_size`` tokens on both sides, two
        samples are produced for the same target: one whose context lists the
        preceding tokens first and one whose context lists the following
        tokens first. Rows shorter than ``2 * context_size + 1`` tokens
        contribute nothing.

        Returns:
            list[tuple[list[str], str]]: lower-cased contexts and targets.
        """
        size = self.context_size
        data = []
        for row in self._generate_rows():
            tokens = [token.lower() for token in row]
            for i in range(size, len(tokens) - size):
                target = tokens[i]
                # Take the whole window on each side instead of the fixed
                # i - 1 / i + 1 neighbours, so context_size is honoured.
                before = tokens[i - size:i]
                after = tokens[i + 1:i + 1 + size]
                data.append((before + after, target))
                data.append((after + before, target))
        return data

    def build_vocab(self):
        """Build the torchtext vocabulary over the lower-cased rows.

        Tokens are lower-cased here because ``build_training_data``
        lower-cases them as well; without this the training words could be
        missing from the vocabulary. ``<unk>`` is added as a special token and
        ``min_freq=1`` keeps every token that occurs at least once.
        """
        lowered_rows = (
            [token.lower() for token in row] for row in self._generate_rows()
        )
        return build_vocab_from_iterator(
            lowered_rows, specials=["<unk>"], min_freq=1
        )

    def _generate_rows(self):
        """Yield each CSV row of ``input_file`` as a list of strings."""
        # newline="" lets the csv module handle line endings itself, which
        # is what its documentation asks for when passing a file object.
        with open(self.input_file, encoding="utf-8", newline="") as f:
            yield from csv.reader(f)
| |
|
| |
|
class CBOW(torch.nn.Module):
    """Continuous bag-of-words network that predicts a word from its context.

    Layers, in order: an embedding table, a sum over the context words, a
    linear layer to 128 units with ReLU, a linear layer to ``vocab_size``
    units and a log-softmax, so the output pairs with ``torch.nn.NLLLoss``.

    Args:
        vocab: vocabulary object providing ``get_stoi()``, ``get_itos()`` and
            ``len()`` (for example a torchtext ``Vocab``).
    """

    def __init__(self, vocab):
        super().__init__()
        # Hyper-parameters kept as attributes for callers; this class itself
        # only reads embedding_dim.
        self.num_epochs = 3
        self.context_size = 1
        self.embedding_dim = 100
        self.learning_rate = 0.001
        # Preferred device; nothing in this class moves the module onto it.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.vocab = vocab
        self.word_to_ix = self.vocab.get_stoi()
        self.ix_to_word = self.vocab.get_itos()
        self.vocab_list = list(self.word_to_ix)
        self.vocab_size = len(self.vocab)

        # Never assigned elsewhere in this class; kept for compatibility.
        self.model = None

        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear1 = torch.nn.Linear(self.embedding_dim, 128)
        self.activation_function1 = torch.nn.ReLU()

        self.linear2 = torch.nn.Linear(128, self.vocab_size)
        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: long tensor of word indices, either ``(context,)`` for a
                single sample or ``(batch, context)`` for several samples.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single sample, or
            ``(batch, vocab_size)`` for batched input.
        """
        # Sum along the context axis (second to last of the embedded tensor)
        # so the same code serves single and batched input.
        embeds = self.embeddings(inputs).sum(dim=-2)
        if embeds.dim() == 1:
            # Single sample: add the leading batch axis.
            embeds = embeds.unsqueeze(0)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        return self.activation_function2(out)

    def get_word_emdedding(self, word):
        """Return the embedding of ``word`` as a ``(1, embedding_dim)`` tensor.

        The misspelled method name is kept because callers may rely on it.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        # Build the index on the embedding table's own device so the lookup
        # still works after the module has been moved off the CPU.
        index = torch.tensor(
            [self.word_to_ix[word]],
            dtype=torch.long,
            device=self.embeddings.weight.device,
        )
        return self.embeddings(index).view(1, -1)

    def get_word_embedding(self, word):
        """Correctly spelled alias of ``get_word_emdedding``."""
        return self.get_word_emdedding(word)
| |
|
| |
|
def make_context_vector(context, word_to_ix):
    """Map the words in ``context`` to a 1-D long tensor of their indices.

    Args:
        context: iterable of words.
        word_to_ix: mapping from word to integer index.

    Returns:
        ``torch.long`` tensor holding one index per word, in order.

    Raises:
        KeyError: if a word is missing from ``word_to_ix``.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)
| |
|
| |
|
def main():
    """Train a CBOW model on the artist-name CSV and save its weights.

    Every epoch performs one SGD step on the gradient of the summed loss over
    all training samples, then prints that summed loss. The state dict is
    written to ``data/cbow-model-weights`` after the last epoch.

    Raises:
        ValueError: if the input file yields no training samples.
    """
    artist_names = "data/artist-names-per-row.csv"
    model_path = "data/cbow-model-weights"
    # The script trains for 50 epochs; CBOW.num_epochs (3) is not used here.
    num_epochs = 50

    text = TextPreProcessor(artist_names)
    training_data = text.build_training_data()
    if not training_data:
        # Fail with a clear message instead of crashing later in the
        # backward pass when there is nothing to train on.
        raise ValueError(
            "no training samples could be built from {}".format(artist_names)
        )
    vocab = text.build_vocab()
    cbow = CBOW(vocab)

    loss_function = torch.nn.NLLLoss()
    optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

    # The index tensors do not change between epochs, so build them once.
    samples = [
        (
            make_context_vector(context, cbow.word_to_ix),
            torch.tensor([cbow.word_to_ix[target]]),
        )
        for context, target in training_data
    ]

    for epoch in range(num_epochs):
        total_loss = 0.0
        optimizer.zero_grad()

        for context_vector, target_index in samples:
            log_probs = cbow(context_vector)
            loss = loss_function(log_probs, target_index)
            # Gradients accumulate in .grad across backward() calls, so this
            # produces the gradient of the summed loss without keeping every
            # sample's autograd graph alive until the end of the epoch.
            loss.backward()
            total_loss += loss.item()

        # One parameter update per epoch, using the accumulated gradient.
        optimizer.step()

        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))

    torch.save(cbow.state_dict(), model_path)
    print("saved model!")


if __name__ == "__main__":
    main()
| |
|