tkell
/

tracklist-artist-to-vec

Model card Files Files and versions

tracklist-artist-to-vec / trainer.py

Thor Kell

add python code

628e563 over 2 years ago

history blame contribute delete

4.19 kB

	import csv
	import torch
	from torchtext.vocab import build_vocab_from_iterator


	class TextPreProcessor:
	    """Turn a CSV of artist names into CBOW training pairs and a vocabulary.

	    Every CSV row is treated as one ordered sequence of names. Names are
	    lower-cased both when building training pairs and when building the
	    vocabulary, so every training token is guaranteed to be a vocabulary key.
	    """

	    def __init__(self, input_file):
	        """Store the CSV path; the file is only read by the build methods.

	        Args:
	            input_file: Path to a UTF-8 encoded CSV file, one sequence per row.
	        """
	        self.input_file = input_file
	        # Number of neighbours taken on each side of the target word.
	        self.context_size = 1

	    def build_training_data(self):
	        """Build ``(context, target)`` pairs for every interior row position.

	        For each position that has ``context_size`` neighbours on both sides,
	        two pairs are emitted: one with the context in reading order and one
	        with the context reversed. Rows that are too short produce nothing.

	        Returns:
	            A list of ``(list_of_context_words, target_word)`` tuples, all
	            lower-cased.
	        """
	        size = self.context_size
	        data = []
	        for row in self._generate_rows(lowercase=True):
	            for i in range(size, len(row) - size):
	                target = row[i]
	                # Neighbours to the left followed by neighbours to the right.
	                context = row[i - size:i] + row[i + 1:i + 1 + size]
	                data.append((context, target))
	                data.append((context[::-1], target))

	        return data

	    def build_vocab(self):
	        """Build the vocabulary from every name in the CSV.

	        Rows are lower-cased first so the vocabulary keys match the tokens
	        produced by ``build_training_data``.

	        Returns:
	            The object returned by ``build_vocab_from_iterator`` with
	            ``"<unk>"`` registered as a special token and ``min_freq=1``.
	        """
	        return build_vocab_from_iterator(
	            self._generate_rows(lowercase=True), specials=["<unk>"], min_freq=1
	        )

	    def _generate_rows(self, lowercase=False):
	        """Lazily yield the rows of the CSV file as lists of strings.

	        Args:
	            lowercase: When true, every cell is lower-cased before the row is
	                yielded. Defaults to false, which yields rows untouched.

	        Yields:
	            One list of cell strings per CSV row.
	        """
	        # newline="" lets the csv module handle line endings itself, as the
	        # csv documentation requires for files passed to csv.reader.
	        with open(self.input_file, encoding="utf-8", newline="") as f:
	            for row in csv.reader(f):
	                yield [cell.lower() for cell in row] if lowercase else row


	class CBOW(torch.nn.Module):
	    """Continuous bag-of-words network over a fixed vocabulary.

	    The context word indices are embedded, the embeddings are summed into a
	    single vector, and that vector goes through Linear -> ReLU -> Linear ->
	    LogSoftmax to give log-probabilities over the vocabulary.
	    """

	    def __init__(self, vocab, embedding_dim=100, hidden_dim=128):
	        """Create the layers and index lookups for ``vocab``.

	        Args:
	            vocab: Vocabulary object providing ``get_stoi()``, ``get_itos()``
	                and ``len()``.
	            embedding_dim: Size of each embedding vector (default 100).
	            hidden_dim: Width of the hidden linear layer (default 128).
	        """
	        super().__init__()
	        self.num_epochs = 3
	        self.context_size = 1  # 1 word to the left, 1 to the right
	        self.embedding_dim = embedding_dim  # embedding vector size
	        self.learning_rate = 0.001
	        # Preferred device; this class only records it and does not move
	        # itself there.
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	        self.vocab = vocab
	        self.word_to_ix = self.vocab.get_stoi()
	        self.ix_to_word = self.vocab.get_itos()
	        self.vocab_list = list(self.word_to_ix.keys())
	        self.vocab_size = len(self.vocab)

	        self.model = None

	        # out: 1 x embedding_dim
	        # Embedding matrix with one row per vocabulary entry.
	        self.embeddings = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
	        self.linear1 = torch.nn.Linear(self.embedding_dim, hidden_dim)
	        self.activation_function1 = torch.nn.ReLU()

	        # out: 1 x vocab_size
	        self.linear2 = torch.nn.Linear(hidden_dim, self.vocab_size)
	        self.activation_function2 = torch.nn.LogSoftmax(dim=-1)

	    def forward(self, inputs):
	        """Return log-probabilities over the vocabulary for one context.

	        Args:
	            inputs: Tensor of context word indices, as produced by
	                ``make_context_vector`` (one index per context word).

	        Returns:
	            A ``1 x vocab_size`` tensor of log-probabilities.
	        """
	        # Sum the context embeddings along the word axis in a single tensor
	        # op instead of iterating over rows with Python's built-in sum.
	        embeds = self.embeddings(inputs).sum(dim=0).view(1, -1)
	        out = self.linear1(embeds)
	        out = self.activation_function1(out)
	        out = self.linear2(out)
	        out = self.activation_function2(out)
	        return out

	    def get_word_emdedding(self, word):
	        """Look up the embedding vector of a single vocabulary word.

	        Args:
	            word: A key of ``word_to_ix``.

	        Returns:
	            A ``1 x embedding_dim`` tensor.

	        Raises:
	            KeyError: If ``word`` is not in the vocabulary.
	        """
	        # Build the index on the same device as the embedding weights so the
	        # lookup keeps working if the module has been moved off the CPU.
	        index = torch.tensor(
	            [self.word_to_ix[word]],
	            dtype=torch.long,
	            device=self.embeddings.weight.device,
	        )
	        return self.embeddings(index).view(1, -1)

	    # Correctly spelled alias; the original misspelled name stays available
	    # for existing callers.
	    get_word_embedding = get_word_emdedding


	def make_context_vector(context, word_to_ix):
	    """Convert context words into a tensor of their vocabulary indices.

	    Args:
	        context: Iterable of words, each of which must be a key of
	            ``word_to_ix``.
	        word_to_ix: Mapping from word to integer index.

	    Returns:
	        A ``torch.long`` tensor holding one index per context word, in order.

	    Raises:
	        KeyError: If a context word is missing from ``word_to_ix``.
	    """
	    indices = []
	    for word in context:
	        indices.append(word_to_ix[word])
	    return torch.tensor(indices, dtype=torch.long)


	def main():
	    """Train the CBOW model on the artist CSV and save its weights.

	    Reads ``data/artist-names-per-row.csv``, trains for 50 epochs with one
	    SGD step per epoch, prints the summed loss after each epoch, and writes
	    the model's ``state_dict`` to ``data/cbow-model-weights``.

	    Raises:
	        ValueError: If the CSV yields no training pairs.
	    """
	    artist_names = "data/artist-names-per-row.csv"
	    model_path = "data/cbow-model-weights"
	    text = TextPreProcessor(artist_names)
	    training_data = text.build_training_data()
	    if not training_data:
	        # Without this guard the epoch loop would have no loss to
	        # backpropagate and would fail with an unhelpful error.
	        raise ValueError(
	            "no training pairs could be built from {}".format(artist_names)
	        )
	    vocab = text.build_vocab()
	    cbow = CBOW(vocab)

	    loss_function = torch.nn.NLLLoss()
	    # Use the rate stored on the model rather than a duplicated literal.
	    optimizer = torch.optim.SGD(cbow.parameters(), lr=cbow.learning_rate)

	    # 50 to start with, no correct answer here
	    for epoch in range(50):
	        # Running sum of the per-sample losses, kept as a plain float.
	        total_loss = 0.0
	        optimizer.zero_grad()

	        for context, target in training_data:
	            context_vector = make_context_vector(context, cbow.word_to_ix)

	            # Log-probabilities over the vocabulary for this context.
	            log_probs = cbow(context_vector)

	            # Negative log-likelihood of the true target word.
	            loss = loss_function(
	                log_probs, torch.tensor([cbow.word_to_ix[target]])
	            )

	            # Backpropagate per sample: gradients accumulate in ``.grad``,
	            # so the single step below still uses the gradient of the
	            # summed epoch loss, but each sample's graph is freed at once
	            # instead of being retained for the whole dataset.
	            loss.backward()
	            total_loss += loss.item()

	        # optimize at the end of each epoch
	        optimizer.step()

	        # Log out some metrics to see if loss decreases
	        print("end of epoch {} | loss {:2.3f}".format(epoch, total_loss))

	    torch.save(cbow.state_dict(), model_path)
	    print("saved model!")


	if __name__ == "__main__":
	    main()