import torch
from torch import nn
from torch.nn.utils import clip_grad_norm_


class SpeakerEncoder(nn.Module):
    """Learns a fixed-size speaker representation from speech utterances of
    arbitrary length, following the GE2E approach.
    """

    def __init__(self, device, loss_device):
        super().__init__()
        self.loss_device = loss_device

        # 3-layer LSTM: takes 80-channel log-mel spectrogram frames as input
        # and projects them to a 256-dimensional hidden state.
        self.lstm = nn.LSTM(
            input_size=80,
            hidden_size=256,
            num_layers=3,
            batch_first=True,
            dropout=0,
            bidirectional=False
        ).to(device)
        self.linear = nn.Linear(in_features=256, out_features=256).to(device)
        self.relu = nn.ReLU().to(device)

        # Epsilon term for numerical stability (i.e. avoids division by zero).
        self.epsilon = 1e-5

        # Learnable scale and offset applied to the cosine similarity matrix.
        # The tensors are created directly on loss_device: wrapping
        # nn.Parameter(...).to(loss_device) would return a plain tensor
        # rather than a registered parameter.
        self.sim_weight = nn.Parameter(torch.tensor([5.], device=loss_device))
        self.sim_bias = nn.Parameter(torch.tensor([-1.], device=loss_device))

    def forward(self, utterances, h_init=None, c_init=None):
        """
        Computes speaker embeddings, implementing section 2.1 of
        https://arxiv.org/pdf/1806.04558.pdf.
        :param utterances: batch of mel spectrograms, shape (batch, frames, 80)
        :return: L2-normalized speaker embeddings, shape (batch, 256)
        """
        if h_init is None or c_init is None:
            out, (hidden, cell) = self.lstm(utterances)
        else:
            out, (hidden, cell) = self.lstm(utterances, (h_init, c_init))

        # Compute the speaker embedding from the final layer's last hidden state.
        final_hidden = hidden[-1]
        speaker_embedding = self.relu(self.linear(final_hidden))

        # L2-normalize the speaker embedding.
        speaker_embedding = speaker_embedding / (torch.norm(speaker_embedding, dim=1, keepdim=True) + self.epsilon)
        return speaker_embedding

    def gradient_clipping(self):
        # Scale down the gradients on the similarity weight and bias by 0.01,
        # as recommended in the GE2E paper.
        self.sim_weight.grad *= 0.01
        self.sim_bias.grad *= 0.01

        # Clip all gradients so their global L2 norm does not exceed max_norm.
        clip_grad_norm_(self.parameters(), max_norm=3, norm_type=2)

    def similarity_matrix(self, embeds, debug=False):
        """
        Computes S_ji,k from section 2.1 of the GE2E paper: the cosine
        similarity between each utterance embedding and each speaker centroid.
        Embeddings are expected to already live on loss_device.
        :param embeds: shape (speakers, utterances, embedding size)
        :return: similarity matrix, shape (speakers, utterances, speakers)
        """
        # Speaker centroids: the average of the utterance embeddings for each
        # speaker. Used for negative examples (comparing an utterance against
        # a false speaker). Equation 1 in the paper.
        # size: (speakers, 1, embedding size)
        speaker_centroid = torch.mean(embeds, dim=1, keepdim=True)

        # Utterance-exclusive centroids: the average of a speaker's utterance
        # embeddings, excluding the ith utterance. Used for positive examples
        # (comparing an utterance against its true speaker; the centroid
        # excludes the utterance itself). Equation 8 in the paper.
        # size: (speakers, utterances, embedding size)
        num_utterance = embeds.shape[1]
        utter_ex_centroid = (torch.sum(embeds, dim=1, keepdim=True) - embeds) / (num_utterance - 1)

        if debug:
            print("e", embeds.shape)
            print(embeds)
            print("sc", speaker_centroid.shape)
            print(speaker_centroid)
            print("uc", utter_ex_centroid.shape)
            print(utter_ex_centroid)

        # Build index masks for the positive (j == k) and negative (j != k)
        # utterance/speaker pairs.
        num_speaker = embeds.shape[0]
        i = torch.eye(num_speaker, dtype=torch.bool)
        pos_mask = torch.where(i)
        neg_mask = torch.where(~i)

        if debug:
            print("pm", len(pos_mask), len(pos_mask[0]))
            print(pos_mask)
            print("nm", len(neg_mask), len(neg_mask[0]))
            print(neg_mask)

        # Compile the similarity matrix.
        # Final size: (speakers, utterances, speakers); it is first built as
        # (speakers, speakers, utterances) for easier vectorization.
        sim_matrix = torch.zeros(num_speaker, num_speaker, num_utterance, device=self.loss_device)
        sim_matrix[pos_mask] = nn.functional.cosine_similarity(embeds, utter_ex_centroid, dim=2)
        sim_matrix[neg_mask] = nn.functional.cosine_similarity(embeds[neg_mask[0]], speaker_centroid[neg_mask[1]], dim=2)

        if debug:
            print("sm", sim_matrix.shape)
            print("pos vals", sim_matrix[pos_mask])
            print("neg vals", sim_matrix[neg_mask])
            print(sim_matrix)

        sim_matrix = sim_matrix.permute(0, 2, 1)

        if debug:
            print("sm", sim_matrix.shape)
            print(sim_matrix)
            print("cos sim weight", self.sim_weight)
            print("cos sim bias", self.sim_bias)

        # Apply the learnable weight and bias.
        sim_matrix = sim_matrix * self.sim_weight + self.sim_bias
        return sim_matrix

    def softmax_loss(self, embeds):
        """
        Computes the softmax loss as defined by equation 6 in the GE2E paper.
        Per section 2.1 of the paper, this variant performs slightly better
        on text-independent speaker verification tasks.
        :param embeds: shape (speakers, utterances, embedding size)
        :return: computed softmax loss
        """
        speaker_count = embeds.shape[0]
        # size: (speakers, utterances, speakers)
        similarities = self.similarity_matrix(embeds)

        # Equation 6: -S_ji,j + log(sum_k exp(S_ji,k)); logsumexp is used for
        # numerical stability.
        loss_matrix = -similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)] + \
                      torch.logsumexp(similarities, dim=2)

        # Equation 10: sum over all speakers and utterances.
        return torch.sum(loss_matrix)

    def contrast_loss(self, embeds):
        """
        Computes the contrast loss as defined by equation 7 in the GE2E paper.
        Per section 2.1 of the paper, this variant performs slightly better
        on text-dependent speaker verification tasks.
        :param embeds: shape (speakers, utterances, embedding size)
        :return: computed contrast loss
        """
        speaker_count, utterance_count = embeds.shape[0:2]
        # size: (speakers, utterances, speakers)
        similarities = self.similarity_matrix(embeds)

        # Mask out the true speaker (k == j) so the max below runs only over
        # the other speakers (k != j).
        mask = torch.ones(similarities.shape, dtype=torch.bool)
        mask[torch.arange(speaker_count), :, torch.arange(speaker_count)] = False
        closest_neighbors, _ = torch.max(similarities[mask].reshape(speaker_count, utterance_count, speaker_count - 1), dim=2)

        # Similarity of each utterance to its own speaker's centroid.
        matching_embedding = similarities[torch.arange(0, speaker_count), :, torch.arange(0, speaker_count)]

        # Equation 7: 1 - sigmoid(S_ji,j) + max_{k != j} sigmoid(S_ji,k);
        # sigmoid is monotonic, so the max can be taken before applying it.
        loss_matrix = 1 - torch.sigmoid(matching_embedding) + torch.sigmoid(closest_neighbors)

        # Equation 10: sum over all speakers and utterances.
        return torch.sum(loss_matrix)

    def accuracy(self, embeds):
        """
        Computes argmax accuracy of assigning each utterance to its speaker.
        :param embeds: shape (speakers, utterances, embedding size)
        :return: accuracy
        """
        num_speaker, num_utter = embeds.shape[:2]
        similarities = self.similarity_matrix(embeds)
        preds = torch.argmax(similarities, dim=2)
        preds_one_hot = torch.nn.functional.one_hot(preds, num_classes=num_speaker)

        # Ground truth: utterance (j, i) belongs to speaker j. Created on the
        # same device as the predictions to avoid a device mismatch.
        actual = torch.arange(num_speaker, device=preds.device).unsqueeze(1).repeat(1, num_utter)
        actual_one_hot = torch.nn.functional.one_hot(actual, num_classes=num_speaker)
        return torch.sum(preds_one_hot * actual_one_hot) / (num_speaker * num_utter)
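

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original module): builds the
    # encoder on CPU and pushes random data through the forward pass, the
    # similarity matrix, both losses, and the accuracy metric. The batch
    # layout (speakers x utterances) and the 80-channel mel input follow the
    # shape conventions documented above; the sizes here are arbitrary.
    device = loss_device = torch.device("cpu")
    encoder = SpeakerEncoder(device, loss_device)

    speakers, utterances, frames = 4, 5, 160
    # Fake batch of log-mel spectrograms: (batch, frames, 80 mel channels).
    mels = torch.randn(speakers * utterances, frames, 80)
    embeds = encoder(mels).view(speakers, utterances, -1)

    print("embeddings:", embeds.shape)                             # (4, 5, 256)
    print("similarity:", encoder.similarity_matrix(embeds).shape)  # (4, 5, 4)
    print("softmax loss:", encoder.softmax_loss(embeds).item())
    print("contrast loss:", encoder.contrast_loss(embeds).item())
    print("accuracy:", encoder.accuracy(embeds).item())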