Initial commit

0a97d6c over 1 year ago

1.3 kB

	from transformers import HubertModel
	import torch.nn as nn
	import torch
	import torch.nn.functional as F
	import librosa


	class HubertModelWithFinalProj(HubertModel):
	    """``HubertModel`` subclass that also declares a ``final_proj`` layer.

	    The projection exists only for backward compatibility; see
	    https://github.com/auspicious3000/contentvec/issues/6. Per that
	    discussion, leaving the projection out of the computation is what
	    gives the desired features. This class only constructs the layer and
	    does not override the forward pass.
	    """

	    def __init__(self, config):
	        """Build the base model, then attach the ``final_proj`` layer.

	        Args:
	            config: Model configuration; ``hidden_size`` and
	                ``classifier_proj_size`` are read from it to size the
	                projection.
	        """
	        super().__init__(config)

	        in_features = config.hidden_size
	        out_features = config.classifier_proj_size
	        self.final_proj = nn.Linear(in_features, out_features)


	def get_content_model(config='lengyue233/content-vec-best'):
	    """Load the pretrained content encoder and put it in evaluation mode.

	    Args:
	        config: Value forwarded unchanged to
	            ``HubertModelWithFinalProj.from_pretrained`` (despite the
	            parameter name, the default is a model identifier string).

	    Returns:
	        The loaded ``HubertModelWithFinalProj`` in ``eval()`` mode.
	    """
	    content_model = HubertModelWithFinalProj.from_pretrained(config)
	    # ``eval()`` switches the module to inference mode and returns it.
	    return content_model.eval()


	@torch.no_grad()
	def get_content(model, wav_16k_tensor, device='cuda'):
	    """Run ``model`` on a waveform and return its ``last_hidden_state``.

	    The waveform is moved to ``device`` and zero-padded on both sides of
	    its last dimension by ``(400 - 320) // 2`` samples, so that the output
	    length will be ``len(audio) // 320``. Gradient tracking is disabled
	    for the whole call.

	    Args:
	        model: Callable whose result supports ``['last_hidden_state']``.
	        wav_16k_tensor: Waveform tensor; the name suggests 16 kHz audio
	            (presumably shaped ``(batch, samples)`` -- confirm with callers).
	        device: Device the waveform is moved to before the forward pass.

	    Returns:
	        The ``'last_hidden_state'`` entry of the model output.
	    """
	    half_pad = (400 - 320) // 2
	    padded_wav = F.pad(wav_16k_tensor.to(device), (half_pad, half_pad))
	    return model(padded_wav)['last_hidden_state']


	if __name__ == '__main__':
	    # Smoke test: extract content features for the first 100 frames
	    # (100 * 320 samples) of ``test.wav`` and print them.
	    # Use the GPU when present, otherwise fall back to CPU instead of
	    # crashing on machines without CUDA.
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    model = get_content_model().to(device)

	    # The file is resampled to 16 kHz on load.
	    audio, _ = librosa.load('test.wav', sr=16000)
	    audio = audio[:100 * 320]
	    # Add a leading batch dimension. ``torch.from_numpy`` avoids the slow
	    # list-of-ndarrays path (and its UserWarning) of ``torch.tensor([audio])``.
	    audio = torch.from_numpy(audio).unsqueeze(0)

	    content = get_content(model, audio, device)
	    print(content)