|
|
import torch |
|
|
import torchaudio |
|
|
import numpy as np |
|
|
from decoder_base import AcousticModel |
|
|
|
|
|
class InferencePipeline():
    """End-to-end soft-VC voice conversion pipeline.

    Chains three pretrained components (all on CPU):
      HuBERT-soft content encoder -> acoustic model -> HiFi-GAN vocoder,
    converting an input utterance into the fixed target speaker's voice.
    """

    def __init__(self,
                 ckpt_path='model-best.pt',
                 spk_emb_path='content/vctk/spk_emb/p226/p226_322_mic1.npy'):
        """Load the pretrained models and the target speaker embedding.

        Args:
            ckpt_path: path to the acoustic-model checkpoint; the state dict
                is stored under the 'acoustic-model' key.
            spk_emb_path: .npy file holding the target speaker's embedding
                vector (defaults to VCTK speaker p226).
        """
        # Content encoder: HuBERT-soft from the soft-vc release.
        self.hubert = torch.hub.load(
            "bshall/hubert:main", "hubert_soft", trust_repo=True)

        # Acoustic model: maps (units, speaker embedding) -> mel spectrogram.
        self.model = AcousticModel()
        cp = torch.load(ckpt_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(cp['acoustic-model'])

        # Vocoder: HiFi-GAN trained on soft-VC mel spectrograms.
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main", "hifigan_hubert_soft",
            trust_repo=True, map_location=torch.device('cpu'))

        # Target speaker embedding, unsqueezed to (1, D) for a batch of one.
        self.trg_spk_emb = torch.from_numpy(np.load(spk_emb_path)).unsqueeze(0)

    def voice_conversion(self, audio_file_path):
        """Convert the utterance at *audio_file_path* to the target speaker.

        Args:
            audio_file_path: path to the source audio file.

        Returns:
            Path of the written 16 kHz WAV file ("output.wav").
        """
        self.model.eval()

        # BUG FIX: hubert.units() expects a (1, 1, T) waveform tensor, not a
        # file path. Load the audio, resample to the 16 kHz rate the models
        # were trained on, and add the batch dimension.
        source, sr = torchaudio.load(audio_file_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        source = source.unsqueeze(0)

        with torch.inference_mode():
            units = self.hubert.units(source)

            # generate() returns (batch, frames, mels); the vocoder expects
            # (batch, mels, frames), hence the transpose.
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)

            target = self.hifigan(mel)

        output_audio_path = "output.wav"
        # BUG FIX: HiFi-GAN emits (batch, channels, time); torchaudio.save
        # requires a 2-D (channels, time) tensor, so drop the batch dim.
        torchaudio.save(output_audio_path, target.squeeze(0).cpu(),
                        sample_rate=16000)

        return output_audio_path
|
|
|
|
|
|