import torch
import soundfile
from openvoice import utils
from openvoice import commons
import os
import librosa
from openvoice.mel_processing import spectrogram_torch
from openvoice.models import SynthesizerTrn
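

# High-level tone-color (voice timbre) conversion API: a reference encoder
# extracts a speaker embedding from reference audio, and the synthesizer
# re-renders source speech using a target speaker embedding.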
class OpenVoiceBaseClass(object):
    def __init__(self, config_path, device="cuda:0"):
        if "cuda" in device:
            assert torch.cuda.is_available()

        # Build the synthesizer from the hyperparameters in the config file.
        hps = utils.get_hparams_from_file(config_path)
        model = SynthesizerTrn(
            len(getattr(hps, "symbols", [])),
            hps.data.filter_length // 2 + 1,
            n_speakers=hps.data.n_speakers,
            **hps.model,
        ).to(device)

        model.eval()
        self.model = model
        self.hps = hps
        self.device = device

    def load_ckpt(self, ckpt_path):
        # Load model weights; strict=False tolerates missing/unexpected keys
        # between the checkpoint and the current architecture.
        checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
        a, b = self.model.load_state_dict(checkpoint_dict["model"], strict=False)
        print("Loaded checkpoint '{}'".format(ckpt_path))
        print("missing/unexpected keys:", a, b)


class ToneColorConverter(OpenVoiceBaseClass):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.version = getattr(self.hps, "_version_", "v1")

    def extract_se(self, ref_wav_list, se_save_path=None):
        # Accept a single wav path or a list of reference wav paths.
        if isinstance(ref_wav_list, str):
            ref_wav_list = [ref_wav_list]

        device = self.device
        hps = self.hps
        gs = []

        for fname in ref_wav_list:
            # Load the reference audio and compute its linear spectrogram.
            audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
            y = torch.FloatTensor(audio_ref)
            y = y.to(device)
            y = y.unsqueeze(0)
            y = spectrogram_torch(
                y,
                hps.data.filter_length,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                center=False,
            ).to(device)
            with torch.no_grad():
                # The reference encoder maps the spectrogram to a speaker embedding.
                g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
                gs.append(g.detach())

        # Average the embeddings over all reference clips.
        gs = torch.stack(gs).mean(0)

        if se_save_path is not None:
            os.makedirs(os.path.dirname(se_save_path), exist_ok=True)
            torch.save(gs.cpu(), se_save_path)

        return gs

    def convert(
        self,
        audio_src_path,
        src_se,
        tgt_se,
        output_path=None,
        tau=0.3,
    ):
        hps = self.hps

        # Load the source audio and compute its linear spectrogram.
        audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
        audio = torch.tensor(audio).float()

        with torch.no_grad():
            y = torch.FloatTensor(audio).to(self.device)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(
                y,
                hps.data.filter_length,
                hps.data.sampling_rate,
                hps.data.hop_length,
                hps.data.win_length,
                center=False,
            ).to(self.device)
            spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)

            # Re-synthesize the speech, replacing the source speaker embedding
            # with the target one; tau is forwarded to the model unchanged.
            audio = (
                self.model.voice_conversion(
                    spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau
                )[0][0, 0]
                .data.cpu()
                .float()
                .numpy()
            )

        if output_path is None:
            return audio
        else:
            soundfile.write(output_path, audio, hps.data.sampling_rate)
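

# Minimal usage sketch (not part of the library): the checkpoint, config, and
# wav paths below are placeholders you would replace with your own files.
if __name__ == "__main__":
    converter = ToneColorConverter("checkpoints/converter/config.json", device="cuda:0")
    converter.load_ckpt("checkpoints/converter/checkpoint.pth")

    # Speaker embeddings for the source and target voices, each extracted
    # from one or more reference recordings.
    src_se = converter.extract_se(["source_speaker.wav"])
    tgt_se = converter.extract_se(["target_speaker.wav"])

    # Re-render the source speech with the target speaker's tone color.
    converter.convert(
        "source_speech.wav",
        src_se,
        tgt_se,
        output_path="converted.wav",
        tau=0.3,
    )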