# Hugging Face Space snapshot — commit da41dee ("fix performance log") by cjayic.
import torch, torchaudio
import gradio as gr
import time
from hifigan.generator import HifiganGenerator
from acoustic import AcousticModel
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
# --- Model setup: everything runs on CPU --------------------------------------

# Soft speech-unit encoder (HuBERT-Soft) from the bshall hub repo.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()

# Acoustic model: soft speech units -> mel spectrogram.
acoustic = AcousticModel(False, True)
ckpt = torch.load("models/acoustic-model-best.pt", map_location=torch.device('cpu'))
# Checkpoints appear to have been saved from a DataParallel wrapper — strip the
# "module." prefix so the state dict matches the bare model.
consume_prefix_in_state_dict_if_present(ckpt["acoustic-model"], "module.")
acoustic.load_state_dict(ckpt["acoustic-model"])
acoustic.eval()

# HiFi-GAN vocoder: mel spectrogram -> waveform.
# The published hub vocoder could be used instead of the local checkpoint:
# hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft").cpu()
hifigan = HifiganGenerator()
ckpt = torch.load("models/hifigan-model-best.pt", map_location=torch.device('cpu'))
consume_prefix_in_state_dict_if_present(ckpt["generator"]["model"], "module.")
hifigan.load_state_dict(ckpt["generator"]["model"])
hifigan.eval()
def run_conversion(audio_in):
    """Convert input speech to the target (Widowmaker) voice via soft-VC.

    Pipeline: resample to 16 kHz -> HuBERT soft speech units -> acoustic
    model (units -> mel) -> HiFi-GAN vocoder (mel -> waveform).

    Parameters
    ----------
    audio_in : tuple
        ``(sample_rate, samples)`` as produced by ``gr.Audio``; ``samples``
        is a numpy array, shaped ``(n,)`` for mono or ``(n, channels)``,
        usually int16 PCM.

    Returns
    -------
    tuple
        ``(16000, waveform)`` where ``waveform`` is an int16 numpy array,
        suitable for a ``gr.Audio`` output.
    """
    sr, samples = audio_in
    # torch.Tensor casts to float32 but does NOT rescale: integer PCM keeps
    # its ±32768 range, while the models expect floats in [-1, 1].
    source = torch.Tensor(samples)
    if hasattr(samples, "dtype") and samples.dtype.kind in "iu":
        source = source / 32768.0  # assumes 16-bit PCM — confirm for other bit depths
    if source.dim() == 1:
        source = source.unsqueeze(1)
    source = source.T  # -> (channels, n)
    # Resample to the 16 kHz rate the models were trained at.
    source = torchaudio.functional.resample(source, sr, 16000)
    # Downmix to mono, then add a batch dimension: final shape (1, 1, n).
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)
    with torch.inference_mode():
        time_start = time.perf_counter()
        # Extract soft speech units.
        units = hubert.units(source)
        # Generate the target mel spectrogram.
        mel = acoustic.generate(units).transpose(1, 2)
        # Vocode to a waveform.
        target = hifigan(mel)
        # Clamp before the int16 cast so out-of-range samples saturate
        # instead of wrapping around (audible clicks).
        result = (
            target.squeeze().cpu().clamp(-1.0, 1.0)
            .multiply(32767).to(torch.int16).numpy()
        )
        time_end = time.perf_counter()
        time_elapsed = time_end - time_start
        print(f"Conversion finished in {time_elapsed} Seconds")
    return (16000, result)
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    # Header / description.
    with gr.Column():
        gr.Markdown(
            """
# Soft-VC | Widowmaker
This is a [Soft-VC model](https://github.com/bshall/soft-vc) trained on Widowmaker from Overwatch, allowing the conversion of any voice to Widowmaker's voice. While lower quality (16kHz), it captures the character fairly well, imo.
For a multi-speaker model, check out my [sovits-overwatch2](https://huggingface.co/spaces/cjayic/sovits-overwatch2) space!
The acoustic model has been trained for around 100k iterations, the HiFiGAN-Model for around 150k iterations. Quality could likely be improved by training the HiFiGAN further.
""")

    # Two input paths (file upload / microphone), each with its own button.
    with gr.Column():
        with gr.Tab("Upload Audio File"):
            with gr.Column():
                input_audio = gr.Audio(label="Audio to be converted").style(container=False)
                btn_upload = gr.Button("Widowify", variant="primary").style(full_width=True)
        with gr.Tab("Record Audio"):
            with gr.Column():
                input_audio_record = gr.Audio(label="Audio to be converted", source="microphone").style(container=False)
                btn_rec = gr.Button("Widowify", variant="primary").style(full_width=True)

    # Shared output player.
    with gr.Row():
        output_audio = gr.Audio(label="Converted Audio", elem_id="output_audio", interactive=False).style(height="auto")

    # Both buttons feed the same conversion function and output component.
    btn_upload.click(run_conversion, [input_audio], output_audio)
    btn_rec.click(run_conversion, [input_audio_record], output_audio)

    # Pre-cached example clips, converted on click.
    gr.Examples(
        ["examples/jermacraft.wav", "examples/Mercy_0000000B0F5.wav", "examples/weartie.wav", "examples/gman_02.wav"],
        inputs=[input_audio],
        outputs=[output_audio],
        fn=run_conversion,
        cache_examples=True,
        run_on_click=True,
    )

demo.queue()
demo.launch()