# Hugging Face Space snapshot — commit da41dee ("fix performance log") by cjayic.
import torch, torchaudio
import gradio as gr
import time
from hifigan.generator import HifiganGenerator
from acoustic import AcousticModel
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
# --- Model setup: everything runs on CPU --------------------------------------

# Soft speech-unit encoder (HuBERT-Soft) from the bshall hub repo.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()

# Acoustic model: soft speech units -> mel spectrogram.
acoustic = AcousticModel(False, True)
ckpt = torch.load("models/acoustic-model-best.pt", map_location=torch.device('cpu'))
# Checkpoints appear to have been saved from a DataParallel wrapper — strip the
# "module." prefix so the state dict matches the bare model.
consume_prefix_in_state_dict_if_present(ckpt["acoustic-model"], "module.")
acoustic.load_state_dict(ckpt["acoustic-model"])
acoustic.eval()

# HiFi-GAN vocoder: mel spectrogram -> waveform.
# The published hub vocoder could be used instead of the local checkpoint:
# hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft").cpu()
hifigan = HifiganGenerator()
ckpt = torch.load("models/hifigan-model-best.pt", map_location=torch.device('cpu'))
consume_prefix_in_state_dict_if_present(ckpt["generator"]["model"], "module.")
hifigan.load_state_dict(ckpt["generator"]["model"])
hifigan.eval()
def run_conversion(audio_in):
    """Convert input speech to the target (Widowmaker) voice via soft-VC.

    Pipeline: resample to 16 kHz -> HuBERT soft speech units -> acoustic
    model (units -> mel) -> HiFi-GAN vocoder (mel -> waveform).

    Parameters
    ----------
    audio_in : tuple
        ``(sample_rate, samples)`` as produced by ``gr.Audio``; ``samples``
        is a numpy array, shaped ``(n,)`` for mono or ``(n, channels)``,
        usually int16 PCM.

    Returns
    -------
    tuple
        ``(16000, waveform)`` where ``waveform`` is an int16 numpy array,
        suitable for a ``gr.Audio`` output.
    """
    sr, samples = audio_in
    # torch.Tensor casts to float32 but does NOT rescale: integer PCM keeps
    # its ±32768 range, while the models expect floats in [-1, 1].
    source = torch.Tensor(samples)
    if hasattr(samples, "dtype") and samples.dtype.kind in "iu":
        source = source / 32768.0  # assumes 16-bit PCM — confirm for other bit depths
    if source.dim() == 1:
        source = source.unsqueeze(1)
    source = source.T  # -> (channels, n)
    # Resample to the 16 kHz rate the models were trained at.
    source = torchaudio.functional.resample(source, sr, 16000)
    # Downmix to mono, then add a batch dimension: final shape (1, 1, n).
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)
    with torch.inference_mode():
        time_start = time.perf_counter()
        # Extract soft speech units.
        units = hubert.units(source)
        # Generate the target mel spectrogram.
        mel = acoustic.generate(units).transpose(1, 2)
        # Vocode to a waveform.
        target = hifigan(mel)
        # Clamp before the int16 cast so out-of-range samples saturate
        # instead of wrapping around (audible clicks).
        result = (
            target.squeeze().cpu().clamp(-1.0, 1.0)
            .multiply(32767).to(torch.int16).numpy()
        )
        time_end = time.perf_counter()
        time_elapsed = time_end - time_start
        print(f"Conversion finished in {time_elapsed} Seconds")
    return (16000, result)
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    # Header / description.
    with gr.Column():
        gr.Markdown(
            """
# Soft-VC | Widowmaker
This is a [Soft-VC model](https://github.com/bshall/soft-vc) trained on Widowmaker from Overwatch, allowing the conversion of any voice to Widowmaker's voice. While lower quality (16kHz), it captures the character fairly well, imo.
For a multi-speaker model, check out my [sovits-overwatch2](https://huggingface.co/spaces/cjayic/sovits-overwatch2) space!
The acoustic model has been trained for around 100k iterations, the HiFiGAN-Model for around 150k iterations. Quality could likely be improved by training the HiFiGAN further.
""")

    # Two input paths (file upload / microphone), each with its own button.
    with gr.Column():
        with gr.Tab("Upload Audio File"):
            with gr.Column():
                input_audio = gr.Audio(label="Audio to be converted").style(container=False)
                btn_upload = gr.Button("Widowify", variant="primary").style(full_width=True)
        with gr.Tab("Record Audio"):
            with gr.Column():
                input_audio_record = gr.Audio(label="Audio to be converted", source="microphone").style(container=False)
                btn_rec = gr.Button("Widowify", variant="primary").style(full_width=True)

    # Shared output player.
    with gr.Row():
        output_audio = gr.Audio(label="Converted Audio", elem_id="output_audio", interactive=False).style(height="auto")

    # Both buttons feed the same conversion function and output component.
    btn_upload.click(run_conversion, [input_audio], output_audio)
    btn_rec.click(run_conversion, [input_audio_record], output_audio)

    # Pre-cached example clips, converted on click.
    gr.Examples(
        ["examples/jermacraft.wav", "examples/Mercy_0000000B0F5.wav", "examples/weartie.wav", "examples/gman_02.wav"],
        inputs=[input_audio],
        outputs=[output_audio],
        fn=run_conversion,
        cache_examples=True,
        run_on_click=True,
    )

demo.queue()
demo.launch()