Spaces build status: Build error
| import torch, torchaudio | |
| import gradio as gr | |
| import time | |
| from hifigan.generator import HifiganGenerator | |
| from acoustic import AcousticModel | |
| from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present | |
# ---------------------------------------------------------------------------
# Model loading — runs once at import time, CPU only.
# Pipeline: HuBERT-Soft (speech units) -> acoustic model (mel) -> HiFi-GAN.
# ---------------------------------------------------------------------------

# Content encoder: HuBERT-Soft extracts speech units from raw audio.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()

# Acoustic model: maps speech units to the target speaker's mel spectrogram.
acoustic = AcousticModel(False, True)
checkpoint = torch.load("models/acoustic-model-best.pt", map_location=torch.device('cpu'))
# Checkpoints saved from DistributedDataParallel prefix every key with
# "module."; strip it so the keys match this plain (non-DDP) model.
consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
acoustic.load_state_dict(checkpoint["acoustic-model"])
acoustic.eval()

# Vocoder: HiFi-GAN turns the mel spectrogram into a 16 kHz waveform.
hifigan = HifiganGenerator()
checkpoint = torch.load("models/hifigan-model-best.pt", map_location=torch.device('cpu'))
consume_prefix_in_state_dict_if_present(checkpoint["generator"]["model"], "module.")
hifigan.load_state_dict(checkpoint["generator"]["model"])
hifigan.eval()
def run_conversion(audio_in):
    """Convert an input recording to the target voice.

    Parameters
    ----------
    audio_in : tuple[int, numpy.ndarray]
        ``(sample_rate, samples)`` as produced by ``gr.Audio`` — samples may
        be mono ``(n,)`` or channel-last multi-channel ``(n, channels)``.

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(16000, waveform)`` with an int16 waveform, ready for ``gr.Audio``.
    """
    sr, source = audio_in
    source = torch.Tensor(source)
    # Gradio delivers raw int16 PCM; the models work in [-1, 1] (the output
    # is rescaled by 32767 below). Normalize only when the data is clearly
    # un-normalized, so already-float input passes through unchanged.
    if source.abs().max() > 1.0:
        source = source / 32768.0
    # Normalize layout to channel-first (channels, samples) for torchaudio.
    if source.dim() == 1:
        source = source.unsqueeze(1)
    source = source.T
    # HuBERT expects 16 kHz audio.
    source = torchaudio.functional.resample(source, sr, 16000)
    # Mix down to mono, then add a batch dimension -> (1, 1, samples).
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)
    with torch.inference_mode():
        time_start = time.perf_counter()
        # Extract speech units.
        units = hubert.units(source)
        # Generate target spectrogram.
        mel = acoustic.generate(units).transpose(1, 2)
        # Generate audio waveform.
        target = hifigan(mel)
        # Rescale the float waveform to int16 for Gradio playback.
        result = target.squeeze().cpu().multiply(32767).to(torch.int16).numpy()
        time_end = time.perf_counter()
        time_elapsed = time_end - time_start
        print(f"Conversion finished in {time_elapsed} Seconds")
    return (16000, result)
# ---------------------------------------------------------------------------
# Gradio UI: two input tabs (file upload / microphone) sharing one output.
# NOTE(review): `.style()` and `gr.Audio(source=...)` were removed in
# Gradio 4.x — if this Space shows a build error, the pinned gradio version
# is the first thing to verify.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """
# Soft-VC | Widowmaker
This is a [Soft-VC model](https://github.com/bshall/soft-vc) trained on Widowmaker from Overwatch, allowing the conversion of any voice to Widowmaker's voice. While lower quality (16kHz), it captures the character fairly well, imo.
For a multi-speaker model, check out my [sovits-overwatch2](https://huggingface.co/spaces/cjayic/sovits-overwatch2) space!
The acoustic model has been trained for around 100k iterations, the HiFiGAN-Model for around 150k iterations. Quality could likely be improved by training the HiFiGAN further.
"""
        )
    with gr.Column():
        with gr.Tab("Upload Audio File"):
            with gr.Column():
                input_audio = gr.Audio(
                    label="Audio to be converted",
                ).style(
                    container=False,
                )
                btn_upload = gr.Button("Widowify", variant="primary").style(full_width=True)
        with gr.Tab("Record Audio"):
            with gr.Column():
                input_audio_record = gr.Audio(
                    label="Audio to be converted",
                    source="microphone"
                ).style(
                    container=False,
                )
                btn_rec = gr.Button("Widowify", variant="primary").style(full_width=True)
    with gr.Row():
        output_audio = gr.Audio(
            label="Converted Audio",
            elem_id="output_audio",
            interactive=False
        ).style(height="auto")
    # Both buttons route through the same conversion function.
    btn_upload.click(run_conversion, [input_audio], output_audio)
    btn_rec.click(run_conversion, [input_audio_record], output_audio)
    # Pre-cached examples so visitors can hear results without uploading.
    gr.Examples(
        ["examples/jermacraft.wav", "examples/Mercy_0000000B0F5.wav", "examples/weartie.wav", "examples/gman_02.wav"],
        inputs=[input_audio],
        outputs=[output_audio],
        fn=run_conversion,
        cache_examples=True,
        run_on_click=True
    )

demo.queue()
demo.launch()