Spaces:
Build error
Build error
| import os | |
| import json | |
| import math | |
| import torch | |
| import torchaudio | |
| from torch import nn | |
| from torch.nn import functional as F | |
| from torch.utils.data import DataLoader | |
| import commons | |
| import utils | |
| from data_utils import UnitAudioLoader, UnitAudioCollate | |
| from models import SynthesizerTrn | |
| import gradio | |
# Content encoder: the "hubert_soft" entry point of the bshall/hubert
# repository, fetched through torch.hub.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")

# Hyperparameters for the synthesizer, read from the JSON config.
hps = utils.get_hparams_from_file("configs/sovits_ow2.json")

# Positional constructor arguments, both derived from the config.
# NOTE(review): presumably the spectrogram bin count and the training segment
# length in frames -- confirm against SynthesizerTrn's signature.
_spec_channels = hps.data.filter_length // 2 + 1
_segment_frames = hps.train.segment_size // hps.data.hop_length

net_g = SynthesizerTrn(
    _spec_channels,
    _segment_frames,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)

# Switch to evaluation mode first, then load the generator checkpoint.
# No optimizer is passed (third argument is None).
net_g.eval()
utils.load_checkpoint("logs/ow2/G_195000.pth", net_g, None)
def infer(md, mic_audio, audio, speaker_id, pitch_shift, length_scale, noise_scale=.667, noise_scale_w=0.8):
    """Convert an input recording into the selected target speaker's voice.

    Args:
        md: Value supplied for the Markdown component that is listed first in
            the interface inputs. Unused.
        mic_audio: Microphone recording as a ``(sample_rate, samples)`` pair,
            or a falsy value when nothing was recorded. ``samples`` is treated
            as ``(n_samples,)`` or ``(n_samples, n_channels)``.
        audio: Path to an uploaded audio file, or a falsy value. When both
            inputs are given, the uploaded file wins.
        speaker_id: Index of the target speaker passed to the synthesizer.
        pitch_shift: Pitch shift applied to the input before unit extraction,
            in steps (12 per octave); truncated to an integer.
        length_scale: Forwarded to ``net_g.infer``.
        noise_scale: Forwarded to ``net_g.infer``.
        noise_scale_w: Forwarded to ``net_g.infer``.

    Returns:
        A ``(22050, waveform)`` tuple where ``waveform`` is a 1-D float32
        NumPy array.

    Raises:
        ValueError: If neither a microphone recording nor a file was given.
    """
    source = None
    sr = None

    if mic_audio:
        sr, samples = mic_audio
        samples = torch.as_tensor(samples)
        if samples.is_floating_point():
            source = samples.float()
        else:
            # Integer PCM: scale into [-1, 1) so the microphone path matches
            # the normalised float waveform that torchaudio.load returns.
            # NOTE(review): assumes signed integer samples (e.g. int16).
            full_scale = float(torch.iinfo(samples.dtype).max) + 1.0
            source = samples.float() / full_scale
        if source.dim() == 1:
            source = source.unsqueeze(1)
        # (n_samples, n_channels) -> (n_channels, n_samples)
        source = source.T

    if audio:
        # An uploaded file overrides the microphone recording.
        source, sr = torchaudio.load(audio)

    if source is None:
        # Fail early with a readable message instead of an opaque error from
        # the audio transforms below.
        raise ValueError(
            "No input audio: record from the microphone or upload a file."
        )

    steps = int(pitch_shift)
    if steps != 0:
        # The shift is slow, so skip it entirely when it would be a no-op.
        source = torchaudio.functional.pitch_shift(source, sr, steps)

    source = torchaudio.functional.resample(source, sr, 16000)
    # Down-mix to mono and add a batch dimension: (1, 1, n_samples).
    source = source.mean(dim=0, keepdim=True).unsqueeze(0)

    with torch.inference_mode():
        # Extract speech units
        unit = hubert.units(source)
        unit_lengths = torch.LongTensor([unit.size(1)])

        # for multi-speaker inference
        sid = torch.LongTensor([speaker_id])

        # Synthesize audio
        synthesized = net_g.infer(
            unit,
            unit_lengths,
            sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        audio_out = synthesized[0][0, 0].float().numpy()

    # NOTE(review): output rate is hard-coded; presumably it matches the
    # sampling rate the synthesizer was trained with -- confirm against config.
    return (22050, audio_out)
# Introductory text shown through the Markdown component that occupies the
# first input slot (infer receives it as its unused `md` argument).
_INTRO_MARKDOWN = """
# SOVITS Any-to-Many VC | Overwatch 2
Upload any voice recording and turn it into a mangled approximation of any* Overwatch 2 Hero!
For a higher quality single-speaker model, check out my [soft-vc-widowmaker](https://huggingface.co/spaces/cjayic/soft-vc-widowmaker) space!
SOVITS doesn't really appear to adjust the pitch to the target speaker, so it helps to have your input voice at a similar pitch to the target voice.
I added a pitch shift option to preprocess the input voice, but it's slow and sometimes outright broken, use at your own risk.
( * up to Kiriko and without Bastion. Please forgive. )
"""

# Target voices. The dropdown uses type="index", so the position in this list
# is the value handed to infer as speaker_id.
_HERO_NAMES = [
    "Ana", "Ashe", "Baptiste", "Brigitte", "Cassidy", "Doomfist", "D.Va",
    "Echo", "Genji", "Hanzo", "Junker Queen", "Junkrat", "Kiriko", "Lúcio",
    "Mei", "Mercy", "Moira", "Orisa", "Pharah", "Reaper", "Reinhardt",
    "Roadhog", "Sigma", "Sojourn", "Soldier_ 76", "Sombra", "Symmetra",
    "Torbjörn", "Tracer", "Widowmaker", "Winston", "Zarya", "Zenyatta",
]

# Input components, in the same order as infer's parameters.
demo = gradio.Interface(
    fn=infer,
    inputs=[
        gradio.Markdown(_INTRO_MARKDOWN),
        gradio.Audio(label="Record Input Audio", source="microphone"),
        gradio.Audio(label="Upload Input Audio", type="filepath"),
        gradio.Dropdown(
            label="Target Voice",
            choices=_HERO_NAMES,
            type="index",
            value="Ana",
        ),
        gradio.Slider(
            label="Pitch Shift Input (+12 = up one octave, ⚠️ broken AF ⚠️)",
            minimum=-12.0,
            maximum=12.0,
            value=0,
            step=1,
        ),
        gradio.Slider(
            label="Length Factor (higher = slower speech)",
            minimum=0.1,
            maximum=2.0,
            value=1.0,
        ),
        gradio.Slider(
            label="Noise Scale (higher = more expressive and erratic)",
            minimum=0.0,
            maximum=2.0,
            value=.667,
        ),
        gradio.Slider(
            label="Noise Scale W (higher = more variation in cadence)",
            minimum=0.0,
            maximum=2.0,
            value=.8,
        ),
    ],
    outputs=[gradio.Audio(label="Audio as Target Voice")],
)

# Serve on all network interfaces. For a temporary public link during a local
# run, demo.launch(share=True) can be used instead.
demo.launch(server_name="0.0.0.0")