File size: 4,078 Bytes
f4b9544
 
0041af6
 
f4b9544
 
 
 
 
 
 
 
 
 
5be9b92
f4b9544
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0041af6
f4b9544
da41dee
 
f4b9544
 
 
 
 
 
 
 
 
da41dee
 
0041af6
da41dee
 
 
 
0041af6
f4b9544
 
5be9b92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4b9544
 
 
 
 
 
5be9b92
 
 
 
 
 
 
 
 
 
f4b9544
5be9b92
f4b9544
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import torch, torchaudio
import gradio as gr
import time

from hifigan.generator import HifiganGenerator

from acoustic import AcousticModel

from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

# Content encoder: soft HuBERT extracts speaker-independent speech units
# from 16 kHz audio (loaded from the bshall/hubert hub repo, CPU only).
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()

# Acoustic model: maps HuBERT units to a target-speaker mel spectrogram.
# NOTE(review): the (False, True) constructor flags come from the project's
# AcousticModel signature — presumably discrete/causal switches; confirm there.
acoustic = AcousticModel(False, True)

checkpoint = torch.load("models/acoustic-model-best.pt", map_location=torch.device('cpu'))

# Checkpoints saved under DataParallel carry a "module." prefix; strip it
# so the keys match the bare model before loading.
consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
acoustic.load_state_dict(checkpoint["acoustic-model"])
acoustic.eval()

#hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft").cpu()#.cuda()

# Vocoder: HiFi-GAN turns the mel spectrogram into a 16 kHz waveform.
# `checkpoint` is deliberately reused/overwritten here.
hifigan = HifiganGenerator()
checkpoint = torch.load("models/hifigan-model-best.pt", map_location=torch.device('cpu'))
consume_prefix_in_state_dict_if_present(checkpoint["generator"]["model"], "module.")
hifigan.load_state_dict(checkpoint["generator"]["model"])
hifigan.eval()

def run_conversion(audio_in):
    """Convert input audio to the target voice.

    Args:
        audio_in: Gradio audio tuple ``(sample_rate, samples)``, where
            ``samples`` is a numpy array — int16 PCM by default, or float.

    Returns:
        Tuple ``(16000, waveform)`` with ``waveform`` an int16 numpy array,
        matching the format ``gr.Audio`` expects for playback.
    """
    sr, source = audio_in

    # Gradio delivers raw int16 PCM, but the model pipeline works in float
    # [-1, 1] (the output is scaled back by 32767 below) — normalize integer
    # input; float input is assumed to already be normalized.
    if hasattr(source, "dtype") and source.dtype.kind in "iu":
        source = source / 32768.0

    source = torch.as_tensor(source, dtype=torch.float32)

    # Ensure a 2-D (samples, channels) layout even for mono 1-D input.
    if source.dim() == 1:
        source = source.unsqueeze(1)

    # (samples, channels) -> (channels, samples) for torchaudio.
    source = source.T

    # Resample to the 16 kHz rate the models were trained on.
    source = torchaudio.functional.resample(source, sr, 16000)

    # Downmix to mono, then add a batch dimension: (1, 1, samples).
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)

    with torch.inference_mode():
        time_start = time.perf_counter()

        # Extract speech units
        units = hubert.units(source)
        # Generate target spectrogram (acoustic model emits (batch, frames,
        # mels); HiFi-GAN wants (batch, mels, frames), hence the transpose).
        mel = acoustic.generate(units).transpose(1, 2)
        # Generate audio waveform
        target = hifigan(mel)

        # Scale float [-1, 1] back to int16 PCM for Gradio playback.
        result = target.squeeze().cpu().multiply(32767).to(torch.int16).numpy()

        time_end = time.perf_counter()
        time_elapsed = time_end - time_start

        print(f"Conversion finished in {time_elapsed} Seconds")

        return (16000, result)



# Build the Gradio UI: a header, two input tabs (file upload vs. microphone)
# that share one conversion handler, an output player, and cached examples.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
        """
        # Soft-VC | Widowmaker
        This is a [Soft-VC model](https://github.com/bshall/soft-vc) trained on Widowmaker from Overwatch, allowing the conversion of any voice to Widowmaker's voice. While lower quality (16kHz), it captures the character fairly well, imo. 
        
        For a multi-speaker model, check out my [sovits-overwatch2](https://huggingface.co/spaces/cjayic/sovits-overwatch2) space!
        
        The acoustic model has been trained for around 100k iterations, the HiFiGAN-Model for around 150k iterations. Quality could likely be improved by training the HiFiGAN further.
        """),
        # NOTE(review): the trailing comma above turns this statement into a
        # 1-tuple — harmless at runtime, but looks unintentional; confirm.
        with gr.Column():
            with gr.Tab("Upload Audio File"):
                with gr.Column():
                    # Upload-based input; `.style()` is the legacy Gradio 3.x
                    # styling API.
                    input_audio = gr.Audio(
                        label="Audio to be converted",
                    ).style(
                        container=False,
                    )
                    btn_upload = gr.Button("Widowify", variant="primary").style(full_width=True)
            with gr.Tab("Record Audio"):
                with gr.Column():
                    # Microphone-based input; same handler as the upload tab.
                    input_audio_record = gr.Audio(
                        label="Audio to be converted",
                        source="microphone"
                    ).style(
                        container=False,
                    )
                    btn_rec = gr.Button("Widowify", variant="primary").style(full_width=True)
    
        with gr.Row():
            # Shared output player for both tabs; read-only for the user.
            output_audio = gr.Audio(
                label="Converted Audio",
                elem_id="output_audio",
                interactive=False
            ).style(height="auto")

    # Both buttons route through the same conversion function, differing
    # only in which input component they read.
    btn_upload.click(run_conversion, [input_audio], output_audio)
    btn_rec.click(run_conversion, [input_audio_record], output_audio)

    # Clickable example clips; cache_examples=True precomputes their
    # conversions at startup so clicks return instantly.
    gr.Examples(
        ["examples/jermacraft.wav","examples/Mercy_0000000B0F5.wav","examples/weartie.wav","examples/gman_02.wav"], inputs=[input_audio],
        outputs=[output_audio],
        fn=run_conversion,
        cache_examples=True,
        run_on_click=True
    )

# Enable request queueing (long-running inference) and start the server.
demo.queue()
demo.launch()