Spaces:
Build error
Build error
layout update, examples, update models
Browse files
- app.py +42 -13
- examples/{meatgrinder.wav → Mercy_0000000B0F5.wav} +2 -2
- examples/gman_02.wav +3 -0
- examples/weartie.wav +3 -0
- models/acoustic-model-best.pt +3 -0
- models/hifigan-model-best.pt +1 -1
app.py
CHANGED
|
@@ -4,14 +4,13 @@ from hifigan.generator import HifiganGenerator
|
|
| 4 |
|
| 5 |
from acoustic import AcousticModel
|
| 6 |
|
| 7 |
-
#from hifigan.generator import HifiganGenerator
|
| 8 |
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
|
| 9 |
|
| 10 |
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()
|
| 11 |
|
| 12 |
acoustic = AcousticModel(False, True)
|
| 13 |
|
| 14 |
-
checkpoint = torch.load("models/acoustic-model-
|
| 15 |
|
| 16 |
consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
|
| 17 |
acoustic.load_state_dict(checkpoint["acoustic-model"])
|
|
@@ -27,7 +26,6 @@ hifigan.eval()
|
|
| 27 |
|
| 28 |
def run_conversion(audio_in):
|
| 29 |
sr, source = audio_in
|
| 30 |
-
|
| 31 |
source = torch.Tensor(source)
|
| 32 |
|
| 33 |
if source.dim() == 1:
|
|
@@ -56,21 +54,52 @@ def run_conversion(audio_in):
|
|
| 56 |
|
| 57 |
|
| 58 |
with gr.Blocks() as demo:
|
| 59 |
-
with gr.Column(
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
output_audio = gr.Audio(
|
| 68 |
label="Converted Audio",
|
| 69 |
elem_id="output_audio",
|
| 70 |
interactive=False
|
| 71 |
).style(height="auto")
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
|
|
|
| 76 |
demo.launch()
|
|
|
|
| 4 |
|
| 5 |
from acoustic import AcousticModel
|
| 6 |
|
|
|
|
| 7 |
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
|
| 8 |
|
| 9 |
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()
|
| 10 |
|
| 11 |
acoustic = AcousticModel(False, True)
|
| 12 |
|
| 13 |
+
checkpoint = torch.load("models/acoustic-model-best.pt", map_location=torch.device('cpu'))
|
| 14 |
|
| 15 |
consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
|
| 16 |
acoustic.load_state_dict(checkpoint["acoustic-model"])
|
|
|
|
| 26 |
|
| 27 |
def run_conversion(audio_in):
|
| 28 |
sr, source = audio_in
|
|
|
|
| 29 |
source = torch.Tensor(source)
|
| 30 |
|
| 31 |
if source.dim() == 1:
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
with gr.Blocks() as demo:
|
| 57 |
+
with gr.Column():
|
| 58 |
+
gr.Markdown(
|
| 59 |
+
"""
|
| 60 |
+
# Soft-VC | Widowmaker
|
| 61 |
+
This is a [Soft-VC model](https://github.com/bshall/soft-vc) trained on Widowmaker from Overwatch, allowing the conversion of any voice to Widowmaker's voice. While lower quality (16kHz), it captures the character fairly well, imo.
|
| 62 |
+
|
| 63 |
+
For a multi-speaker model, check out my [sovits-overwatch2](https://huggingface.co/spaces/cjayic/sovits-overwatch2) space!
|
| 64 |
+
|
| 65 |
+
The acoustic model has been trained for around 100k iterations, the HiFiGAN-Model for around 150k iterations. Quality could likely be improved by training the HiFiGAN further.
|
| 66 |
+
"""),
|
| 67 |
+
with gr.Column():
|
| 68 |
+
with gr.Tab("Upload Audio File"):
|
| 69 |
+
with gr.Column():
|
| 70 |
+
input_audio = gr.Audio(
|
| 71 |
+
label="Audio to be converted",
|
| 72 |
+
).style(
|
| 73 |
+
container=False,
|
| 74 |
+
)
|
| 75 |
+
btn_upload = gr.Button("Widowify", variant="primary").style(full_width=True)
|
| 76 |
+
with gr.Tab("Record Audio"):
|
| 77 |
+
with gr.Column():
|
| 78 |
+
input_audio_record = gr.Audio(
|
| 79 |
+
label="Audio to be converted",
|
| 80 |
+
source="microphone"
|
| 81 |
+
).style(
|
| 82 |
+
container=False,
|
| 83 |
+
)
|
| 84 |
+
btn_rec = gr.Button("Widowify", variant="primary").style(full_width=True)
|
| 85 |
+
|
| 86 |
+
with gr.Row():
|
| 87 |
output_audio = gr.Audio(
|
| 88 |
label="Converted Audio",
|
| 89 |
elem_id="output_audio",
|
| 90 |
interactive=False
|
| 91 |
).style(height="auto")
|
| 92 |
|
| 93 |
+
btn_upload.click(run_conversion, [input_audio], output_audio)
|
| 94 |
+
btn_rec.click(run_conversion, [input_audio_record], output_audio)
|
| 95 |
+
|
| 96 |
+
gr.Examples(
|
| 97 |
+
["examples/jermacraft.wav","examples/Mercy_0000000B0F5.wav","examples/weartie.wav","examples/gman_02.wav"], inputs=[input_audio],
|
| 98 |
+
outputs=[output_audio],
|
| 99 |
+
fn=run_conversion,
|
| 100 |
+
cache_examples=True,
|
| 101 |
+
run_on_click=True
|
| 102 |
+
)
|
| 103 |
|
| 104 |
+
demo.queue()
|
| 105 |
demo.launch()
|
examples/{meatgrinder.wav → Mercy_0000000B0F5.wav}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d82da53b88f1169a4e12eec15506d8b85029293b2a0db42e28e4a7a31a4914a
|
| 3 |
+
size 162958
|
examples/gman_02.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cffbf4dda570a1c1c7a92e807f5a3f1b1418d77c5e4fad8da590fbc23c4ff07
|
| 3 |
+
size 1850254
|
examples/weartie.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d843afa411d7e975299397e2435e892b277abda45cc64f0771da66cfb3490514
|
| 3 |
+
size 110982
|
models/acoustic-model-best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdd7921e3b44db3204008cb2d0428c5917c924dcd1f7b0285ab7e1d48e51e24c
|
| 3 |
+
size 225997291
|
models/hifigan-model-best.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1021686329
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d71c85ed4c2ae2285222330a467c38be2b9a33d29f225d8c1568ee558d98694
|
| 3 |
size 1021686329
|