init

Files changed:
- .gitattributes +1 -0
- LICENSE +21 -0
- README.md +4 -4
- acoustic/__init__.py +1 -0
- acoustic/dataset.py +55 -0
- acoustic/model.py +168 -0
- acoustic/utils.py +99 -0
- app.py +76 -0
- examples/jermacraft.wav +3 -0
- examples/meatgrinder.wav +3 -0
- hifigan/__init__.py +1 -0
- hifigan/dataset.py +126 -0
- hifigan/discriminator.py +262 -0
- hifigan/generator.py +282 -0
- hifigan/utils.py +84 -0
- models/acoustic-model-100000.pt +3 -0
- models/hifigan-model-best.pt +3 -0
- requirements.txt +3 -0
.gitattributes CHANGED
```diff
@@ -28,6 +28,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
```
LICENSE ADDED
```text
MIT License

Copyright (c) 2021 Benjamin van Niekerk

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```
README.md CHANGED
```diff
@@ -1,8 +1,8 @@
 ---
-title: Soft
-emoji:
-colorFrom:
-colorTo:
+title: Soft-VC Widowmaker
+emoji: 🕷️
+colorFrom: black
+colorTo: purple
 sdk: gradio
 sdk_version: 3.15.0
 app_file: app.py
```
acoustic/__init__.py ADDED
```python
from .model import AcousticModel, hubert_discrete, hubert_soft
```
acoustic/dataset.py ADDED
```python
from pathlib import Path
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence


class MelDataset(Dataset):
    def __init__(self, root: Path, train: bool = True, discrete: bool = False):
        self.discrete = discrete
        self.mels_dir = root / "mels"
        self.units_dir = root / "discrete" if discrete else root / "soft"

        pattern = "train/**/*.npy" if train else "dev/**/*.npy"
        self.metadata = [
            path.relative_to(self.mels_dir).with_suffix("")
            for path in self.mels_dir.rglob(pattern)
        ]

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        path = self.metadata[index]
        mel_path = self.mels_dir / path
        units_path = self.units_dir / path

        mel = np.load(mel_path.with_suffix(".npy")).T
        units = np.load(units_path.with_suffix(".npy"))

        length = 2 * units.shape[0]

        mel = torch.from_numpy(mel[:length, :])
        mel = F.pad(mel, (0, 0, 1, 0))
        units = torch.from_numpy(units)
        if self.discrete:
            units = units.long()
        return mel, units

    def pad_collate(self, batch):
        mels, units = zip(*batch)

        mels, units = list(mels), list(units)

        mels_lengths = torch.tensor([x.size(0) - 1 for x in mels])
        units_lengths = torch.tensor([x.size(0) for x in units])

        mels = pad_sequence(mels, batch_first=True)
        units = pad_sequence(
            units, batch_first=True, padding_value=100 if self.discrete else 0
        )

        return mels, mels_lengths, units, units_lengths
```
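For context, a minimal sketch of how `MelDataset` might be wired into a `DataLoader`; the `dataset` root path and batch size are illustrative assumptions, and the directory must contain the `mels/` and `soft/` (or `discrete/`) trees the class expects.

```python
# Minimal sketch: batching MelDataset with its pad_collate method.
# The root path "dataset" is hypothetical; it should contain mels/ and soft/
# (or discrete/) subdirectories with matching train/dev .npy files.
from pathlib import Path
from torch.utils.data import DataLoader

from acoustic.dataset import MelDataset

dataset = MelDataset(Path("dataset"), train=True, discrete=False)
loader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=dataset.pad_collate,  # pads mels/units to the batch maximum
)

mels, mels_lengths, units, units_lengths = next(iter(loader))
print(mels.shape)   # (batch, max_mel_frames, 128)
print(units.shape)  # (batch, max_unit_frames, 256) for soft units
```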
acoustic/model.py ADDED
```python
import torch
import torch.nn as nn
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

URLS = {
    "hubert-discrete": "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-discrete-d49e1c77.pt",
    "hubert-soft": "https://github.com/bshall/acoustic-model/releases/download/v0.1/hubert-soft-0321fd7e.pt",
}


class AcousticModel(nn.Module):
    def __init__(self, discrete: bool = False, upsample: bool = True):
        super().__init__()
        self.encoder = Encoder(discrete, upsample)
        self.decoder = Decoder()

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        x = self.encoder(x)
        return self.decoder(x, mels)

    @torch.inference_mode()
    def generate(self, x: torch.Tensor) -> torch.Tensor:
        x = self.encoder(x)
        return self.decoder.generate(x)


class Encoder(nn.Module):
    def __init__(self, discrete: bool = False, upsample: bool = True):
        super().__init__()
        self.embedding = nn.Embedding(100 + 1, 256) if discrete else None
        self.prenet = PreNet(256, 256, 256)
        self.convs = nn.Sequential(
            nn.Conv1d(256, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
            nn.ConvTranspose1d(512, 512, 4, 2, 1) if upsample else nn.Identity(),
            nn.Conv1d(512, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
            nn.Conv1d(512, 512, 5, 1, 2),
            nn.ReLU(),
            nn.InstanceNorm1d(512),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.embedding is not None:
            x = self.embedding(x)
        x = self.prenet(x)
        x = self.convs(x.transpose(1, 2))
        return x.transpose(1, 2)


class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.prenet = PreNet(128, 256, 256)
        self.lstm1 = nn.LSTM(512 + 256, 768, batch_first=True)
        self.lstm2 = nn.LSTM(768, 768, batch_first=True)
        self.lstm3 = nn.LSTM(768, 768, batch_first=True)
        self.proj = nn.Linear(768, 128, bias=False)

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        mels = self.prenet(mels)
        x, _ = self.lstm1(torch.cat((x, mels), dim=-1))
        res = x
        x, _ = self.lstm2(x)
        x = res + x
        res = x
        x, _ = self.lstm3(x)
        x = res + x
        return self.proj(x)

    @torch.inference_mode()
    def generate(self, xs: torch.Tensor) -> torch.Tensor:
        m = torch.zeros(xs.size(0), 128, device=xs.device)
        h1 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        c1 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        h2 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        c2 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        h3 = torch.zeros(1, xs.size(0), 768, device=xs.device)
        c3 = torch.zeros(1, xs.size(0), 768, device=xs.device)

        mel = []
        for x in torch.unbind(xs, dim=1):
            m = self.prenet(m)
            x = torch.cat((x, m), dim=1).unsqueeze(1)
            x1, (h1, c1) = self.lstm1(x, (h1, c1))
            x2, (h2, c2) = self.lstm2(x1, (h2, c2))
            x = x1 + x2
            x3, (h3, c3) = self.lstm3(x, (h3, c3))
            x = x + x3
            m = self.proj(x).squeeze(1)
            mel.append(m)
        return torch.stack(mel, dim=1)


class PreNet(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        dropout: float = 0.5,
    ):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_size),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


def _acoustic(
    name: str,
    discrete: bool,
    upsample: bool,
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    acoustic = AcousticModel(discrete, upsample)
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(URLS[name], progress=progress)
        consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
        acoustic.load_state_dict(checkpoint["acoustic-model"])
        acoustic.eval()
    return acoustic


def hubert_discrete(
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    r"""HuBERT-Discrete acoustic model from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        pretrained (bool): load pretrained weights into the model
        progress (bool): show progress bar when downloading model
    """
    return _acoustic(
        "hubert-discrete",
        discrete=True,
        upsample=True,
        pretrained=pretrained,
        progress=progress,
    )


def hubert_soft(
    pretrained: bool = True,
    progress: bool = True,
) -> AcousticModel:
    r"""HuBERT-Soft acoustic model from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
    Args:
        pretrained (bool): load pretrained weights into the model
        progress (bool): show progress bar when downloading model
    """
    return _acoustic(
        "hubert-soft",
        discrete=False,
        upsample=True,
        pretrained=pretrained,
        progress=progress,
    )
```
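To make the shapes concrete, a minimal sketch of running the exported `hubert_soft` acoustic model on stand-in units; the random tensor is a placeholder for real HuBERT-soft features, and `pretrained=True` assumes network access to the release URL above.

```python
# Minimal sketch: generating a mel-spectrogram from soft speech units.
import torch
from acoustic import hubert_soft

acoustic = hubert_soft(pretrained=True)

# Soft units have shape (batch, frames, 256); the encoder upsamples 2x,
# so the decoder emits 2 mel frames per unit frame.
units = torch.randn(1, 100, 256)  # stand-in for real HuBERT-soft units
mel = acoustic.generate(units).transpose(1, 2)
print(mel.shape)  # torch.Size([1, 128, 200])
```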
acoustic/utils.py ADDED
```python
import torch
import torch.nn.functional as F
import matplotlib

import torchaudio.transforms as transforms

matplotlib.use("Agg")
import matplotlib.pylab as plt


class Metric:
    def __init__(self):
        self.steps = 0
        self.value = 0

    def update(self, value):
        self.steps += 1
        self.value += (value - self.value) / self.steps
        return self.value

    def reset(self):
        self.steps = 0
        self.value = 0


class LogMelSpectrogram(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.melspctrogram = transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=1024,
            win_length=1024,
            hop_length=160,
            center=False,
            power=1.0,
            norm="slaney",
            onesided=True,
            n_mels=128,
            mel_scale="slaney",
        )

    def forward(self, wav):
        padding = (1024 - 160) // 2
        wav = F.pad(wav, (padding, padding), "reflect")
        mel = self.melspctrogram(wav)
        logmel = torch.log(torch.clamp(mel, min=1e-5))
        return logmel


def save_checkpoint(
    checkpoint_dir,
    acoustic,
    optimizer,
    step,
    loss,
    best,
    logger,
):
    state = {
        "acoustic-model": acoustic.state_dict(),
        "optimizer": optimizer.state_dict(),
        "step": step,
        "loss": loss,
    }
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / f"model-{step}.pt"
    torch.save(state, checkpoint_path)
    if best:
        best_path = checkpoint_dir / "model-best.pt"
        torch.save(state, best_path)
    logger.info(f"Saved checkpoint: {checkpoint_path.stem}")


def load_checkpoint(
    load_path,
    acoustic,
    optimizer,
    rank,
    logger,
):
    logger.info(f"Loading checkpoint from {load_path}")
    checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
    acoustic.load_state_dict(checkpoint["acoustic-model"])
    if "optimizer" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer"])
    step = checkpoint.get("step", 0)
    loss = checkpoint.get("loss", float("inf"))
    return step, loss


def plot_spectrogram(spectrogram):
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig
```
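A short sketch of the mel extraction these utilities implement; `example.wav` is a hypothetical 16 kHz mono file.

```python
# Minimal sketch: computing the 128-bin log-mel used throughout this Space.
import torchaudio
from acoustic.utils import LogMelSpectrogram

logmel = LogMelSpectrogram()
wav, sr = torchaudio.load("example.wav")  # assumes a 16 kHz file
mel = logmel(wav)                         # (channels, 128, ~samples / 160)
```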
app.py ADDED
```python
import torch
import torchaudio
import gradio as gr

from acoustic import AcousticModel
from hifigan.generator import HifiganGenerator
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cpu()

acoustic = AcousticModel(False, True)
checkpoint = torch.load("models/acoustic-model-100000.pt", map_location=torch.device("cpu"))
consume_prefix_in_state_dict_if_present(checkpoint["acoustic-model"], "module.")
acoustic.load_state_dict(checkpoint["acoustic-model"])
acoustic.eval()

# hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft").cpu()  # .cuda()
hifigan = HifiganGenerator()
checkpoint = torch.load("models/hifigan-model-best.pt", map_location=torch.device("cpu"))
consume_prefix_in_state_dict_if_present(checkpoint["generator"]["model"], "module.")
hifigan.load_state_dict(checkpoint["generator"]["model"])
hifigan.eval()


def run_conversion(audio_in):
    sr, source = audio_in

    # gradio typically supplies int16 samples; torch.Tensor casts them to
    # float without rescaling
    source = torch.Tensor(source)

    if source.dim() == 1:
        source = source.unsqueeze(1)

    source = source.T

    # resample to 16 kHz
    source = torchaudio.functional.resample(source, sr, 16000)

    # mix down to mono
    source = torch.mean(source, dim=0).unsqueeze(0)
    source = source.unsqueeze(0)

    with torch.inference_mode():
        # Extract speech units
        units = hubert.units(source)
        # Generate target spectrogram
        mel = acoustic.generate(units).transpose(1, 2)
        # Generate audio waveform
        target = hifigan(mel)

    result = target.squeeze().cpu().multiply(32767).to(torch.int16).numpy()

    return (16000, result)


with gr.Blocks() as demo:
    with gr.Column(variant="panel"):
        with gr.Row(variant="compact"):
            input_audio = gr.Audio(
                label="Audio to be converted",
            ).style(
                container=False,
            )
            btn = gr.Button("Widowify").style(full_width=False)
        output_audio = gr.Audio(
            label="Converted Audio",
            elem_id="output_audio",
            interactive=False
        ).style(height="auto")

    btn.click(run_conversion, input_audio, output_audio)
    gr.Examples(["examples/jermacraft.wav", "examples/meatgrinder.wav"], inputs=[input_audio])

demo.launch()
```
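For debugging outside the UI, a sketch of calling `run_conversion` directly on a file; it assumes the Gradio tuple convention `(sample_rate, samples)` used above, and the `.wav` paths are hypothetical.

```python
# Minimal sketch: exercising run_conversion without the UI, e.g. appended to
# app.py before demo.launch(). "input.wav" and "output.wav" are hypothetical.
wav, sr = torchaudio.load("input.wav")                 # (channels, samples), float
sr_out, converted = run_conversion((sr, wav.T.numpy()))
out = torch.from_numpy(converted).unsqueeze(0).float() / 32767
torchaudio.save("output.wav", out, sr_out)
```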
examples/jermacraft.wav ADDED (Git LFS pointer)
```text
version https://git-lfs.github.com/spec/v1
oid sha256:4a71412c1b685bf3e1e5bab0685e08fd88a51b18e682613a548ab9e8ca68835c
size 450510
```

examples/meatgrinder.wav ADDED (Git LFS pointer)
```text
version https://git-lfs.github.com/spec/v1
oid sha256:a29324c4e5909f7eff663b3f3a17100fdf36fef1c6707ba16b4175bb21b3cb84
size 1460740
```
hifigan/__init__.py ADDED
```python
from .generator import hifigan, hifigan_hubert_discrete, hifigan_hubert_soft
```
hifigan/dataset.py ADDED
```python
from pathlib import Path
import math
import random
import numpy as np
import torch
import torch.nn.functional as F

from torch.utils.data import Dataset

import torchaudio
import torchaudio.transforms as transforms


class LogMelSpectrogram(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.melspctrogram = transforms.MelSpectrogram(
            sample_rate=16000,
            n_fft=1024,
            win_length=1024,
            hop_length=160,
            center=False,
            power=1.0,
            norm="slaney",
            onesided=True,
            n_mels=128,
            mel_scale="slaney",
        )

    def forward(self, wav):
        wav = F.pad(wav, ((1024 - 160) // 2, (1024 - 160) // 2), "reflect")
        mel = self.melspctrogram(wav)
        logmel = torch.log(torch.clamp(mel, min=1e-5))
        return logmel


class MelDataset(Dataset):
    def __init__(
        self,
        root: Path,
        segment_length: int,
        sample_rate: int,
        hop_length: int,
        train: bool = True,
        finetune: bool = False,
    ):
        self.wavs_dir = root / "wavs"
        self.mels_dir = root / "mels"
        self.data_dir = self.wavs_dir if not finetune else self.mels_dir

        self.segment_length = segment_length
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.train = train
        self.finetune = finetune

        suffix = ".wav" if not finetune else ".npy"
        pattern = f"train/**/*{suffix}" if train else f"dev/**/*{suffix}"

        self.metadata = [
            path.relative_to(self.data_dir).with_suffix("")
            for path in self.data_dir.rglob(pattern)
        ]

        self.logmel = LogMelSpectrogram()

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, index):
        path = self.metadata[index]
        wav_path = self.wavs_dir / path

        info = torchaudio.info(wav_path.with_suffix(".wav"))
        if info.sample_rate != self.sample_rate:
            raise ValueError(
                f"Sample rate {info.sample_rate} doesn't match target of {self.sample_rate}"
            )

        if self.finetune:
            mel_path = self.mels_dir / path
            src_logmel = torch.from_numpy(np.load(mel_path.with_suffix(".npy")))
            src_logmel = src_logmel.unsqueeze(0)

            mel_frames_per_segment = math.ceil(self.segment_length / self.hop_length)
            mel_diff = src_logmel.size(-1) - mel_frames_per_segment if self.train else 0
            mel_offset = random.randint(0, max(mel_diff, 0))

            frame_offset = self.hop_length * mel_offset
        else:
            frame_diff = info.num_frames - self.segment_length
            frame_offset = random.randint(0, max(frame_diff, 0))

        wav, _ = torchaudio.load(
            wav_path.with_suffix(".wav"),
            frame_offset=frame_offset if self.train else 0,
            num_frames=self.segment_length if self.train else -1,
        )

        if wav.size(-1) < self.segment_length:
            wav = F.pad(wav, (0, self.segment_length - wav.size(-1)))

        if not self.finetune and self.train:
            gain = random.random() * (0.99 - 0.4) + 0.4
            flip = -1 if random.random() > 0.5 else 1
            wav = flip * gain * wav / wav.abs().max()

        tgt_logmel = self.logmel(wav.unsqueeze(0)).squeeze(0)

        if self.finetune:
            if self.train:
                src_logmel = src_logmel[
                    :, :, mel_offset : mel_offset + mel_frames_per_segment
                ]

            if src_logmel.size(-1) < mel_frames_per_segment:
                src_logmel = F.pad(
                    src_logmel,
                    (0, mel_frames_per_segment - src_logmel.size(-1)),
                    "constant",
                    src_logmel.min(),
                )
        else:
            src_logmel = tgt_logmel.clone()

        return wav, src_logmel, tgt_logmel
```
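A sketch of how this dataset might be instantiated for vocoder training; the `segment_length` value and root path are assumptions, not settings taken from this repo (the hop length matches the 160-sample hop of `LogMelSpectrogram` above).

```python
# Minimal sketch: configuring the vocoder dataset (illustrative values).
from pathlib import Path

from hifigan.dataset import MelDataset

dataset = MelDataset(
    root=Path("dataset"),   # hypothetical; needs wavs/ (and mels/ for finetune)
    segment_length=8192,    # samples per training clip (assumption)
    sample_rate=16000,
    hop_length=160,
    train=True,
    finetune=False,         # True trains on precomputed mels/ instead of wavs/
)
wav, src_logmel, tgt_logmel = dataset[0]
```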
hifigan/discriminator.py ADDED
```python
# adapted from https://github.com/jik876/hifi-gan/blob/master/models.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, List

from hifigan.utils import get_padding


LRELU_SLOPE = 0.1


class PeriodDiscriminator(torch.nn.Module):
    """HiFiGAN Period Discriminator"""

    def __init__(
        self,
        period: int,
        kernel_size: int = 5,
        stride: int = 3,
        use_spectral_norm: bool = False,
    ) -> None:
        super().__init__()
        self.period = period
        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
        channels = [1, 32, 128, 512, 1024]
        self.convs = nn.ModuleList(
            [
                norm_f(
                    nn.Conv2d(
                        in_ch,
                        out_ch,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(5, 1), 0),
                    )
                )
                for in_ch, out_ch in zip(channels[:-1], channels[1:])
            ]
            + [norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0)))]
        )
        self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """
        Args:
            x (Tensor): input waveform.
        Returns:
            [Tensor]: discriminator scores per sample in the batch.
            [List[Tensor]]: list of features from each convolutional layer.
        """
        feat = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            feat.append(x)
        x = self.conv_post(x)
        feat.append(x)
        x = torch.flatten(x, 1, -1)

        return x, feat


class MultiPeriodDiscriminator(torch.nn.Module):
    """HiFiGAN Multi-Period Discriminator (MPD)"""

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                PeriodDiscriminator(2),
                PeriodDiscriminator(3),
                PeriodDiscriminator(5),
                PeriodDiscriminator(7),
                PeriodDiscriminator(11),
            ]
        )

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]:
        """
        Args:
            x (Tensor): input waveform.
        Returns:
            [List[Tensor]]: list of scores from each discriminator.
            [List[List[Tensor]]]: list of features from each discriminator's convolutional layers.
        """
        scores = []
        feats = []
        for d in self.discriminators:
            score, feat = d(x)
            scores.append(score)
            feats.append(feat)
        return scores, feats


class ScaleDiscriminator(torch.nn.Module):
    """HiFiGAN Scale Discriminator."""

    def __init__(self, use_spectral_norm: bool = False) -> None:
        super().__init__()
        norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
        self.convs = nn.ModuleList(
            [
                norm_f(nn.Conv1d(1, 128, 15, 1, padding=7)),
                norm_f(nn.Conv1d(128, 128, 41, 2, groups=4, padding=20)),
                norm_f(nn.Conv1d(128, 256, 41, 2, groups=16, padding=20)),
                norm_f(nn.Conv1d(256, 512, 41, 4, groups=16, padding=20)),
                norm_f(nn.Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
                norm_f(nn.Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
                norm_f(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(nn.Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """
        Args:
            x (Tensor): input waveform.
        Returns:
            Tensor: discriminator scores.
            List[Tensor]: list of features from the convolutional layers.
        """
        feat = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            feat.append(x)
        x = self.conv_post(x)
        feat.append(x)
        x = torch.flatten(x, 1, -1)
        return x, feat


class MultiScaleDiscriminator(torch.nn.Module):
    """HiFiGAN Multi-Scale Discriminator."""

    def __init__(self):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                ScaleDiscriminator(use_spectral_norm=True),
                ScaleDiscriminator(),
                ScaleDiscriminator(),
            ]
        )
        self.meanpools = nn.ModuleList(
            [nn.AvgPool1d(4, 2, padding=2), nn.AvgPool1d(4, 2, padding=2)]
        )

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]:
        """
        Args:
            x (Tensor): input waveform.
        Returns:
            List[Tensor]: discriminator scores.
            List[List[Tensor]]: list of features from each discriminator's convolutional layers.
        """
        scores = []
        feats = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                x = self.meanpools[i - 1](x)
            score, feat = d(x)
            scores.append(score)
            feats.append(feat)
        return scores, feats


class HifiganDiscriminator(nn.Module):
    """HiFiGAN discriminator"""

    def __init__(self):
        super().__init__()
        self.mpd = MultiPeriodDiscriminator()
        self.msd = MultiScaleDiscriminator()

    def forward(
        self, x: torch.Tensor
    ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]:
        """
        Args:
            x (Tensor): input waveform.
        Returns:
            List[Tensor]: discriminator scores.
            List[List[Tensor]]: list of features from each discriminator's convolutional layers.
        """
        scores, feats = self.mpd(x)
        scores_, feats_ = self.msd(x)
        return scores + scores_, feats + feats_


def feature_loss(
    features_real: List[List[torch.Tensor]], features_generate: List[List[torch.Tensor]]
) -> torch.Tensor:
    loss = 0
    for r, g in zip(features_real, features_generate):
        for rl, gl in zip(r, g):
            loss += torch.mean(torch.abs(rl - gl))
    return loss * 2


def discriminator_loss(real, generated):
    loss = 0
    real_losses = []
    generated_losses = []
    for r, g in zip(real, generated):
        r_loss = torch.mean((1 - r) ** 2)
        g_loss = torch.mean(g ** 2)
        loss += r_loss + g_loss
        real_losses.append(r_loss.item())
        generated_losses.append(g_loss.item())

    return loss, real_losses, generated_losses


def generator_loss(discriminator_outputs):
    loss = 0
    generator_losses = []
    for x in discriminator_outputs:
        l = torch.mean((1 - x) ** 2)
        generator_losses.append(l)
        loss += l

    return loss, generator_losses
```
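To show how these pieces compose, a minimal sketch of one least-squares GAN step using the discriminator and loss helpers above; the random waveforms are stand-ins for real and generated audio.

```python
# Minimal sketch: discriminator and generator losses with the modules above.
import torch
from hifigan.discriminator import (
    HifiganDiscriminator,
    discriminator_loss,
    feature_loss,
    generator_loss,
)

disc = HifiganDiscriminator()
real = torch.randn(2, 1, 8192)  # (batch, 1, samples)
fake = torch.randn(2, 1, 8192)  # stand-in for generator output

# Discriminator step: real clips should score 1, generated clips 0.
scores_real, _ = disc(real)
scores_fake, _ = disc(fake.detach())
d_loss, _, _ = discriminator_loss(scores_real, scores_fake)

# Generator step: adversarial loss plus feature-matching loss.
scores_real, feats_real = disc(real)
scores_fake, feats_fake = disc(fake)
g_adv, _ = generator_loss(scores_fake)
g_fm = feature_loss(feats_real, feats_fake)
```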
hifigan/generator.py ADDED
```python
# adapted from https://github.com/jik876/hifi-gan/blob/master/models.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import remove_weight_norm, weight_norm
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
from typing import Tuple

from hifigan.utils import get_padding


URLS = {
    "hifigan": "https://github.com/bshall/hifigan/releases/download/v0.1/hifigan-67926ec6.pt",
    "hifigan-hubert-soft": "https://github.com/bshall/hifigan/releases/download/v0.1/hifigan-hubert-soft-65f03469.pt",
    "hifigan-hubert-discrete": "https://github.com/bshall/hifigan/releases/download/v0.1/hifigan-hubert-discrete-bbad3043.pt",
}

LRELU_SLOPE = 0.1


class HifiganGenerator(torch.nn.Module):
    def __init__(
        self,
        in_channels: int = 128,
        resblock_dilation_sizes: Tuple[Tuple[int, ...], ...] = (
            (1, 3, 5),
            (1, 3, 5),
            (1, 3, 5),
        ),
        resblock_kernel_sizes: Tuple[int, ...] = (3, 7, 11),
        upsample_kernel_sizes: Tuple[int, ...] = (20, 8, 4, 4),
        upsample_initial_channel: int = 512,
        upsample_factors: Tuple[int, ...] = (10, 4, 2, 2),
        inference_padding: int = 5,
        sample_rate: int = 16000,
    ) -> None:
        r"""HiFiGAN Generator
        Args:
            in_channels (int): number of input channels.
            resblock_dilation_sizes (Tuple[Tuple[int, ...], ...]): list of dilation values in each layer of a `ResBlock`.
            resblock_kernel_sizes (Tuple[int, ...]): list of kernel sizes for each `ResBlock`.
            upsample_kernel_sizes (Tuple[int, ...]): list of kernel sizes for each transposed convolution.
            upsample_initial_channel (int): number of channels for the first upsampling layer. This is divided by 2
                for each consecutive upsampling layer.
            upsample_factors (Tuple[int, ...]): upsampling factors (stride) for each upsampling layer.
            inference_padding (int): constant padding applied to the input at inference time.
            sample_rate (int): sample rate of the generated audio.
        """
        super().__init__()
        self.inference_padding = inference_padding
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_factors)
        self.sample_rate = sample_rate
        # initial upsampling layers
        self.conv_pre = weight_norm(
            nn.Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)
        )

        # upsampling layers
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_factors, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    nn.ConvTranspose1d(
                        upsample_initial_channel // (2 ** i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
        # MRF blocks
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock1(ch, k, d))
        # post convolution layer
        self.conv_post = weight_norm(nn.Conv1d(ch, 1, 7, 1, padding=3))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        o = self.conv_pre(x)
        for i in range(self.num_upsamples):
            o = F.leaky_relu(o, LRELU_SLOPE)
            o = self.ups[i](o)
            z_sum = None
            for j in range(self.num_kernels):
                if z_sum is None:
                    z_sum = self.resblocks[i * self.num_kernels + j](o)
                else:
                    z_sum += self.resblocks[i * self.num_kernels + j](o)
            o = z_sum / self.num_kernels
        o = F.leaky_relu(o)
        o = self.conv_post(o)
        o = torch.tanh(o)
        return o

    @torch.no_grad()
    def generate(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]:
        x = F.pad(x, (self.inference_padding, self.inference_padding), "replicate")
        return self(x), self.sample_rate

    def remove_weight_norm(self):
        print("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class ResBlock1(torch.nn.Module):
    def __init__(
        self, channels: int, kernel_size: int = 3, dilation: Tuple[int, ...] = (1, 3, 5)
    ) -> None:
        super().__init__()
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=d,
                        padding=get_padding(kernel_size, d),
                    )
                )
                for d in dilation
            ]
        )

        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                )
                for _ in dilation
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(
        self, channels: int, kernel_size: int = 3, dilation: Tuple[int, ...] = (1, 3)
    ) -> None:
        super().__init__()
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=d,
                        padding=get_padding(kernel_size, d),
                    )
                )
                for d in dilation
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


def _hifigan(
    name: str,
    pretrained: bool = True,
    progress: bool = True,
    map_location=None,
) -> HifiganGenerator:
    hifigan = HifiganGenerator()
    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            URLS[name], map_location=map_location, progress=progress
        )
        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
        hifigan.load_state_dict(checkpoint)
        hifigan.eval()
        hifigan.remove_weight_norm()
    return hifigan


def hifigan(
    pretrained: bool = True, progress: bool = True, map_location=None
) -> HifiganGenerator:
    return _hifigan("hifigan", pretrained, progress, map_location)


def hifigan_hubert_soft(
    pretrained: bool = True, progress: bool = True, map_location=None
) -> HifiganGenerator:
    return _hifigan("hifigan-hubert-soft", pretrained, progress, map_location)


def hifigan_hubert_discrete(
    pretrained: bool = True, progress: bool = True, map_location=None
) -> HifiganGenerator:
    return _hifigan("hifigan-hubert-discrete", pretrained, progress, map_location)
```
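A minimal sketch of vocoding with the generator; an untrained model and a random mel are used purely to illustrate shapes (each mel frame maps to 10 * 4 * 2 * 2 = 160 output samples).

```python
# Minimal sketch: vocoding a mel-spectrogram with the generator.
import torch
from hifigan.generator import HifiganGenerator

vocoder = HifiganGenerator()
vocoder.eval()

mel = torch.randn(1, 128, 200)   # (batch, n_mels, frames); stand-in input
wav, sr = vocoder.generate(mel)  # pads 5 frames each side, then upsamples 160x
print(wav.shape, sr)             # torch.Size([1, 1, 33600]) 16000
```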
hifigan/utils.py ADDED
```python
import torch
import matplotlib

matplotlib.use("Agg")
import matplotlib.pylab as plt


def get_padding(k, d):
    return int((k * d - d) / 2)


def plot_spectrogram(spectrogram):
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def save_checkpoint(
    checkpoint_dir,
    generator,
    discriminator,
    optimizer_generator,
    optimizer_discriminator,
    scheduler_generator,
    scheduler_discriminator,
    step,
    loss,
    best,
    logger,
):
    state = {
        "generator": {
            "model": generator.state_dict(),
            "optimizer": optimizer_generator.state_dict(),
            "scheduler": scheduler_generator.state_dict(),
        },
        "discriminator": {
            "model": discriminator.state_dict(),
            "optimizer": optimizer_discriminator.state_dict(),
            "scheduler": scheduler_discriminator.state_dict(),
        },
        "step": step,
        "loss": loss,
    }
    checkpoint_dir.mkdir(exist_ok=True, parents=True)
    checkpoint_path = checkpoint_dir / f"model-{step}.pt"
    torch.save(state, checkpoint_path)
    if best:
        best_path = checkpoint_dir / "model-best.pt"
        torch.save(state, best_path)
    logger.info(f"Saved checkpoint: {checkpoint_path.stem}")


def load_checkpoint(
    load_path,
    generator,
    discriminator,
    optimizer_generator,
    optimizer_discriminator,
    scheduler_generator,
    scheduler_discriminator,
    rank,
    logger,
    finetune=False,
):
    logger.info(f"Loading checkpoint from {load_path}")
    checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"})
    generator.load_state_dict(checkpoint["generator"]["model"])
    discriminator.load_state_dict(checkpoint["discriminator"]["model"])
    if not finetune:
        optimizer_generator.load_state_dict(checkpoint["generator"]["optimizer"])
        scheduler_generator.load_state_dict(checkpoint["generator"]["scheduler"])
        optimizer_discriminator.load_state_dict(
            checkpoint["discriminator"]["optimizer"]
        )
        scheduler_discriminator.load_state_dict(
            checkpoint["discriminator"]["scheduler"]
        )
    return checkpoint["step"], checkpoint["loss"]
```
models/acoustic-model-100000.pt ADDED (Git LFS pointer)
```text
version https://git-lfs.github.com/spec/v1
oid sha256:bab1ca079f6d3454cbe20be736c2fea003ddb8425acf5a451bc0b8e8975d6d99
size 225997291
```

models/hifigan-model-best.pt ADDED (Git LFS pointer)
```text
version https://git-lfs.github.com/spec/v1
oid sha256:e2c4c04b6a829854ccd9eb5eac3b0f7a434fc1e94809e6662e2be79e6f930c49
size 1021686329
```
requirements.txt ADDED
```text
torch
torchaudio
gradio
```