File size: 3,172 Bytes
4689c2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
from typing import Literal, Optional

import torch
import torch.nn as nn
from mmgp import offload
from shared.utils import files_locator as fl

from ..autoencoder.vae import VAE, get_my_vae
from ..bigvgan import BigVGAN
from ..bigvgan_v2.bigvgan import BigVGAN as BigVGANv2
from ...model.utils.distributions import DiagonalGaussianDistribution

_BIGVGAN_V2_FOLDER = "bigvgan_v2_44khz_128band_512x"


def _resolve_bigvgan_v2_files():
    weights_path = fl.locate_file(
        os.path.join(_BIGVGAN_V2_FOLDER, "bigvgan_generator.pt"), error_if_none=False
    )
    config_path = fl.locate_file(
        os.path.join(_BIGVGAN_V2_FOLDER, "config.json"), error_if_none=False
    )
    if weights_path is None or config_path is None:
        raise FileNotFoundError(
            f"Missing BigVGANv2 files in '{_BIGVGAN_V2_FOLDER}'. "
            "Expected 'config.json' and 'bigvgan_generator.pt'."
        )
    return weights_path, config_path


def _preprocess_bigvgan_v2_state_dict(state_dict, quantization_map=None, tied_weights_map=None):
    if isinstance(state_dict, dict) and isinstance(state_dict.get("generator"), dict):
        state_dict = state_dict["generator"]
    return state_dict, quantization_map, tied_weights_map


class AutoEncoderModule(nn.Module):

    def __init__(self,

                 *,

                 vae_ckpt_path,

                 vocoder_ckpt_path: Optional[str] = None,

                 mode: Literal['16k', '44k'],

                 need_vae_encoder: bool = True):
        super().__init__()
        self.vae: VAE = get_my_vae(mode).eval()
        vae_state_dict = torch.load(vae_ckpt_path, weights_only=True, map_location='cpu')
        self.vae.load_state_dict(vae_state_dict)
        self.vae.remove_weight_norm()

        if mode == '16k':
            assert vocoder_ckpt_path is not None
            self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
        elif mode == '44k':
            vocoder_ckpt_path, vocoder_config_path = _resolve_bigvgan_v2_files()
            self.vocoder = offload.fast_load_transformers_model(
                vocoder_ckpt_path,
                modelClass=BigVGANv2,
                forcedConfigPath=vocoder_config_path,
                preprocess_sd=_preprocess_bigvgan_v2_state_dict,
                configKwargs={"use_cuda_kernel": False},
                writable_tensors=False,
                default_dtype=torch.float32,
            )
            self.vocoder.remove_weight_norm()
            self.vocoder.eval()
        else:
            raise ValueError(f'Unknown mode: {mode}')

        for param in self.parameters():
            param.requires_grad = False

        if not need_vae_encoder:
            del self.vae.encoder

    @torch.inference_mode()
    def encode(self, x: torch.Tensor) -> DiagonalGaussianDistribution:
        return self.vae.encode(x)

    @torch.inference_mode()
    def decode(self, z: torch.Tensor) -> torch.Tensor:
        return self.vae.decode(z)

    @torch.inference_mode()
    def vocode(self, spec: torch.Tensor) -> torch.Tensor:
        return self.vocoder(spec)