File size: 2,657 Bytes
ed8503f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from typing import List, Optional

from transformers.configuration_utils import PretrainedConfig


class HeartCodecConfig(PretrainedConfig):
    """Configuration container for the ``heartcodec`` model type.

    Every constructor argument is stored on the instance under the same
    name. Any extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    The arguments fall into three groups, following the section comments
    in the signature:

    * rvq (presumably a residual vector quantizer -- confirm against the
      model code): ``dim``, ``codebook_size``, ``decay``,
      ``commitment_weight``, ``threshold_ema_dead_code``,
      ``use_cosine_sim``, ``codebook_dim``, ``num_quantizers``.
    * diffusion transformer: ``attention_head_dim``, ``in_channels``,
      ``norm_type``, ``num_attention_heads``, ``num_layers``,
      ``num_layers_2``, ``out_channels``.
    * sq codec: ``num_bands``, ``sample_rate``, ``causal``,
      ``num_samples``, ``downsample_factors``,
      ``downsample_kernel_sizes``, ``upsample_factors``,
      ``upsample_kernel_sizes``, ``latent_hidden_dim``,
      ``default_kernel_size``, ``delay_kernel_size``, ``init_channel``,
      ``res_kernel_size``.

    The four list-valued arguments default to ``None``, which selects the
    built-in defaults:

    * ``downsample_factors``: ``[3, 4, 4, 4, 5]``
    * ``downsample_kernel_sizes``: ``[6, 8, 8, 8, 10]``
    * ``upsample_factors``: ``[5, 4, 4, 4, 3]``
    * ``upsample_kernel_sizes``: ``[10, 8, 8, 8, 6]``

    Each instance receives its own list objects, so mutating one config's
    lists never affects another config or the defaults.
    """

    model_type = "heartcodec"

    def __init__(
        self,
        # config for rvq
        dim: int = 512,
        codebook_size: int = 8192,
        decay: float = 0.9,
        commitment_weight: float = 1.0,
        threshold_ema_dead_code: int = 2,
        use_cosine_sim: bool = False,
        codebook_dim: int = 32,
        num_quantizers: int = 8,
        # config for diffusion transformer
        attention_head_dim: int = 64,
        in_channels: int = 1024,
        norm_type: str = "ada_norm_single",
        num_attention_heads: int = 24,
        num_layers: int = 24,
        num_layers_2: int = 6,
        out_channels: int = 256,
        # config for sq codec
        num_bands: int = 1,
        sample_rate: int = 48000,
        causal: bool = True,
        num_samples: int = 2,
        downsample_factors: Optional[List[int]] = None,
        downsample_kernel_sizes: Optional[List[int]] = None,
        upsample_factors: Optional[List[int]] = None,
        upsample_kernel_sizes: Optional[List[int]] = None,
        latent_hidden_dim: int = 128,
        default_kernel_size: int = 7,
        delay_kernel_size: int = 5,
        init_channel: int = 64,
        res_kernel_size: int = 7,
        **kwargs
    ):
        super().__init__(**kwargs)

        # rvq settings
        self.dim = dim
        self.codebook_size = codebook_size
        self.decay = decay
        self.commitment_weight = commitment_weight
        self.threshold_ema_dead_code = threshold_ema_dead_code
        self.use_cosine_sim = use_cosine_sim
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # diffusion transformer settings
        self.attention_head_dim = attention_head_dim
        self.in_channels = in_channels
        self.norm_type = norm_type
        self.num_attention_heads = num_attention_heads
        self.num_layers = num_layers
        self.num_layers_2 = num_layers_2
        self.out_channels = out_channels

        # sq codec settings
        self.num_bands = num_bands
        self.sample_rate = sample_rate
        self.causal = causal
        self.num_samples = num_samples
        # List-valued options: build a new list per instance. Using list
        # literals as parameter defaults would share a single object across
        # all instances; copying caller input avoids aliasing their list.
        self.downsample_factors = (
            [3, 4, 4, 4, 5]
            if downsample_factors is None
            else list(downsample_factors)
        )
        self.downsample_kernel_sizes = (
            [6, 8, 8, 8, 10]
            if downsample_kernel_sizes is None
            else list(downsample_kernel_sizes)
        )
        self.upsample_factors = (
            [5, 4, 4, 4, 3]
            if upsample_factors is None
            else list(upsample_factors)
        )
        self.upsample_kernel_sizes = (
            [10, 8, 8, 8, 6]
            if upsample_kernel_sizes is None
            else list(upsample_kernel_sizes)
        )
        self.latent_hidden_dim = latent_hidden_dim
        self.default_kernel_size = default_kernel_size
        self.delay_kernel_size = delay_kernel_size
        self.init_channel = init_channel
        self.res_kernel_size = res_kernel_size