from typing import List, Optional

from transformers.configuration_utils import PretrainedConfig


class HeartCodecConfig(PretrainedConfig):
    """Configuration container for the ``heartcodec`` model type.

    Every constructor argument is stored unchanged on the instance under the
    same name, except for the four list-valued arguments, which are copied
    into a fresh ``list`` so that instances never share (or alias) list
    objects with each other or with the caller.

    The arguments fall into three groups, following the grouping used in the
    constructor signature:

    RVQ:
        ``dim`` (512), ``codebook_size`` (8192), ``decay`` (0.9),
        ``commitment_weight`` (1.0), ``threshold_ema_dead_code`` (2),
        ``use_cosine_sim`` (False), ``codebook_dim`` (32),
        ``num_quantizers`` (8).

    Diffusion transformer:
        ``attention_head_dim`` (64), ``in_channels`` (1024),
        ``norm_type`` ("ada_norm_single"), ``num_attention_heads`` (24),
        ``num_layers`` (24), ``num_layers_2`` (6), ``out_channels`` (256).

    SQ codec:
        ``num_bands`` (1), ``sample_rate`` (48000), ``causal`` (True),
        ``num_samples`` (2), ``downsample_factors`` ([3, 4, 4, 4, 5]),
        ``downsample_kernel_sizes`` ([6, 8, 8, 8, 10]),
        ``upsample_factors`` ([5, 4, 4, 4, 3]),
        ``upsample_kernel_sizes`` ([10, 8, 8, 8, 6]),
        ``latent_hidden_dim`` (128), ``default_kernel_size`` (7),
        ``delay_kernel_size`` (5), ``init_channel`` (64),
        ``res_kernel_size`` (7).

    For the four list-valued arguments, passing ``None`` (the signature
    default) selects the default list shown above.

    Any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    model_type = "heartcodec"

    def __init__(
        self,
        # config for rvq
        dim: int = 512,
        codebook_size: int = 8192,
        decay: float = 0.9,
        commitment_weight: float = 1.0,
        threshold_ema_dead_code: int = 2,
        use_cosine_sim: bool = False,
        codebook_dim: int = 32,
        num_quantizers: int = 8,
        # config for diffusion transformer
        attention_head_dim: int = 64,
        in_channels: int = 1024,
        norm_type: str = "ada_norm_single",
        num_attention_heads: int = 24,
        num_layers: int = 24,
        num_layers_2: int = 6,
        out_channels: int = 256,
        # config for sq codec
        num_bands: int = 1,
        sample_rate: int = 48000,
        causal: bool = True,
        num_samples: int = 2,
        downsample_factors: Optional[List[int]] = None,
        downsample_kernel_sizes: Optional[List[int]] = None,
        upsample_factors: Optional[List[int]] = None,
        upsample_kernel_sizes: Optional[List[int]] = None,
        latent_hidden_dim: int = 128,
        default_kernel_size: int = 7,
        delay_kernel_size: int = 5,
        init_channel: int = 64,
        res_kernel_size: int = 7,
        **kwargs
    ):
        super().__init__(**kwargs)

        # RVQ settings.
        self.dim = dim
        self.codebook_size = codebook_size
        self.decay = decay
        self.commitment_weight = commitment_weight
        self.threshold_ema_dead_code = threshold_ema_dead_code
        self.use_cosine_sim = use_cosine_sim
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # Diffusion transformer settings.
        self.attention_head_dim = attention_head_dim
        self.in_channels = in_channels
        self.norm_type = norm_type
        self.num_attention_heads = num_attention_heads
        self.num_layers = num_layers
        self.num_layers_2 = num_layers_2
        self.out_channels = out_channels

        # SQ codec settings.
        self.num_bands = num_bands
        self.sample_rate = sample_rate
        self.causal = causal
        self.num_samples = num_samples
        # The list defaults are built here, per instance, rather than in the
        # signature: a list literal used as a default argument is created once
        # and shared by every call, so an in-place edit on one config would
        # leak into all other default-constructed configs.
        self.downsample_factors = (
            [3, 4, 4, 4, 5]
            if downsample_factors is None
            else list(downsample_factors)
        )
        self.downsample_kernel_sizes = (
            [6, 8, 8, 8, 10]
            if downsample_kernel_sizes is None
            else list(downsample_kernel_sizes)
        )
        self.upsample_factors = (
            [5, 4, 4, 4, 3]
            if upsample_factors is None
            else list(upsample_factors)
        )
        self.upsample_kernel_sizes = (
            [10, 8, 8, 8, 6]
            if upsample_kernel_sizes is None
            else list(upsample_kernel_sizes)
        )
        self.latent_hidden_dim = latent_hidden_dim
        self.default_kernel_size = default_kernel_size
        self.delay_kernel_size = delay_kernel_size
        self.init_channel = init_channel
        self.res_kernel_size = res_kernel_size