# Model configuration in class_path / init_args form: every component is
# given as an importable class path plus its constructor arguments.
# NOTE(review): the block structure below was reconstructed from a copy whose
# line breaks were collapsed; confirm that the sub-modules are siblings of
# `config` under `model.init_args`.
model:
  class_path: linacodec.model.LinaCodecModel
  init_args:
    config:
      # SSL feature settings
      # NOTE(review): these lists name layers 6 and 9, while
      # ssl_feature_extractor.output_layer is 2 -- confirm how the two
      # settings interact.
      local_ssl_layers: [6, 9]
      global_ssl_layers: [1, 2]
      normalize_ssl_features: true

      # Down/up-sampling settings
      downsample_factor: 4
      mel_upsample_factor: 8
      use_conv_downsample: true
      mel_interpolation_mode: linear

      # Audio settings
      sample_rate: 24000
      n_fft: 1024
      hop_length: 256
      n_mels: 100
      padding: center

    ssl_feature_extractor:
      class_path: linacodec.module.ssl_extractor.SSLFeatureExtractor
      init_args:
        model_name: wavlm_base_plus
        output_layer: 2  # Use at most 2 layers
        # Consistent with the target sample rate for reconstruction
        sample_rate: 24000

    local_encoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    local_quantizer:
      class_path: linacodec.module.fsq.FiniteScalarQuantizer
      init_args:
        input_dim: 768  # Must match local encoder output dimension
        output_dim: 768  # Must match feature decoder input dimension
        levels: [8, 8, 8, 5, 5]  # 8 * 8 * 8 * 5 * 5 = 12800

    feature_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    global_encoder:
      class_path: linacodec.module.global_encoder.GlobalEncoder
      init_args:
        input_channels: 768  # WavLM base plus feature dimension
        output_channels: 128
        num_layers: 4
        dim: 384
        intermediate_dim: 1152

    mel_prenet:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        output_dim: 512
        n_layers: 6
        n_heads: 12
        window_size: 31
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    mel_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 512
        output_dim: 100  # Number of mel frequency bins
        n_layers: 6
        n_heads: 8
        window_size: 65
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        # Must match global encoder output dimension
        adanorm_condition_dim: 128
        use_adaln_zero: true  # Use AdaLNZero for conditioning
        use_flash_attention: true

    mel_postnet:
      class_path: linacodec.module.postnet.PostNet
      init_args:
        input_channels: 100  # Number of mel frequency bins
        channels: 256
        kernel_size: 7
        num_layers: 4
        use_layer_norm: true