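# LinaCodec / config.yaml
# Page-header text captured from the hosting site's file viewer, kept here as
# comments so it no longer breaks YAML parsing:
#   YatharthS's picture
#   Update config.yaml
#   6aa570c verified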
# Model definition in `class_path` / `init_args` form: every component names
# the class to instantiate (`class_path`) and the keyword arguments handed to
# it (`init_args`).
# NOTE(review): this nesting was reconstructed from a flattened copy in which
# every key sat at column 0 (yielding duplicate `class_path`, `init_args`,
# `dim`, ... keys). The layout follows the class_path/init_args pattern, with
# `config` and the component specs as siblings under `model.init_args` --
# confirm against the upstream file.
model:
  class_path: linacodec.model.LinaCodecModel
  init_args:
    config:
      # SSL feature settings
      local_ssl_layers: [6, 9]
      global_ssl_layers: [1, 2]
      normalize_ssl_features: true

      # Down/up-sampling settings
      downsample_factor: 4
      mel_upsample_factor: 8
      use_conv_downsample: true
      mel_interpolation_mode: linear

      # Audio settings
      sample_rate: 24000
      n_fft: 1024
      hop_length: 256
      n_mels: 100
      padding: center

    ssl_feature_extractor:
      class_path: linacodec.module.ssl_extractor.SSLFeatureExtractor
      init_args:
        model_name: wavlm_base_plus
        # NOTE(review): `local_ssl_layers` above lists layers 6 and 9 while
        # `output_layer` is 2 -- confirm these two settings are consistent.
        output_layer: 2  # Use at most 2 layers
        sample_rate: 24000  # Consistent with the target sample rate for reconstruction

    local_encoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    local_quantizer:
      class_path: linacodec.module.fsq.FiniteScalarQuantizer
      init_args:
        input_dim: 768  # Must match local encoder output dimension
        output_dim: 768  # Must match feature decoder input dimension
        levels: [8, 8, 8, 5, 5]  # Product of levels: 8 * 8 * 8 * 5 * 5 = 12800

    # Same Transformer settings as local_encoder.
    feature_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    global_encoder:
      class_path: linacodec.module.global_encoder.GlobalEncoder
      init_args:
        input_channels: 768  # WavLM base plus feature dimension
        output_channels: 128
        num_layers: 4
        dim: 384
        intermediate_dim: 1152

    mel_prenet:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        # Same value as mel_decoder `dim` (512); presumably the prenet output
        # feeds the mel decoder -- confirm.
        output_dim: 512
        n_layers: 6
        n_heads: 12
        window_size: 31
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    mel_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 512
        output_dim: 100  # Number of mel frequency bins
        n_layers: 6
        n_heads: 8
        window_size: 65
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        adanorm_condition_dim: 128  # Must match global encoder output dimension
        use_adaln_zero: true  # Use AdaLNZero for conditioning
        use_flash_attention: true

    mel_postnet:
      class_path: linacodec.module.postnet.PostNet
      init_args:
        input_channels: 100  # Number of mel frequency bins
        channels: 256
        kernel_size: 7
        num_layers: 4
        use_layer_norm: true