---
# LinaCodec model configuration (jsonargparse / LightningCLI style:
# `class_path` selects the class, `init_args` its constructor arguments).
#
# NOTE(review): the source of this file had lost all indentation and carried
# a stray trailing " |" on every line; the nesting below is reconstructed
# from the conventional class_path/init_args layout — confirm it against the
# signature of linacodec.model.LinaCodecModel.
model:
  class_path: linacodec.model.LinaCodecModel
  init_args:
    # Codec-level hyper-parameters.
    config:
      local_ssl_layers: [6, 9]
      global_ssl_layers: [1, 2]
      normalize_ssl_features: true
      downsample_factor: 4
      mel_upsample_factor: 8
      use_conv_downsample: true
      mel_interpolation_mode: linear
      # Audio / mel front-end settings.
      sample_rate: 24000
      n_fft: 1024
      hop_length: 256
      n_mels: 100
      padding: center

    # Self-supervised speech backbone used as the feature source.
    ssl_feature_extractor:
      class_path: linacodec.module.ssl_extractor.SSLFeatureExtractor
      init_args:
        model_name: wavlm_base_plus
        output_layer: 2
        sample_rate: 24000  # same value as config.sample_rate above

    local_encoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    local_quantizer:
      class_path: linacodec.module.fsq.FiniteScalarQuantizer
      init_args:
        input_dim: 768
        output_dim: 768
        # FSQ levels per dimension; codebook size = 8*8*8*5*5 = 12800.
        levels: [8, 8, 8, 5, 5]

    feature_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        n_layers: 6
        n_heads: 12
        window_size: 125
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    global_encoder:
      class_path: linacodec.module.global_encoder.GlobalEncoder
      init_args:
        input_channels: 768
        output_channels: 128
        num_layers: 4
        dim: 384
        intermediate_dim: 1152

    mel_prenet:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 768
        output_dim: 512  # same value as mel_decoder.dim below
        n_layers: 6
        n_heads: 12
        window_size: 31
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        use_flash_attention: true

    mel_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 512
        output_dim: 100  # same value as config.n_mels
        n_layers: 6
        n_heads: 8
        window_size: 65
        use_rope: true
        rope_theta: 10000.0
        max_seq_len: 512
        adanorm_condition_dim: 128  # same value as global_encoder.output_channels
        use_adaln_zero: true
        use_flash_attention: true

    mel_postnet:
      class_path: linacodec.module.postnet.PostNet
      init_args:
        input_channels: 100  # same value as config.n_mels
        channels: 256
        kernel_size: 7
        num_layers: 4
        use_layer_norm: true