---
# Model definition. Every component below is declared as a `class_path` plus the
# `init_args` mapping for that class (presumably instantiated by a
# jsonargparse/LightningCLI-style loader -- TODO confirm against the entry point).
model:
  class_path: linacodec.model.LinaCodecModel
  init_args:
    config:
      # Audio settings
      sample_rate: 24000
      n_fft: 1024
      hop_length: 256
      n_mels: 100
      padding: center

      # SSL feature settings
      normalize_ssl_features: true
      local_ssl_layers: [6, 9]
      global_ssl_layers: [1, 2]

      # Down/up-sampling settings
      use_conv_downsample: true
      downsample_factor: 4
      mel_upsample_factor: 8
      mel_interpolation_mode: linear

    # SSL feature extractor; the backbone is selected by `model_name`.
    ssl_feature_extractor:
      class_path: linacodec.module.ssl_extractor.SSLFeatureExtractor
      init_args:
        model_name: wavlm_base_plus
        # NOTE(review): `config.local_ssl_layers` in this file is [6, 9], which is deeper
        # than this 2-layer cap -- confirm the extractor can still provide layers 6 and 9.
        output_layer: 2 # Use at most 2 layers
        sample_rate: 24000 # Consistent with the target sample rate for reconstruction

    local_encoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        # Width, depth, heads
        dim: 768
        n_layers: 6
        n_heads: 12
        # Sequence / attention options
        max_seq_len: 512
        window_size: 125
        use_flash_attention: true
        # RoPE options
        use_rope: true
        rope_theta: 10000.0

    local_quantizer:
      class_path: linacodec.module.fsq.FiniteScalarQuantizer
      init_args:
        # Must match local encoder output dimension
        input_dim: 768
        # Must match feature decoder input dimension
        output_dim: 768
        # 8 * 8 * 8 * 5 * 5 = 12800
        levels: [8, 8, 8, 5, 5]

    feature_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        # Width, depth, heads (same values as `local_encoder` in this file)
        dim: 768
        n_layers: 6
        n_heads: 12
        # Sequence / attention options
        max_seq_len: 512
        window_size: 125
        use_flash_attention: true
        # RoPE options
        use_rope: true
        rope_theta: 10000.0

    global_encoder:
      class_path: linacodec.module.global_encoder.GlobalEncoder
      init_args:
        # Input / output widths
        # WavLM base plus feature dimension
        input_channels: 768
        output_channels: 128
        # Internal layout
        num_layers: 4
        dim: 384
        intermediate_dim: 1152

    mel_prenet:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        # Width in / width out (512 equals `mel_decoder.init_args.dim` in this file)
        dim: 768
        output_dim: 512
        # Depth, heads
        n_layers: 6
        n_heads: 12
        # Sequence / attention options
        max_seq_len: 512
        window_size: 31
        use_flash_attention: true
        # RoPE options
        use_rope: true
        rope_theta: 10000.0

    mel_decoder:
      class_path: linacodec.module.transformer.Transformer
      init_args:
        dim: 512
        # Number of mel frequency bins
        output_dim: 100
        # Depth, heads
        n_layers: 6
        n_heads: 8
        # Sequence / attention options
        max_seq_len: 512
        window_size: 65
        use_flash_attention: true
        # RoPE options
        use_rope: true
        rope_theta: 10000.0
        # Conditioning
        # Must match global encoder output dimension
        adanorm_condition_dim: 128
        # Use AdaLNZero for conditioning
        use_adaln_zero: true

    mel_postnet:
      class_path: linacodec.module.postnet.PostNet
      init_args:
        # Number of mel frequency bins
        input_channels: 100
        channels: 256
        # Layer stack
        num_layers: 4
        kernel_size: 7
        use_layer_norm: true