YatharthS committed on
Commit
aac23d4
·
verified ·
1 Parent(s): 4b49e67

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.yaml +105 -0
  2. model.safetensors +3 -0
config.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ class_path: linacodec.model.LinaCodec
3
+ init_args:
4
+ config:
5
+ # SSL Feature settings
6
+ local_ssl_layers: [6, 9]
7
+ global_ssl_layers: [1, 2]
8
+ normalize_ssl_features: true
9
+
10
+ # Down/up-sampling settings
11
+ downsample_factor: 4
12
+ mel_upsample_factor: 8
13
+ use_conv_downsample: true
14
+ mel_interpolation_mode: linear
15
+
16
+ # Audio settings
17
+ sample_rate: 24000
18
+ n_fft: 1024
19
+ hop_length: 256
20
+ n_mels: 100
21
+ padding: center
22
+
23
+ ssl_feature_extractor:
24
+ class_path: linacodec.module.ssl_extractor.SSLFeatureExtractor
25
+ init_args:
26
+ model_name: wavlm_base_plus
27
+ output_layer: 2 # Use at most 2 layers
28
+ sample_rate: 24000 # Consistent with the target sample rate for reconstruction
29
+
30
+ local_encoder:
31
+ class_path: linacodec.module.transformer.Transformer
32
+ init_args:
33
+ dim: 768
34
+ n_layers: 6
35
+ n_heads: 12
36
+ window_size: 125
37
+ use_rope: true
38
+ rope_theta: 10000.0
39
+ max_seq_len: 512
40
+ use_flash_attention: true
41
+
42
+ local_quantizer:
43
+ class_path: linacodec.module.fsq.FiniteScalarQuantizer
44
+ init_args:
45
+ input_dim: 768 # Must match local encoder output dimension
46
+ output_dim: 768 # Must match feature decoder input dimension
47
+ levels: [8, 8, 8, 5, 5] # 12800
48
+
49
+ feature_decoder:
50
+ class_path: linacodec.module.transformer.Transformer
51
+ init_args:
52
+ dim: 768
53
+ n_layers: 6
54
+ n_heads: 12
55
+ window_size: 125
56
+ use_rope: true
57
+ rope_theta: 10000.0
58
+ max_seq_len: 512
59
+ use_flash_attention: true
60
+
61
+ global_encoder:
62
+ class_path: linacodec.module.global_encoder.GlobalEncoder
63
+ init_args:
64
+ input_channels: 768 # WavLM base plus feature dimension
65
+ output_channels: 128
66
+ num_layers: 4
67
+ dim: 384
68
+ intermediate_dim: 1152
69
+
70
+ mel_prenet:
71
+ class_path: linacodec.module.transformer.Transformer
72
+ init_args:
73
+ dim: 768
74
+ output_dim: 512
75
+ n_layers: 6
76
+ n_heads: 12
77
+ window_size: 31
78
+ use_rope: true
79
+ rope_theta: 10000.0
80
+ max_seq_len: 512
81
+ use_flash_attention: true
82
+
83
+ mel_decoder:
84
+ class_path: linacodec.module.transformer.Transformer
85
+ init_args:
86
+ dim: 512
87
+ output_dim: 100 # Number of mel frequency bins
88
+ n_layers: 6
89
+ n_heads: 8
90
+ window_size: 65
91
+ use_rope: true
92
+ rope_theta: 10000.0
93
+ max_seq_len: 512
94
+ adanorm_condition_dim: 128 # Must match global encoder output dimension
95
+ use_adaln_zero: true # Use AdaLNZero for conditioning
96
+ use_flash_attention: true
97
+
98
+ mel_postnet:
99
+ class_path: linacodec.module.postnet.PostNet
100
+ init_args:
101
+ input_channels: 100 # Number of mel frequency bins
102
+ channels: 256
103
+ kernel_size: 7
104
+ num_layers: 4
105
+ use_layer_norm: true
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7f84c84a4d639bda5a6b514296279272f2aa4c842f9bc758d56a79fc2800d4c
3
+ size 479820604