Audio-to-Audio
Safetensors
torch
lucadellalib committed on
Commit
d8a2953
·
verified ·
1 Parent(s): 61c6adc

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. config.json +205 -0
  3. index.faiss +3 -0
  4. model.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ index.faiss filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "encoder_name": "WavLM",
3
+ "encoder_config": {
4
+ "hidden_dims": [
5
+ 512,
6
+ 512,
7
+ 512,
8
+ 512,
9
+ 512,
10
+ 512,
11
+ 512
12
+ ],
13
+ "kernel_sizes": [
14
+ 10,
15
+ 3,
16
+ 3,
17
+ 3,
18
+ 3,
19
+ 2,
20
+ 2
21
+ ],
22
+ "strides": [
23
+ 5,
24
+ 2,
25
+ 2,
26
+ 2,
27
+ 2,
28
+ 2,
29
+ 2
30
+ ],
31
+ "num_layers": 6,
32
+ "dim": 1024,
33
+ "ffn_dim": 4096,
34
+ "num_heads": 16,
35
+ "num_buckets": 320,
36
+ "max_distance": 800,
37
+ "max_cached_steps": 2048,
38
+ "dropout": 0.0,
39
+ "conv_pos": 128,
40
+ "conv_pos_groups": 16,
41
+ "causal": false,
42
+ "window_size": 512,
43
+ "lookahead_size": 3,
44
+ "use_flex_attention": false
45
+ },
46
+ "compressor_name": "FocalEncoder",
47
+ "compressor_config": {
48
+ "input_dim": 1024,
49
+ "output_dim": 32,
50
+ "hidden_dims": [
51
+ 1024,
52
+ 1024,
53
+ 1024
54
+ ],
55
+ "downscale_factors": [
56
+ 1,
57
+ 1,
58
+ 1
59
+ ],
60
+ "focal_window": 14,
61
+ "focal_level": 2,
62
+ "focal_factor": 4,
63
+ "dropout": 0.0,
64
+ "use_post_norm": false,
65
+ "use_layerscale": false,
66
+ "layerscale_init": 0.0001,
67
+ "tanhscale_init": 0.5,
68
+ "normalize_modulator": false,
69
+ "causal": false,
70
+ "window_size": 512
71
+ },
72
+ "boundary_predictor_name": "HazardModel",
73
+ "boundary_predictor_config": {
74
+ "input_dim": 1024,
75
+ "hidden_dims": [
76
+ 1024,
77
+ 1024,
78
+ 1024
79
+ ],
80
+ "downscale_factors": [
81
+ 1,
82
+ 1,
83
+ 1
84
+ ],
85
+ "focal_window": 14,
86
+ "focal_level": 2,
87
+ "focal_factor": 4,
88
+ "dropout": 0.0,
89
+ "use_post_norm": false,
90
+ "use_layerscale": false,
91
+ "layerscale_init": 0.0001,
92
+ "tanhscale_init": 0.5,
93
+ "normalize_modulator": false,
94
+ "causal": false,
95
+ "window_size": 512
96
+ },
97
+ "downsampler_name": "SelectLastPool",
98
+ "downsampler_config": {},
99
+ "quantizer_name": "ScalarSphericalQuantizer",
100
+ "quantizer_config": {
101
+ "dim": 32,
102
+ "n_levels": 4
103
+ },
104
+ "duration_predictor_name": "NegBinModel",
105
+ "duration_predictor_config": {
106
+ "input_dim": 32,
107
+ "hidden_dims": [
108
+ 1024,
109
+ 1024,
110
+ 1024
111
+ ],
112
+ "downscale_factors": [
113
+ 1,
114
+ 1,
115
+ 1
116
+ ],
117
+ "focal_window": 14,
118
+ "focal_level": 2,
119
+ "focal_factor": 4,
120
+ "dropout": 0.0,
121
+ "use_post_norm": false,
122
+ "use_layerscale": false,
123
+ "layerscale_init": 0.0001,
124
+ "tanhscale_init": 0.5,
125
+ "normalize_modulator": false,
126
+ "causal": false,
127
+ "window_size": 512,
128
+ "min_duration": 1,
129
+ "eps": 0.0001
130
+ },
131
+ "upsampler_name": "RepeatInterleaveUnpool",
132
+ "upsampler_config": {},
133
+ "decompressor_name": "FocalDecoder",
134
+ "decompressor_config": {
135
+ "input_dim": 32,
136
+ "output_dim": 1024,
137
+ "hidden_dims": [
138
+ 1024,
139
+ 1024,
140
+ 1024
141
+ ],
142
+ "upscale_factors": [
143
+ 1,
144
+ 1,
145
+ 1
146
+ ],
147
+ "focal_window": 14,
148
+ "focal_level": 2,
149
+ "focal_factor": 4,
150
+ "dropout": 0.0,
151
+ "use_post_norm": false,
152
+ "use_layerscale": false,
153
+ "layerscale_init": 0.0001,
154
+ "tanhscale_init": 0.5,
155
+ "normalize_modulator": false,
156
+ "causal": false,
157
+ "window_size": 512,
158
+ "last_window_size": 512,
159
+ "lookahead_size": 3
160
+ },
161
+ "decoder_name": "Vocos",
162
+ "decoder_config": {
163
+ "input_dim": 1024,
164
+ "num_layers": 8,
165
+ "dim": 512,
166
+ "ffn_dim": 1536,
167
+ "kernel_size": 7,
168
+ "layerscale_init": 0.125,
169
+ "n_fft": 1024,
170
+ "hop_length": 320,
171
+ "causal": false
172
+ },
173
+ "char_aligner_name": "MMS",
174
+ "char_aligner_config": {
175
+ "checkpoint": "facebook/mms-1b-all"
176
+ },
177
+ "retriever_name": "LatentIVF",
178
+ "retriever_config": {
179
+ "input_dim": 1024,
180
+ "latent_dim": 32,
181
+ "hidden_dims": [
182
+ 1024,
183
+ 1024,
184
+ 1024
185
+ ],
186
+ "downscale_factors": [
187
+ 1,
188
+ 1,
189
+ 1
190
+ ],
191
+ "focal_window": 14,
192
+ "focal_level": 2,
193
+ "focal_factor": 4,
194
+ "dropout": 0.0,
195
+ "use_post_norm": false,
196
+ "use_layerscale": false,
197
+ "layerscale_init": 0.0001,
198
+ "tanhscale_init": 0.5,
199
+ "normalize_modulator": false,
200
+ "causal": false,
201
+ "window_size": 512,
202
+ "nlist": 4096,
203
+ "nprobe": 16
204
+ }
205
+ }
index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ae5b8ad8f636ea180ecbd636150c10c2186f7df0f8eb7bd19658221142df6df
3
+ size 2884647272
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72203dd157effb733d751336602e6ee838e7e43a13e6c0e76685a5d86f19367b
3
+ size 1404061468