duckdbot committed on
Commit
2df72f1
·
verified ·
1 Parent(s): e1186d0

Upload MLX model via mlx-forge

Browse files
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: mlx
3
+ license: other
4
+ base_model: Lightricks/LTX-2.3
5
+ tags:
6
+ - mlx
7
+ - mlx-forge
8
+ - apple-silicon
9
+ - safetensors
10
+ ---
11
+
12
+ # audiohacking/ltx-2.3-mlx
13
+
14
+ MLX format conversion of [Lightricks/LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3).
15
+
16
+ Converted with [mlx-forge](https://github.com/dgrauet/mlx-forge).
17
+
18
+ - **Transformer variants:** distilled, dev
19
+ - **Model version:** 2.3.0
20
+
21
+ ## Files
22
+
23
+ - `audio_vae.safetensors` (101.57 MB)
24
+ - `config.json` (951.00 B)
25
+ - `connector.safetensors` (5.91 GB)
26
+ - `embedded_config.json` (7.06 KB)
27
+ - `ltx-2.3-22b-distilled-lora-384.safetensors` (7.08 GB)
28
+ - `spatial_upscaler_x1_5_v1_0.safetensors` (1.02 GB)
29
+ - `spatial_upscaler_x1_5_v1_0_config.json` (274.00 B)
30
+ - `spatial_upscaler_x2_v1_1.safetensors` (949.62 MB)
31
+ - `spatial_upscaler_x2_v1_1_config.json` (275.00 B)
32
+ - `split_model.json` (579.00 B)
33
+ - `temporal_upscaler_x2_v1_0.safetensors` (249.81 MB)
34
+ - `temporal_upscaler_x2_v1_0_config.json` (273.00 B)
35
+ - `transformer-dev.safetensors` (35.38 GB)
36
+ - `transformer-distilled.safetensors` (35.38 GB)
37
+ - `vae_decoder.safetensors` (776.62 MB)
38
+ - `vae_encoder.safetensors` (608.33 MB)
39
+ - `vocoder.safetensors` (246.35 MB)
audio_vae.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:442119db8eaf976f83a105cc8c5f2c930e0f625062f9776b10adcdb7bc96c416
3
+ size 106509048
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_version": "2.3.0",
3
+ "is_v2": true,
4
+ "model_type": "AudioVideo",
5
+ "num_attention_heads": 32,
6
+ "attention_head_dim": 128,
7
+ "in_channels": 128,
8
+ "out_channels": 128,
9
+ "num_layers": 48,
10
+ "cross_attention_dim": 4096,
11
+ "caption_channels": null,
12
+ "apply_gated_attention": true,
13
+ "audio_num_attention_heads": 32,
14
+ "audio_attention_head_dim": 64,
15
+ "audio_in_channels": 128,
16
+ "audio_out_channels": 128,
17
+ "audio_cross_attention_dim": 2048,
18
+ "positional_embedding_theta": 10000.0,
19
+ "positional_embedding_max_pos": [
20
+ 20,
21
+ 2048,
22
+ 2048
23
+ ],
24
+ "audio_positional_embedding_max_pos": [
25
+ 20
26
+ ],
27
+ "timestep_scale_multiplier": 1000,
28
+ "av_ca_timestep_scale_multiplier": 1000,
29
+ "norm_eps": 1e-06,
30
+ "connector_positional_embedding_max_pos": [
31
+ 4096
32
+ ],
33
+ "connector_rope_type": "SPLIT",
34
+ "variants": {
35
+ "distilled": {
36
+ "cross_attention_adaln": true
37
+ },
38
+ "dev": {
39
+ "cross_attention_adaln": true
40
+ }
41
+ }
42
+ }
connector.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5885c2bd7af4e761b3257cf5b17bc1fdb2d187cea3ca520ea723b6a129b7029f
3
+ size 6344495512
embedded_config.json ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer": {
3
+ "_class_name": "AVTransformer3DModel",
4
+ "activation_fn": "gelu-approximate",
5
+ "attention_bias": true,
6
+ "attention_head_dim": 128,
7
+ "attention_type": "default",
8
+ "caption_channels": 3840,
9
+ "cross_attention_dim": 4096,
10
+ "double_self_attention": false,
11
+ "dropout": 0.0,
12
+ "in_channels": 128,
13
+ "norm_elementwise_affine": false,
14
+ "norm_eps": 1e-06,
15
+ "norm_num_groups": 32,
16
+ "num_attention_heads": 32,
17
+ "num_embeds_ada_norm": 1000,
18
+ "num_layers": 48,
19
+ "num_vector_embeds": null,
20
+ "only_cross_attention": false,
21
+ "cross_attention_norm": true,
22
+ "out_channels": 128,
23
+ "upcast_attention": false,
24
+ "use_linear_projection": false,
25
+ "qk_norm": "rms_norm",
26
+ "standardization_norm": "rms_norm",
27
+ "positional_embedding_type": "rope",
28
+ "positional_embedding_theta": 10000.0,
29
+ "positional_embedding_max_pos": [
30
+ 20,
31
+ 2048,
32
+ 2048
33
+ ],
34
+ "timestep_scale_multiplier": 1000,
35
+ "av_ca_timestep_scale_multiplier": 1000.0,
36
+ "causal_temporal_positioning": true,
37
+ "audio_num_attention_heads": 32,
38
+ "audio_attention_head_dim": 64,
39
+ "use_audio_video_cross_attention": true,
40
+ "share_ff": false,
41
+ "audio_out_channels": 128,
42
+ "audio_cross_attention_dim": 2048,
43
+ "audio_positional_embedding_max_pos": [
44
+ 20
45
+ ],
46
+ "av_cross_ada_norm": true,
47
+ "use_embeddings_connector": true,
48
+ "connector_attention_head_dim": 128,
49
+ "connector_num_attention_heads": 32,
50
+ "connector_num_layers": 8,
51
+ "connector_positional_embedding_max_pos": [
52
+ 4096
53
+ ],
54
+ "connector_num_learnable_registers": 128,
55
+ "connector_norm_output": true,
56
+ "use_middle_indices_grid": true,
57
+ "apply_gated_attention": true,
58
+ "connector_apply_gated_attention": true,
59
+ "caption_projection_first_linear": false,
60
+ "caption_projection_second_linear": false,
61
+ "caption_proj_input_norm": false,
62
+ "connector_learnable_registers_std": 1,
63
+ "caption_proj_before_connector": true,
64
+ "audio_connector_attention_head_dim": 64,
65
+ "audio_connector_num_attention_heads": 32,
66
+ "cross_attention_adaln": true,
67
+ "rope_type": "split",
68
+ "frequencies_precision": "float64",
69
+ "text_encoder_norm_type": "PER_TOKEN_RMS"
70
+ },
71
+ "vae": {
72
+ "_class_name": "CausalVideoAutoencoder",
73
+ "dims": 3,
74
+ "in_channels": 3,
75
+ "out_channels": 3,
76
+ "latent_channels": 128,
77
+ "encoder_blocks": [
78
+ [
79
+ "res_x",
80
+ {
81
+ "num_layers": 4
82
+ }
83
+ ],
84
+ [
85
+ "compress_space_res",
86
+ {
87
+ "multiplier": 2
88
+ }
89
+ ],
90
+ [
91
+ "res_x",
92
+ {
93
+ "num_layers": 6
94
+ }
95
+ ],
96
+ [
97
+ "compress_time_res",
98
+ {
99
+ "multiplier": 2
100
+ }
101
+ ],
102
+ [
103
+ "res_x",
104
+ {
105
+ "num_layers": 4
106
+ }
107
+ ],
108
+ [
109
+ "compress_all_res",
110
+ {
111
+ "multiplier": 2
112
+ }
113
+ ],
114
+ [
115
+ "res_x",
116
+ {
117
+ "num_layers": 2
118
+ }
119
+ ],
120
+ [
121
+ "compress_all_res",
122
+ {
123
+ "multiplier": 1
124
+ }
125
+ ],
126
+ [
127
+ "res_x",
128
+ {
129
+ "num_layers": 2
130
+ }
131
+ ]
132
+ ],
133
+ "decoder_blocks": [
134
+ [
135
+ "res_x",
136
+ {
137
+ "num_layers": 4
138
+ }
139
+ ],
140
+ [
141
+ "compress_space",
142
+ {
143
+ "multiplier": 2
144
+ }
145
+ ],
146
+ [
147
+ "res_x",
148
+ {
149
+ "num_layers": 6
150
+ }
151
+ ],
152
+ [
153
+ "compress_time",
154
+ {
155
+ "multiplier": 2
156
+ }
157
+ ],
158
+ [
159
+ "res_x",
160
+ {
161
+ "num_layers": 4
162
+ }
163
+ ],
164
+ [
165
+ "compress_all",
166
+ {
167
+ "multiplier": 1
168
+ }
169
+ ],
170
+ [
171
+ "res_x",
172
+ {
173
+ "num_layers": 2
174
+ }
175
+ ],
176
+ [
177
+ "compress_all",
178
+ {
179
+ "multiplier": 2
180
+ }
181
+ ],
182
+ [
183
+ "res_x",
184
+ {
185
+ "num_layers": 2
186
+ }
187
+ ]
188
+ ],
189
+ "scaling_factor": 1.0,
190
+ "norm_layer": "pixel_norm",
191
+ "patch_size": 4,
192
+ "latent_log_var": "uniform",
193
+ "use_quant_conv": false,
194
+ "causal_decoder": false,
195
+ "timestep_conditioning": false,
196
+ "normalize_latent_channels": false,
197
+ "encoder_base_channels": 128,
198
+ "decoder_base_channels": 128,
199
+ "spatial_padding_mode": "zeros"
200
+ },
201
+ "scheduler": {
202
+ "_class_name": "RectifiedFlowScheduler",
203
+ "_diffusers_version": "0.25.1",
204
+ "num_train_timesteps": 1000,
205
+ "shifting": null,
206
+ "base_resolution": null,
207
+ "sampler": "LinearQuadratic"
208
+ },
209
+ "audio_vae": {
210
+ "model": {
211
+ "params": {
212
+ "ddconfig": {
213
+ "double_z": true,
214
+ "mel_bins": 64,
215
+ "z_channels": 8,
216
+ "resolution": 256,
217
+ "downsample_time": false,
218
+ "in_channels": 2,
219
+ "out_ch": 2,
220
+ "ch": 128,
221
+ "ch_mult": [
222
+ 1,
223
+ 2,
224
+ 4
225
+ ],
226
+ "num_res_blocks": 2,
227
+ "attn_resolutions": [],
228
+ "dropout": 0.0,
229
+ "mid_block_add_attention": false,
230
+ "norm_type": "pixel",
231
+ "causality_axis": "height"
232
+ },
233
+ "sampling_rate": 16000
234
+ }
235
+ },
236
+ "preprocessing": {
237
+ "audio": {
238
+ "sampling_rate": 16000,
239
+ "max_wav_value": 32768.0,
240
+ "duration": 5.12,
241
+ "stereo": true,
242
+ "causal_padding": 3
243
+ },
244
+ "stft": {
245
+ "filter_length": 1024,
246
+ "hop_length": 160,
247
+ "win_length": 1024,
248
+ "causal": true
249
+ },
250
+ "mel": {
251
+ "n_mel_channels": 64,
252
+ "mel_fmin": 0,
253
+ "mel_fmax": 8000
254
+ }
255
+ }
256
+ },
257
+ "vocoder": {
258
+ "vocoder": {
259
+ "upsample_initial_channel": 1536,
260
+ "resblock": "AMP1",
261
+ "upsample_rates": [
262
+ 5,
263
+ 2,
264
+ 2,
265
+ 2,
266
+ 2,
267
+ 2
268
+ ],
269
+ "resblock_kernel_sizes": [
270
+ 3,
271
+ 7,
272
+ 11
273
+ ],
274
+ "upsample_kernel_sizes": [
275
+ 11,
276
+ 4,
277
+ 4,
278
+ 4,
279
+ 4,
280
+ 4
281
+ ],
282
+ "resblock_dilation_sizes": [
283
+ [
284
+ 1,
285
+ 3,
286
+ 5
287
+ ],
288
+ [
289
+ 1,
290
+ 3,
291
+ 5
292
+ ],
293
+ [
294
+ 1,
295
+ 3,
296
+ 5
297
+ ]
298
+ ],
299
+ "stereo": true,
300
+ "use_tanh_at_final": false,
301
+ "activation": "snakebeta",
302
+ "use_bias_at_final": false
303
+ },
304
+ "bwe": {
305
+ "upsample_initial_channel": 512,
306
+ "resblock": "AMP1",
307
+ "upsample_rates": [
308
+ 6,
309
+ 5,
310
+ 2,
311
+ 2,
312
+ 2
313
+ ],
314
+ "resblock_kernel_sizes": [
315
+ 3,
316
+ 7,
317
+ 11
318
+ ],
319
+ "upsample_kernel_sizes": [
320
+ 12,
321
+ 11,
322
+ 4,
323
+ 4,
324
+ 4
325
+ ],
326
+ "resblock_dilation_sizes": [
327
+ [
328
+ 1,
329
+ 3,
330
+ 5
331
+ ],
332
+ [
333
+ 1,
334
+ 3,
335
+ 5
336
+ ],
337
+ [
338
+ 1,
339
+ 3,
340
+ 5
341
+ ]
342
+ ],
343
+ "stereo": true,
344
+ "use_tanh_at_final": false,
345
+ "activation": "snakebeta",
346
+ "use_bias_at_final": false,
347
+ "apply_final_activation": false,
348
+ "input_sampling_rate": 16000,
349
+ "output_sampling_rate": 48000,
350
+ "hop_length": 80,
351
+ "n_fft": 512,
352
+ "win_size": 512,
353
+ "num_mels": 64
354
+ }
355
+ }
356
+ }
ltx-2.3-22b-distilled-lora-384.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2943ab994f3c9d88052e5a2a34cca14e4a2dfc36b1d8c407931d52d5c25dd72b
3
+ size 7605507256
spatial_upscaler_x1_5_v1_0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e186732be14a71b953efed15181cbb94595fa836555a07b3b7560d6cf5deeb15
3
+ size 1090127476
spatial_upscaler_x1_5_v1_0_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "_class_name": "LatentUpsampler",
4
+ "in_channels": 128,
5
+ "mid_channels": 1024,
6
+ "num_blocks_per_stage": 4,
7
+ "dims": 3,
8
+ "spatial_upsample": true,
9
+ "temporal_upsample": false,
10
+ "spatial_scale": 1.5,
11
+ "rational_resampler": true
12
+ }
13
+ }
spatial_upscaler_x2_v1_1.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:755f69dcb8a3f9589045925eb4a2237e9969f9c026a016393bc395e65ebd5788
3
+ size 995745061
spatial_upscaler_x2_v1_1_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "_class_name": "LatentUpsampler",
4
+ "in_channels": 128,
5
+ "mid_channels": 1024,
6
+ "num_blocks_per_stage": 4,
7
+ "dims": 3,
8
+ "spatial_upsample": true,
9
+ "temporal_upsample": false,
10
+ "spatial_scale": 2.0,
11
+ "rational_resampler": false
12
+ }
13
+ }
split_model.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "format": "split",
3
+ "model_version": "2.3.0",
4
+ "components": [
5
+ "connector",
6
+ "vae_decoder",
7
+ "vae_encoder",
8
+ "audio_vae",
9
+ "vocoder",
10
+ "spatial_upscaler_x2_v1_1",
11
+ "spatial_upscaler_x1_5_v1_0",
12
+ "temporal_upscaler_x2_v1_0"
13
+ ],
14
+ "transformer_variants": [
15
+ "distilled",
16
+ "dev"
17
+ ],
18
+ "lora": [
19
+ "ltx-2.3-22b-distilled-lora-384.safetensors"
20
+ ],
21
+ "source": "Lightricks/LTX-2.3",
22
+ "notes": {
23
+ "vocoder": "Also contains BWE (bandwidth extension) generator weights \u2014 upsample layers [6,5,2,2,2] (240x) and mel_stft parameters."
24
+ }
25
+ }
temporal_upscaler_x2_v1_0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:982bc28b455ed22de1e39a9fcf5213299de4e15f948ae3bd2cee65550e019628
3
+ size 261945581
temporal_upscaler_x2_v1_0_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "_class_name": "LatentUpsampler",
4
+ "in_channels": 128,
5
+ "mid_channels": 512,
6
+ "num_blocks_per_stage": 4,
7
+ "dims": 3,
8
+ "spatial_upsample": false,
9
+ "temporal_upsample": true,
10
+ "spatial_scale": 1.0,
11
+ "rational_resampler": true
12
+ }
13
+ }
transformer-dev.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd1f253fd2df2a3c8b7bf6103ccf945f4e3a4005d31568a7d4e8a4d49f6f286c
3
+ size 37987706193
transformer-distilled.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd1f253fd2df2a3c8b7bf6103ccf945f4e3a4005d31568a7d4e8a4d49f6f286c
3
+ size 37987706193
vae_decoder.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f404026fe0b59418eaec4a3fcdc474125c798f0b787dc390f6eb4e79934d4160
3
+ size 814349531
vae_encoder.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1512f5ef8387901db0baecb328690676e2667ddf2b63e28f8e6645188c1a8c06
3
+ size 637885319
vocoder.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:410cee86c14f76956bf487d153211ef15050fe8bf51f86d47a1d27ca6e3ba287
3
+ size 258313851