duckdbot committed on Apr 16

Commit

2df72f1

verified ·

1 Parent(s): e1186d0

Upload MLX model via mlx-forge

Browse files

Files changed (18) hide show

README.md +39 -0
audio_vae.safetensors +3 -0
config.json +42 -0
connector.safetensors +3 -0
embedded_config.json +356 -0
ltx-2.3-22b-distilled-lora-384.safetensors +3 -0
spatial_upscaler_x1_5_v1_0.safetensors +3 -0
spatial_upscaler_x1_5_v1_0_config.json +13 -0
spatial_upscaler_x2_v1_1.safetensors +3 -0
spatial_upscaler_x2_v1_1_config.json +13 -0
split_model.json +25 -0
temporal_upscaler_x2_v1_0.safetensors +3 -0
temporal_upscaler_x2_v1_0_config.json +13 -0
transformer-dev.safetensors +3 -0
transformer-distilled.safetensors +3 -0
vae_decoder.safetensors +3 -0
vae_encoder.safetensors +3 -0
vocoder.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,39 @@

+---
+library_name: mlx
+license: other
+base_model: Lightricks/LTX-2.3
+tags:
+  - mlx
+  - mlx-forge
+  - apple-silicon
+  - safetensors
+---
+# audiohacking/ltx-2.3-mlx
+MLX-format conversion of [Lightricks/LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3).
+Converted with [mlx-forge](https://github.com/dgrauet/mlx-forge).
+- **Transformer variants:** distilled, dev
+- **Model version:** 2.3.0
+## Files
+- `audio_vae.safetensors` (101.57 MB)
+- `config.json` (951.00 B)
+- `connector.safetensors` (5.91 GB)
+- `embedded_config.json` (7.06 KB)
+- `ltx-2.3-22b-distilled-lora-384.safetensors` (7.08 GB)
+- `spatial_upscaler_x1_5_v1_0.safetensors` (1.02 GB)
+- `spatial_upscaler_x1_5_v1_0_config.json` (274.00 B)
+- `spatial_upscaler_x2_v1_1.safetensors` (949.62 MB)
+- `spatial_upscaler_x2_v1_1_config.json` (275.00 B)
+- `split_model.json` (579.00 B)
+- `temporal_upscaler_x2_v1_0.safetensors` (249.81 MB)
+- `temporal_upscaler_x2_v1_0_config.json` (273.00 B)
+- `transformer-dev.safetensors` (35.38 GB)
+- `transformer-distilled.safetensors` (35.38 GB)
+- `vae_decoder.safetensors` (776.62 MB)
+- `vae_encoder.safetensors` (608.33 MB)
+- `vocoder.safetensors` (246.35 MB)

audio_vae.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:442119db8eaf976f83a105cc8c5f2c930e0f625062f9776b10adcdb7bc96c416
+size 106509048

config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "model_version": "2.3.0",
+  "is_v2": true,
+  "model_type": "AudioVideo",
+  "num_attention_heads": 32,
+  "attention_head_dim": 128,
+  "in_channels": 128,
+  "out_channels": 128,
+  "num_layers": 48,
+  "cross_attention_dim": 4096,
+  "caption_channels": null,
+  "apply_gated_attention": true,
+  "audio_num_attention_heads": 32,
+  "audio_attention_head_dim": 64,
+  "audio_in_channels": 128,
+  "audio_out_channels": 128,
+  "audio_cross_attention_dim": 2048,
+  "positional_embedding_theta": 10000.0,
+  "positional_embedding_max_pos": [
+    20,
+    2048,
+    2048
+  ],
+  "audio_positional_embedding_max_pos": [
+    20
+  ],
+  "timestep_scale_multiplier": 1000,
+  "av_ca_timestep_scale_multiplier": 1000,
+  "norm_eps": 1e-06,
+  "connector_positional_embedding_max_pos": [
+    4096
+  ],
+  "connector_rope_type": "SPLIT",
+  "variants": {
+    "distilled": {
+      "cross_attention_adaln": true
+    },
+    "dev": {
+      "cross_attention_adaln": true
+    }
+  }
+}

connector.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5885c2bd7af4e761b3257cf5b17bc1fdb2d187cea3ca520ea723b6a129b7029f
+size 6344495512

embedded_config.json ADDED Viewed

	@@ -0,0 +1,356 @@

+{
+  "transformer": {
+    "_class_name": "AVTransformer3DModel",
+    "activation_fn": "gelu-approximate",
+    "attention_bias": true,
+    "attention_head_dim": 128,
+    "attention_type": "default",
+    "caption_channels": 3840,
+    "cross_attention_dim": 4096,
+    "double_self_attention": false,
+    "dropout": 0.0,
+    "in_channels": 128,
+    "norm_elementwise_affine": false,
+    "norm_eps": 1e-06,
+    "norm_num_groups": 32,
+    "num_attention_heads": 32,
+    "num_embeds_ada_norm": 1000,
+    "num_layers": 48,
+    "num_vector_embeds": null,
+    "only_cross_attention": false,
+    "cross_attention_norm": true,
+    "out_channels": 128,
+    "upcast_attention": false,
+    "use_linear_projection": false,
+    "qk_norm": "rms_norm",
+    "standardization_norm": "rms_norm",
+    "positional_embedding_type": "rope",
+    "positional_embedding_theta": 10000.0,
+    "positional_embedding_max_pos": [
+      20,
+      2048,
+      2048
+    ],
+    "timestep_scale_multiplier": 1000,
+    "av_ca_timestep_scale_multiplier": 1000.0,
+    "causal_temporal_positioning": true,
+    "audio_num_attention_heads": 32,
+    "audio_attention_head_dim": 64,
+    "use_audio_video_cross_attention": true,
+    "share_ff": false,
+    "audio_out_channels": 128,
+    "audio_cross_attention_dim": 2048,
+    "audio_positional_embedding_max_pos": [
+      20
+    ],
+    "av_cross_ada_norm": true,
+    "use_embeddings_connector": true,
+    "connector_attention_head_dim": 128,
+    "connector_num_attention_heads": 32,
+    "connector_num_layers": 8,
+    "connector_positional_embedding_max_pos": [
+      4096
+    ],
+    "connector_num_learnable_registers": 128,
+    "connector_norm_output": true,
+    "use_middle_indices_grid": true,
+    "apply_gated_attention": true,
+    "connector_apply_gated_attention": true,
+    "caption_projection_first_linear": false,
+    "caption_projection_second_linear": false,
+    "caption_proj_input_norm": false,
+    "connector_learnable_registers_std": 1,
+    "caption_proj_before_connector": true,
+    "audio_connector_attention_head_dim": 64,
+    "audio_connector_num_attention_heads": 32,
+    "cross_attention_adaln": true,
+    "rope_type": "split",
+    "frequencies_precision": "float64",
+    "text_encoder_norm_type": "PER_TOKEN_RMS"
+  },
+  "vae": {
+    "_class_name": "CausalVideoAutoencoder",
+    "dims": 3,
+    "in_channels": 3,
+    "out_channels": 3,
+    "latent_channels": 128,
+    "encoder_blocks": [
+      [
+        "res_x",
+        {
+          "num_layers": 4
+        }
+      ],
+      [
+        "compress_space_res",
+        {
+          "multiplier": 2
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 6
+        }
+      ],
+      [
+        "compress_time_res",
+        {
+          "multiplier": 2
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 4
+        }
+      ],
+      [
+        "compress_all_res",
+        {
+          "multiplier": 2
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 2
+        }
+      ],
+      [
+        "compress_all_res",
+        {
+          "multiplier": 1
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 2
+        }
+      ]
+    ],
+    "decoder_blocks": [
+      [
+        "res_x",
+        {
+          "num_layers": 4
+        }
+      ],
+      [
+        "compress_space",
+        {
+          "multiplier": 2
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 6
+        }
+      ],
+      [
+        "compress_time",
+        {
+          "multiplier": 2
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 4
+        }
+      ],
+      [
+        "compress_all",
+        {
+          "multiplier": 1
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 2
+        }
+      ],
+      [
+        "compress_all",
+        {
+          "multiplier": 2
+        }
+      ],
+      [
+        "res_x",
+        {
+          "num_layers": 2
+        }
+      ]
+    ],
+    "scaling_factor": 1.0,
+    "norm_layer": "pixel_norm",
+    "patch_size": 4,
+    "latent_log_var": "uniform",
+    "use_quant_conv": false,
+    "causal_decoder": false,
+    "timestep_conditioning": false,
+    "normalize_latent_channels": false,
+    "encoder_base_channels": 128,
+    "decoder_base_channels": 128,
+    "spatial_padding_mode": "zeros"
+  },
+  "scheduler": {
+    "_class_name": "RectifiedFlowScheduler",
+    "_diffusers_version": "0.25.1",
+    "num_train_timesteps": 1000,
+    "shifting": null,
+    "base_resolution": null,
+    "sampler": "LinearQuadratic"
+  },
+  "audio_vae": {
+    "model": {
+      "params": {
+        "ddconfig": {
+          "double_z": true,
+          "mel_bins": 64,
+          "z_channels": 8,
+          "resolution": 256,
+          "downsample_time": false,
+          "in_channels": 2,
+          "out_ch": 2,
+          "ch": 128,
+          "ch_mult": [
+            1,
+            2,
+            4
+          ],
+          "num_res_blocks": 2,
+          "attn_resolutions": [],
+          "dropout": 0.0,
+          "mid_block_add_attention": false,
+          "norm_type": "pixel",
+          "causality_axis": "height"
+        },
+        "sampling_rate": 16000
+      }
+    },
+    "preprocessing": {
+      "audio": {
+        "sampling_rate": 16000,
+        "max_wav_value": 32768.0,
+        "duration": 5.12,
+        "stereo": true,
+        "causal_padding": 3
+      },
+      "stft": {
+        "filter_length": 1024,
+        "hop_length": 160,
+        "win_length": 1024,
+        "causal": true
+      },
+      "mel": {
+        "n_mel_channels": 64,
+        "mel_fmin": 0,
+        "mel_fmax": 8000
+      }
+    }
+  },
+  "vocoder": {
+    "vocoder": {
+      "upsample_initial_channel": 1536,
+      "resblock": "AMP1",
+      "upsample_rates": [
+        5,
+        2,
+        2,
+        2,
+        2,
+        2
+      ],
+      "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+      ],
+      "upsample_kernel_sizes": [
+        11,
+        4,
+        4,
+        4,
+        4,
+        4
+      ],
+      "resblock_dilation_sizes": [
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ]
+      ],
+      "stereo": true,
+      "use_tanh_at_final": false,
+      "activation": "snakebeta",
+      "use_bias_at_final": false
+    },
+    "bwe": {
+      "upsample_initial_channel": 512,
+      "resblock": "AMP1",
+      "upsample_rates": [
+        6,
+        5,
+        2,
+        2,
+        2
+      ],
+      "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+      ],
+      "upsample_kernel_sizes": [
+        12,
+        11,
+        4,
+        4,
+        4
+      ],
+      "resblock_dilation_sizes": [
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ]
+      ],
+      "stereo": true,
+      "use_tanh_at_final": false,
+      "activation": "snakebeta",
+      "use_bias_at_final": false,
+      "apply_final_activation": false,
+      "input_sampling_rate": 16000,
+      "output_sampling_rate": 48000,
+      "hop_length": 80,
+      "n_fft": 512,
+      "win_size": 512,
+      "num_mels": 64
+    }
+  }
+}

ltx-2.3-22b-distilled-lora-384.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2943ab994f3c9d88052e5a2a34cca14e4a2dfc36b1d8c407931d52d5c25dd72b
+size 7605507256

spatial_upscaler_x1_5_v1_0.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e186732be14a71b953efed15181cbb94595fa836555a07b3b7560d6cf5deeb15
+size 1090127476

spatial_upscaler_x1_5_v1_0_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "config": {
+    "_class_name": "LatentUpsampler",
+    "in_channels": 128,
+    "mid_channels": 1024,
+    "num_blocks_per_stage": 4,
+    "dims": 3,
+    "spatial_upsample": true,
+    "temporal_upsample": false,
+    "spatial_scale": 1.5,
+    "rational_resampler": true
+  }
+}

spatial_upscaler_x2_v1_1.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:755f69dcb8a3f9589045925eb4a2237e9969f9c026a016393bc395e65ebd5788
+size 995745061

spatial_upscaler_x2_v1_1_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "config": {
+    "_class_name": "LatentUpsampler",
+    "in_channels": 128,
+    "mid_channels": 1024,
+    "num_blocks_per_stage": 4,
+    "dims": 3,
+    "spatial_upsample": true,
+    "temporal_upsample": false,
+    "spatial_scale": 2.0,
+    "rational_resampler": false
+  }
+}

split_model.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "format": "split",
+  "model_version": "2.3.0",
+  "components": [
+    "connector",
+    "vae_decoder",
+    "vae_encoder",
+    "audio_vae",
+    "vocoder",
+    "spatial_upscaler_x2_v1_1",
+    "spatial_upscaler_x1_5_v1_0",
+    "temporal_upscaler_x2_v1_0"
+  ],
+  "transformer_variants": [
+    "distilled",
+    "dev"
+  ],
+  "lora": [
+    "ltx-2.3-22b-distilled-lora-384.safetensors"
+  ],
+  "source": "Lightricks/LTX-2.3",
+  "notes": {
+    "vocoder": "Also contains BWE (bandwidth extension) generator weights \u2014 upsample layers [6,5,2,2,2] (240x) and mel_stft parameters."
+  }
+}

temporal_upscaler_x2_v1_0.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:982bc28b455ed22de1e39a9fcf5213299de4e15f948ae3bd2cee65550e019628
+size 261945581

temporal_upscaler_x2_v1_0_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "config": {
+    "_class_name": "LatentUpsampler",
+    "in_channels": 128,
+    "mid_channels": 512,
+    "num_blocks_per_stage": 4,
+    "dims": 3,
+    "spatial_upsample": false,
+    "temporal_upsample": true,
+    "spatial_scale": 1.0,
+    "rational_resampler": true
+  }
+}

transformer-dev.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd1f253fd2df2a3c8b7bf6103ccf945f4e3a4005d31568a7d4e8a4d49f6f286c
+size 37987706193

transformer-distilled.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd1f253fd2df2a3c8b7bf6103ccf945f4e3a4005d31568a7d4e8a4d49f6f286c
+size 37987706193

vae_decoder.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f404026fe0b59418eaec4a3fcdc474125c798f0b787dc390f6eb4e79934d4160
+size 814349531

vae_encoder.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1512f5ef8387901db0baecb328690676e2667ddf2b63e28f8e6645188c1a8c06
+size 637885319

vocoder.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:410cee86c14f76956bf487d153211ef15050fe8bf51f86d47a1d27ca6e3ba287
+size 258313851