diff --git "a/layer_report.json" "b/layer_report.json" new file mode 100644--- /dev/null +++ "b/layer_report.json" @@ -0,0 +1,53851 @@ +{ + "pipeline": "dev", + "quantize": true, + "quantize_scope": "core", + "dtype": "bfloat16", + "layers": [ + { + "name": "patchify_proj.weight", + "shape": [ + 4096, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 1048576 + }, + { + "name": "patchify_proj.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "adaln_single.emb.timestep_embedder.linear1.weight", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "adaln_single.emb.timestep_embedder.linear1.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "adaln_single.emb.timestep_embedder.linear2.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 33554432 + }, + { + "name": "adaln_single.emb.timestep_embedder.linear2.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "adaln_single.linear.weight", + "shape": [ + 24576, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 201326592 + }, + { + "name": "adaln_single.linear.bias", + "shape": [ + 24576 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "caption_projection.linear1.weight", + "shape": [ + 4096, + 3840 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 31457280 + }, + { + "name": "caption_projection.linear1.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "caption_projection.linear2.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 33554432 + }, + { + "name": "caption_projection.linear2.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "scale_shift_table", + "shape": [ + 2, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "proj_out.weight", + "shape": [ + 128, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 1048576 + }, + { + "name": "proj_out.bias", + "shape": [ + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 256 + }, + { + "name": "audio_patchify_proj.weight", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "audio_patchify_proj.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "audio_adaln_single.emb.timestep_embedder.linear1.weight", + "shape": [ + 2048, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 1048576 + }, + { + "name": "audio_adaln_single.emb.timestep_embedder.linear1.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "audio_adaln_single.emb.timestep_embedder.linear2.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8388608 + }, + { + "name": "audio_adaln_single.emb.timestep_embedder.linear2.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "audio_adaln_single.linear.weight", + "shape": [ + 12288, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 50331648 + }, + { + "name": "audio_adaln_single.linear.bias", + "shape": [ + 12288 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "audio_caption_projection.linear1.weight", + "shape": [ + 2048, + 3840 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 15728640 + }, + { + "name": "audio_caption_projection.linear1.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "audio_caption_projection.linear2.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8388608 + }, + { + "name": "audio_caption_projection.linear2.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "audio_scale_shift_table", + "shape": [ + 2, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "audio_proj_out.weight", + "shape": [ + 128, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "audio_proj_out.bias", + "shape": [ + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 256 + }, + { + "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear1.weight", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear1.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear2.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 33554432 + }, + { + "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear2.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "av_ca_video_scale_shift_adaln_single.linear.weight", + "shape": [ + 16384, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 134217728 + }, + { + "name": "av_ca_video_scale_shift_adaln_single.linear.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear1.weight", + "shape": [ + 2048, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 1048576 + }, + { + "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear1.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear2.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8388608 + }, + { + "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear2.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "av_ca_audio_scale_shift_adaln_single.linear.weight", + "shape": [ + 8192, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 33554432 + }, + { + "name": "av_ca_audio_scale_shift_adaln_single.linear.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear1.weight", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear1.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear2.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 33554432 + }, + { + "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear2.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "av_ca_a2v_gate_adaln_single.linear.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 33554432 + }, + { + "name": "av_ca_a2v_gate_adaln_single.linear.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear1.weight", + "shape": [ + 2048, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 1048576 + }, + { + "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear1.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear2.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8388608 + }, + { + "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear2.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "av_ca_v2a_gate_adaln_single.linear.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8388608 + }, + { + "name": "av_ca_v2a_gate_adaln_single.linear.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.0.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.0.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.0.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.0.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.0.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.0.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.0.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.0.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.0.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.0.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.0.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.1.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.1.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.1.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.1.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.1.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.1.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.1.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.1.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.1.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.1.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.1.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.1.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.2.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.2.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.2.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.2.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.2.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.2.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.2.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.2.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.2.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.2.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.2.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.2.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.3.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.3.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.3.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.3.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.3.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.3.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.3.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.3.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.3.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.3.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.3.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.3.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.4.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.4.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.4.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.4.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.4.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.4.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.4.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.4.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.4.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.4.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.4.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.4.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.5.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.5.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.5.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.5.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.5.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.5.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.5.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.5.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.5.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.5.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.5.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.5.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.6.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.6.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.6.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.6.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.6.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.6.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.6.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.6.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.6.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.6.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.6.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.6.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.7.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.7.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.7.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.7.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.7.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.7.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.7.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.7.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.7.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.7.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.7.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.7.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.8.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.8.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.8.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.8.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.8.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.8.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.8.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.8.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.8.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.8.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.8.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.8.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.9.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.9.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.9.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.9.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.9.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.9.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.9.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.9.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.9.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.9.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.9.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.9.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.10.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.10.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.10.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.10.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.10.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.10.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.10.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.10.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.10.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.10.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.10.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.10.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.11.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.11.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.11.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.11.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.11.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.11.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.11.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.11.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.11.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.11.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.11.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.11.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.12.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.12.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.12.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.12.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.12.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.12.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.12.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.12.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.12.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.12.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.12.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.12.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.13.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.13.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.13.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.13.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.13.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.13.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.13.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.13.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.13.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.13.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.13.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.13.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.14.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.14.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.14.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.14.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.14.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.14.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.14.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.14.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.14.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.14.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.14.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.14.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.15.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.15.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.15.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.15.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.15.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.15.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.15.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.15.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.15.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.15.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.15.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.15.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.16.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.16.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.16.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.16.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.16.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.16.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.16.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.16.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.16.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.16.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.16.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.16.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.17.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.17.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.17.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.17.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.17.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.17.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.17.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.17.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.17.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.17.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.17.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.17.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.18.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.18.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.18.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.18.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.18.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.18.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.18.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.18.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.18.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.18.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.18.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.18.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.19.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.19.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.19.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.19.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.19.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.19.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.19.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.19.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.19.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.19.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.19.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.19.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.20.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.20.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.20.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.20.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.20.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.20.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.20.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.20.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.20.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.20.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.20.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.20.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.21.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.21.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.21.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.21.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.21.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.21.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.21.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.21.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.21.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.21.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.21.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.21.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.22.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.22.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.22.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.22.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.22.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.22.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.22.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.22.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.22.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.22.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.22.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.22.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.23.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.23.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.23.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.23.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.23.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.23.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.23.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.23.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.23.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.23.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.23.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.23.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.24.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.24.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.24.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.24.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.24.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.24.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.24.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.24.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.24.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.24.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.24.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.24.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.25.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.25.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.25.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.25.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.25.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.25.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.25.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.25.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.25.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.25.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.25.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.25.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.26.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.26.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.26.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.26.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.26.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.26.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.26.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.26.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.26.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.26.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.26.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.26.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.27.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.27.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.27.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.27.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.27.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.27.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.27.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.27.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.27.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.27.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.27.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.27.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.28.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.28.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.28.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.28.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.28.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.28.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.28.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.28.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.28.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.28.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.28.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.28.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.29.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.29.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.29.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.29.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.29.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.29.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.29.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.29.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.29.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.29.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.29.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.29.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.30.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.30.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.30.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.30.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.30.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.30.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.30.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.30.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.30.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.30.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.30.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.30.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.31.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.31.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.31.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.31.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.31.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.31.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.31.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.31.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.31.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.31.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.31.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.31.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.32.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.32.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.32.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.32.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.32.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.32.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.32.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.32.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.32.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.32.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.32.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.32.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.33.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.33.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.33.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.33.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.33.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.33.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.33.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.33.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.33.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.33.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.33.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.33.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.34.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.34.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.34.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.34.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.34.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.34.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.34.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.34.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.34.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.34.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.34.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.34.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.35.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.35.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.35.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.35.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.35.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.35.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.35.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.35.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.35.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.35.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.35.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.35.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.36.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.36.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.36.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.36.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.36.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.36.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.36.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.36.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.36.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.36.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.36.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.36.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.37.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.37.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.37.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.37.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.37.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.37.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.37.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.37.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.37.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.37.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.37.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.37.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.38.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.38.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.38.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.38.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.38.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.38.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.38.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.38.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.38.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.38.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.38.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.38.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.39.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.39.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.39.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.39.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.39.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.39.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.39.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.39.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.39.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.39.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.39.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.39.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.40.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.40.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.40.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.40.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.40.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.40.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.40.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.40.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.40.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.40.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.40.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.40.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.41.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.41.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.41.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.41.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.41.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.41.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.41.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.41.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.41.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.41.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.41.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.41.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.42.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.42.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.42.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.42.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.42.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.42.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.42.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.42.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.42.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.42.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.42.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.42.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.43.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.43.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.43.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.43.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.43.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.43.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.43.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.43.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.43.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.43.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.43.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.43.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.44.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.44.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.44.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.44.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.44.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.44.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.44.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.44.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.44.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.44.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.44.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.44.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.45.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.45.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.45.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.45.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.45.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.45.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.45.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.45.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.45.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.45.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.45.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.45.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.46.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.46.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.46.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.46.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.46.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.46.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.46.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.46.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.46.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.46.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.46.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.46.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + }, + { + "name": "transformer_blocks.47.attn1.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn1.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn1.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn1.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn1.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn1.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn1.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn1.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn1.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn1.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn1.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn2.to_q.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn2.to_q.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_q.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_q.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn2.to_k.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn2.to_k.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_k.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_k.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn2.to_v.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn2.to_v.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_v.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_v.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn2.q_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn2.k_norm.weight", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.attn2.to_out.weight", + "shape": [ + 4096, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.attn2.to_out.scales", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_out.biases", + "shape": [ + 4096, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.attn2.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.ff.proj_in.weight", + "shape": [ + 16384, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.47.ff.proj_in.scales", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.47.ff.proj_in.biases", + "shape": [ + 16384, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.47.ff.proj_in.bias", + "shape": [ + 16384 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 32768 + }, + { + "name": "transformer_blocks.47.ff.proj_out.weight", + "shape": [ + 4096, + 4096 + ], + "dtype": "mlx.core.uint32", + "nbytes": 67108864 + }, + { + "name": "transformer_blocks.47.ff.proj_out.scales", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.47.ff.proj_out.biases", + "shape": [ + 4096, + 256 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 2097152 + }, + { + "name": "transformer_blocks.47.ff.proj_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.scale_shift_table", + "shape": [ + 6, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 49152 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn1.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn1.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn1.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn2.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn2.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_attn2.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_in.weight", + "shape": [ + 8192, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_in.scales", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_in.biases", + "shape": [ + 8192, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_in.bias", + "shape": [ + 8192 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 16384 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_out.weight", + "shape": [ + 2048, + 2048 + ], + "dtype": "mlx.core.uint32", + "nbytes": 16777216 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_out.scales", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_out.biases", + "shape": [ + 2048, + 128 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 524288 + }, + { + "name": "transformer_blocks.47.audio_ff.proj_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_scale_shift_table", + "shape": [ + 6, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 24576 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_q.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_q.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_q.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_k.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_k.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_k.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_v.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_v.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_v.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_out.weight", + "shape": [ + 4096, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_out.scales", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_out.biases", + "shape": [ + 4096, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.audio_to_video_attn.to_out.bias", + "shape": [ + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 8192 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_q.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_q.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_q.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_q.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_k.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_k.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_k.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_k.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_v.weight", + "shape": [ + 2048, + 1024 + ], + "dtype": "mlx.core.uint32", + "nbytes": 8388608 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_v.scales", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_v.biases", + "shape": [ + 2048, + 64 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 262144 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_v.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.q_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.k_norm.weight", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_out.weight", + "shape": [ + 2048, + 512 + ], + "dtype": "mlx.core.uint32", + "nbytes": 4194304 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_out.scales", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_out.biases", + "shape": [ + 2048, + 32 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 131072 + }, + { + "name": "transformer_blocks.47.video_to_audio_attn.to_out.bias", + "shape": [ + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 4096 + }, + { + "name": "transformer_blocks.47.scale_shift_table_a2v_ca_audio", + "shape": [ + 5, + 2048 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 20480 + }, + { + "name": "transformer_blocks.47.scale_shift_table_a2v_ca_video", + "shape": [ + 5, + 4096 + ], + "dtype": "mlx.core.bfloat16", + "nbytes": 40960 + } + ], + "totals": { + "bytes": 20387930624, + "by_dtype": { + "mlx.core.bfloat16": 1865884160, + "mlx.core.uint32": 18522046464 + } + } +} \ No newline at end of file