{ "pipeline": "dev", "quantize": true, "quantize_scope": "core", "dtype": "bfloat16", "layers": [ { "name": "patchify_proj.weight", "shape": [ 4096, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 1048576 }, { "name": "patchify_proj.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "adaln_single.emb.timestep_embedder.linear1.weight", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "adaln_single.emb.timestep_embedder.linear1.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "adaln_single.emb.timestep_embedder.linear2.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 33554432 }, { "name": "adaln_single.emb.timestep_embedder.linear2.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "adaln_single.linear.weight", "shape": [ 24576, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 201326592 }, { "name": "adaln_single.linear.bias", "shape": [ 24576 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "caption_projection.linear1.weight", "shape": [ 4096, 3840 ], "dtype": "mlx.core.bfloat16", "nbytes": 31457280 }, { "name": "caption_projection.linear1.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "caption_projection.linear2.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 33554432 }, { "name": "caption_projection.linear2.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "scale_shift_table", "shape": [ 2, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "proj_out.weight", "shape": [ 128, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 1048576 }, { "name": "proj_out.bias", "shape": [ 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 256 }, { "name": "audio_patchify_proj.weight", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "audio_patchify_proj.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "audio_adaln_single.emb.timestep_embedder.linear1.weight", "shape": [ 2048, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 1048576 }, { "name": "audio_adaln_single.emb.timestep_embedder.linear1.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "audio_adaln_single.emb.timestep_embedder.linear2.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 8388608 }, { "name": "audio_adaln_single.emb.timestep_embedder.linear2.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "audio_adaln_single.linear.weight", "shape": [ 12288, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 50331648 }, { "name": "audio_adaln_single.linear.bias", "shape": [ 12288 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "audio_caption_projection.linear1.weight", "shape": [ 2048, 3840 ], "dtype": "mlx.core.bfloat16", "nbytes": 15728640 }, { "name": "audio_caption_projection.linear1.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "audio_caption_projection.linear2.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 8388608 }, { "name": "audio_caption_projection.linear2.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "audio_scale_shift_table", "shape": [ 2, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "audio_proj_out.weight", "shape": [ 128, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "audio_proj_out.bias", "shape": [ 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 256 }, { "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear1.weight", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear1.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear2.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 33554432 }, { "name": "av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear2.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "av_ca_video_scale_shift_adaln_single.linear.weight", "shape": [ 16384, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 134217728 }, { "name": "av_ca_video_scale_shift_adaln_single.linear.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear1.weight", "shape": [ 2048, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 1048576 }, { "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear1.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear2.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 8388608 }, { "name": "av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear2.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "av_ca_audio_scale_shift_adaln_single.linear.weight", "shape": [ 8192, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 33554432 }, { "name": "av_ca_audio_scale_shift_adaln_single.linear.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear1.weight", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear1.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear2.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 33554432 }, { "name": "av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear2.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "av_ca_a2v_gate_adaln_single.linear.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 33554432 }, { "name": "av_ca_a2v_gate_adaln_single.linear.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear1.weight", "shape": [ 2048, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 1048576 }, { "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear1.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear2.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 8388608 }, { "name": "av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear2.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "av_ca_v2a_gate_adaln_single.linear.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 8388608 }, { "name": "av_ca_v2a_gate_adaln_single.linear.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.0.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.0.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.0.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.0.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.0.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.0.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.0.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.0.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.0.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.0.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.0.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.0.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.0.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.0.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.1.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.1.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.1.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.1.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.1.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.1.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.1.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.1.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.1.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.1.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.1.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.1.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.1.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.1.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.1.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.2.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.2.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.2.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.2.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.2.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.2.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.2.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.2.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.2.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.2.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.2.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.2.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.2.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.2.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.2.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.3.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.3.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.3.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.3.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.3.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.3.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.3.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.3.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.3.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.3.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.3.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.3.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.3.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.3.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.3.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.4.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.4.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.4.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.4.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.4.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.4.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.4.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.4.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.4.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.4.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.4.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.4.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.4.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.4.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.4.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.5.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.5.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.5.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.5.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.5.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.5.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.5.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.5.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.5.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.5.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.5.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.5.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.5.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.5.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.5.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.6.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.6.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.6.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.6.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.6.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.6.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.6.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.6.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.6.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.6.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.6.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.6.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.6.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.6.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.6.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.7.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.7.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.7.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.7.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.7.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.7.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.7.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.7.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.7.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.7.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.7.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.7.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.7.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.7.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.7.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.8.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.8.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.8.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.8.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.8.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.8.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.8.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.8.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.8.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.8.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.8.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.8.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.8.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.8.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.8.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.9.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.9.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.9.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.9.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.9.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.9.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.9.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.9.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.9.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.9.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.9.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.9.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.9.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.9.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.9.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.10.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.10.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.10.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.10.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.10.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.10.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.10.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.10.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.10.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.10.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.10.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.10.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.10.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.10.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.10.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.11.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.11.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.11.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.11.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.11.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.11.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.11.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.11.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.11.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.11.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.11.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.11.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.11.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.11.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.11.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.12.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.12.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.12.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.12.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.12.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.12.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.12.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.12.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.12.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.12.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.12.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.12.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.12.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.12.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.12.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.13.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.13.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.13.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.13.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.13.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.13.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.13.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.13.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.13.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.13.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.13.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.13.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.13.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.13.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.13.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.14.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.14.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.14.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.14.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.14.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.14.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.14.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.14.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.14.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.14.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.14.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.14.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.14.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.14.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.14.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.15.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.15.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.15.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.15.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.15.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.15.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.15.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.15.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.15.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.15.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.15.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.15.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.15.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.15.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.15.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.16.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.16.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.16.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.16.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.16.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.16.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.16.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.16.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.16.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.16.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.16.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.16.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.16.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.16.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.16.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.17.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.17.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.17.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.17.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.17.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.17.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.17.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.17.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.17.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.17.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.17.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.17.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.17.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.17.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.17.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.18.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.18.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.18.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.18.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.18.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.18.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.18.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.18.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.18.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.18.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.18.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.18.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.18.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.18.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.18.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.19.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.19.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.19.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.19.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.19.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.19.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.19.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.19.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.19.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.19.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.19.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.19.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.19.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.19.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.19.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.20.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.20.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.20.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.20.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.20.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.20.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.20.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.20.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.20.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.20.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.20.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.20.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.20.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.20.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.20.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.21.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.21.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.21.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.21.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.21.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.21.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.21.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.21.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.21.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.21.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.21.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.21.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.21.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.21.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.21.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.22.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.22.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.22.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.22.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.22.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.22.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.22.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.22.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.22.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.22.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.22.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.22.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.22.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.22.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.22.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.23.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.23.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.23.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.23.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.23.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.23.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.23.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.23.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.23.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.23.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.23.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.23.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.23.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.23.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.23.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.24.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.24.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.24.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.24.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.24.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.24.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.24.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.24.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.24.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.24.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.24.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.24.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.24.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.24.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.24.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.25.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.25.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.25.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.25.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.25.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.25.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.25.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.25.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.25.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.25.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.25.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.25.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.25.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.25.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.25.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.26.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.26.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.26.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.26.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.26.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.26.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.26.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.26.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.26.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.26.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.26.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.26.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.26.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.26.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.26.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.27.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.27.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.27.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.27.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.27.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.27.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.27.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.27.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.27.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.27.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.27.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.27.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.27.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.27.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.27.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.28.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.28.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.28.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.28.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.28.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.28.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.28.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.28.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.28.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.28.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.28.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.28.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.28.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.28.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.28.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.29.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.29.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.29.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.29.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.29.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.29.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.29.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.29.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.29.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.29.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.29.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.29.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.29.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.29.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.29.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.30.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.30.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.30.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.30.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.30.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.30.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.30.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.30.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.30.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.30.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.30.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.30.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.30.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.30.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.30.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.31.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.31.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.31.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.31.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.31.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.31.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.31.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.31.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.31.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.31.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.31.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.31.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.31.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.31.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.31.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.32.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.32.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.32.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.32.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.32.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.32.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.32.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.32.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.32.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.32.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.32.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.32.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.32.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.32.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.32.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.33.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.33.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.33.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.33.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.33.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.33.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.33.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.33.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.33.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.33.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.33.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.33.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.33.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.33.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.33.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.34.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.34.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.34.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.34.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.34.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.34.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.34.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.34.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.34.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.34.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.34.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.34.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.34.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.34.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.34.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.35.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.35.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.35.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.35.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.35.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.35.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.35.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.35.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.35.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.35.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.35.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.35.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.35.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.35.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.35.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.36.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.36.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.36.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.36.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.36.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.36.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.36.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.36.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.36.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.36.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.36.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.36.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.36.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.36.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.36.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.37.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.37.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.37.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.37.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.37.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.37.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.37.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.37.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.37.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.37.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.37.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.37.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.37.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.37.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.37.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.38.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.38.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.38.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.38.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.38.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.38.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.38.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.38.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.38.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.38.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.38.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.38.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.38.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.38.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.38.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.39.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.39.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.39.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.39.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.39.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.39.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.39.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.39.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.39.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.39.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.39.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.39.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.39.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.39.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.39.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.40.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.40.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.40.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.40.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.40.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.40.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.40.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.40.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.40.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.40.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.40.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.40.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.40.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.40.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.40.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.41.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.41.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.41.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.41.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.41.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.41.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.41.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.41.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.41.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.41.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.41.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.41.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.41.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.41.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.41.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.42.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.42.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.42.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.42.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.42.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.42.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.42.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.42.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.42.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.42.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.42.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.42.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.42.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.42.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.42.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.43.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.43.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.43.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.43.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.43.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.43.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.43.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.43.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.43.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.43.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.43.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.43.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.43.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.43.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.43.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.44.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.44.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.44.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.44.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.44.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.44.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.44.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.44.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.44.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.44.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.44.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.44.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.44.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.44.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.44.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.45.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.45.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.45.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.45.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.45.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.45.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.45.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.45.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.45.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.45.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.45.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.45.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.45.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.45.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.45.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.46.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.46.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.46.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.46.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.46.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.46.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.46.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.46.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.46.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.46.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.46.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.46.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.46.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.46.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.46.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 }, { "name": "transformer_blocks.47.attn1.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn1.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn1.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn1.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn1.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn1.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn1.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn1.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn1.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn1.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn1.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn2.to_q.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn2.to_q.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_q.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_q.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn2.to_k.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn2.to_k.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_k.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_k.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn2.to_v.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn2.to_v.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_v.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_v.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn2.q_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn2.k_norm.weight", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.attn2.to_out.weight", "shape": [ 4096, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.attn2.to_out.scales", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_out.biases", "shape": [ 4096, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.attn2.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.ff.proj_in.weight", "shape": [ 16384, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.47.ff.proj_in.scales", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.47.ff.proj_in.biases", "shape": [ 16384, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.47.ff.proj_in.bias", "shape": [ 16384 ], "dtype": "mlx.core.bfloat16", "nbytes": 32768 }, { "name": "transformer_blocks.47.ff.proj_out.weight", "shape": [ 4096, 4096 ], "dtype": "mlx.core.uint32", "nbytes": 67108864 }, { "name": "transformer_blocks.47.ff.proj_out.scales", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.47.ff.proj_out.biases", "shape": [ 4096, 256 ], "dtype": "mlx.core.bfloat16", "nbytes": 2097152 }, { "name": "transformer_blocks.47.ff.proj_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.scale_shift_table", "shape": [ 6, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 49152 }, { "name": "transformer_blocks.47.audio_attn1.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn1.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn1.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn1.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn1.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn1.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn1.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn1.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn1.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn1.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn1.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn2.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn2.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn2.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn2.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn2.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn2.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn2.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn2.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_attn2.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_attn2.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_attn2.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_ff.proj_in.weight", "shape": [ 8192, 512 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.audio_ff.proj_in.scales", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.audio_ff.proj_in.biases", "shape": [ 8192, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.audio_ff.proj_in.bias", "shape": [ 8192 ], "dtype": "mlx.core.bfloat16", "nbytes": 16384 }, { "name": "transformer_blocks.47.audio_ff.proj_out.weight", "shape": [ 2048, 2048 ], "dtype": "mlx.core.uint32", "nbytes": 16777216 }, { "name": "transformer_blocks.47.audio_ff.proj_out.scales", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.audio_ff.proj_out.biases", "shape": [ 2048, 128 ], "dtype": "mlx.core.bfloat16", "nbytes": 524288 }, { "name": "transformer_blocks.47.audio_ff.proj_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_scale_shift_table", "shape": [ 6, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 24576 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_q.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_q.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_q.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_k.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_k.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_k.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_v.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_v.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_v.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_to_video_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_to_video_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_out.weight", "shape": [ 4096, 512 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_out.scales", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_out.biases", "shape": [ 4096, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.audio_to_video_attn.to_out.bias", "shape": [ 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 8192 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_q.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_q.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_q.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_q.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_k.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_k.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_k.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_k.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_v.weight", "shape": [ 2048, 1024 ], "dtype": "mlx.core.uint32", "nbytes": 8388608 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_v.scales", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_v.biases", "shape": [ 2048, 64 ], "dtype": "mlx.core.bfloat16", "nbytes": 262144 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_v.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.video_to_audio_attn.q_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.video_to_audio_attn.k_norm.weight", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_out.weight", "shape": [ 2048, 512 ], "dtype": "mlx.core.uint32", "nbytes": 4194304 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_out.scales", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_out.biases", "shape": [ 2048, 32 ], "dtype": "mlx.core.bfloat16", "nbytes": 131072 }, { "name": "transformer_blocks.47.video_to_audio_attn.to_out.bias", "shape": [ 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 4096 }, { "name": "transformer_blocks.47.scale_shift_table_a2v_ca_audio", "shape": [ 5, 2048 ], "dtype": "mlx.core.bfloat16", "nbytes": 20480 }, { "name": "transformer_blocks.47.scale_shift_table_a2v_ca_video", "shape": [ 5, 4096 ], "dtype": "mlx.core.bfloat16", "nbytes": 40960 } ], "totals": { "bytes": 20387930624, "by_dtype": { "mlx.core.bfloat16": 1865884160, "mlx.core.uint32": 18522046464 } } }