diff --git "a/f16_c16_patch1_xl/logEma.txt" "b/f16_c16_patch1_xl/logEma.txt" new file mode 100644--- /dev/null +++ "b/f16_c16_patch1_xl/logEma.txt" @@ -0,0 +1,4164 @@ +Using devices [TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)] +Device count 4 +Global device count 4 +Global Batch: 512 +Node Batch: 512 +Device Batch: 128 +/tmp/tmpqm3credt +Loading dataset +Loading dataset +creating model +beta1: 0.9 +beta2: 0.999 +bootstrap_cfg: 1 +bootstrap_dt_bias: 0 +bootstrap_ema: 1 +bootstrap_every: 8 +cfg_scale: 1.5 +class_dropout_prob: 0.1 +denoise_timesteps: 128 +depth: 28 +dropout: 0.0 +dt_sampling: uniform +hidden_size: 1152 +lr: 0.0001 +mlp_ratio: 4 +num_classes: 1000 +num_heads: 16 +patch_size: 1 +sharding: dp +t_sampling: discrete-dt +target_update_rate: 0.999 +train_type: naive +use_cosine: 0 +use_ema: 0 +use_stable_vae: 1 +warmup: 0 +weight_decay: 0.1 + +Total devices TPU_0(process=0,(0,0,0,0)) +Initializing encoder. +Incoming encoder shape (1, 256, 256, 3) +Encoder layer (1, 256, 256, 128) +doing downsample +Encoder layer (1, 128, 128, 128) +doing downsample +Encoder layer (1, 64, 64, 128) +doing downsample +Encoder layer (1, 32, 32, 256) +doing downsample +Encoder layer (1, 16, 16, 256) +Encoder layer (1, 16, 16, 512) +Encoder layer final (1, 16, 16, 512) +Encoder layer final (1, 16, 16, 512) +Final embeddings are size (1, 16, 16, 16) +After quant (1, 16, 16, 16) +encode finished +Decoder incoming shape (1, 16, 16, 16) +Decoder input (1, 16, 16, 512) +Mid Block Decoder layer (1, 16, 16, 512) +Mid Block Decoder layer (1, 16, 16, 512) +Decoder layer (1, 32, 32, 512) +Decoder layer (1, 64, 64, 256) +Decoder layer (1, 128, 128, 256) +Decoder layer (1, 256, 256, 128) +Decoder layer (1, 256, 256, 128) +Total num of VQVAE parameters: 53024403 +Disc shape (1, 128, 128, 128) +Disc shape (1, 64, 64, 256) +Disc shape (1, 32, 32, 512) +Disc shape (1, 16, 16, 512) +Disc shape (1, 8, 8, 512) +Disc shape (1, 4, 4, 512) +Total num of Discriminator parameters: 23998017 +Loaded checkpoint from 3679702 seconds ago. +Loaded model with step 989001 +┌──────────────────────────────────────────────────────────────────────────────┐ +│ TPU 0 │ +├──────────────────────────────────────────────────────────────────────────────┤ +│ TPU 1 │ +├──────────────────────────────────────────────────────────────────────────────┤ +│ TPU 2 │ +├──────────────────────────────────────────────────────────────────────────────┤ +│ TPU 3 │ +└──────────────────────────────────────────────────────────────────────────────┘ +returning model +model done +Input to vae (4, 1, 256, 256, 3) +encode image shape (1, 256, 256, 3) +Initializing encoder. +Incoming encoder shape (1, 256, 256, 3) +Encoder layer (1, 256, 256, 128) +doing downsample +Encoder layer (1, 128, 128, 128) +doing downsample +Encoder layer (1, 64, 64, 128) +doing downsample +Encoder layer (1, 32, 32, 256) +doing downsample +Encoder layer (1, 16, 16, 256) +Encoder layer (1, 16, 16, 512) +Encoder layer final (1, 16, 16, 512) +Encoder layer final (1, 16, 16, 512) +Final embeddings are size (1, 16, 16, 16) +After quant (1, 16, 16, 16) +output example shape (4, 1, 16, 16, 16) +Test data shape (4, 256, 256, 3) +x shape (4, 1, 256, 256, 3) +encoded shape (4, 1, 16, 16, 16) +z_vectors shape (1, 16, 16, 16) +Decoder incoming shape (1, 16, 16, 16) +Decoder input (1, 16, 16, 512) +Mid Block Decoder layer (1, 16, 16, 512) +Mid Block Decoder layer (1, 16, 16, 512) +Decoder layer (1, 32, 32, 512) +Decoder layer (1, 64, 64, 256) +Decoder layer (1, 128, 128, 256) +Decoder layer (1, 256, 256, 128) +Decoder layer (1, 256, 256, 128) +image shape (4, 1, 256, 256, 3) +decoded img shape (256, 256, 3) +obs shape (4, 16, 16, 16) +DiT: Input of shape (4, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (4, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (4, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (1, 1152) dtype float32 + + DiT Summary  +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ path  ┃ module  ┃ inputs  ┃ outputs  ┃ params  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ │ DiT │ - float32[4,16,16,16] │ bfloat16[4,16,16,16] │ │ +│ │ │ - float32[1] │ │ │ +│ │ │ - float32[1] │ │ │ +│ │ │ - int32[1] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ PatchEmbed_0 │ PatchEmbed │ float32[4,16,16,16] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ PatchEmbed_0/Conv_0 │ Conv │ float32[4,16,16,16] │ bfloat16[4,16,16,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1,1,16,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 19,584 (78.3 KB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ TimestepEmbedder_0 │ TimestepEmbedder │ float32[1] │ float32[1,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ TimestepEmbedder_0/Dense_0 │ Dense │ bfloat16[1,256] │ bfloat16[1,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[256,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 296,064 (1.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ TimestepEmbedder_0/Dense_1 │ Dense │ bfloat16[1,1152] │ float32[1,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ TimestepEmbedder_1 │ TimestepEmbedder │ float32[1] │ float32[1,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ TimestepEmbedder_1/Dense_0 │ Dense │ bfloat16[1,256] │ bfloat16[1,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[256,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 296,064 (1.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ TimestepEmbedder_1/Dense_1 │ Dense │ bfloat16[1,1152] │ float32[1,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ LabelEmbedder_0 │ LabelEmbedder │ int32[1] │ bfloat16[1,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ LabelEmbedder_0/Embed_0 │ Embed │ int32[1] │ bfloat16[1,1152] │ embedding: float32[1001,1152] │ +│ │ │ │ │ ��� +│ │ │ │ │ 1,153,152 (4.6 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_0/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ ��� │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_1/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_2/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼─────────────────────���──┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_3/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_4/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────��───────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_5/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼─���─────────────────────────────┤ +│ DiTBlock_5/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_6/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼─────────────────────��──┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_7/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_8/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_9/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ �� │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_10/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├────────────────���─────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_11/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼───��────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_12/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼───────────���──────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_13/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼────────��──────────────────────┤ +│ DiTBlock_14/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_14/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────���───────────────┤ +│ DiTBlock_15/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├────────────────��─────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_15/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_16/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼───────────��──────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_17/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────���────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_18/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────��───────────────┤ +│ DiTBlock_19/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_19/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────���───────────────────┼───────────────────────────────┤ +│ DiTBlock_20/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_20/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_21/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/Dense_0 │ Dense ��� float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] �� +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_22/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼─────────���─────────────────────┤ +│ DiTBlock_23 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├─────────────────���────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_23/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────��───────────────────┼───────────────────────────────┤ +│ DiTBlock_24/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_24/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼───────────���────────────┼───────────────────────────────┤ +│ DiTBlock_25/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_25/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/Dense_1 │ Dense �� bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_26/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27 │ DiTBlock │ - bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,6912] │ bias: float32[6912] │ +│ │ │ │ │ kernel: float32[1152,6912] │ +│ │ │ │ │ │ +│ │ │ │ │ 7,969,536 (31.9 MB) │ +├──────────────────────────────────┼──────────────────┼──────��─────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/Dense_2 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/Dense_3 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/Dense_4 │ Dense │ float32[4,256,1152] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[1152,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,328,256 (5.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/LayerNorm_1 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/MlpBlock_0 │ MlpBlock │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/MlpBlock_0/Dense_0 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[1152,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,313,024 (21.3 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/MlpBlock_0/Dropout_0 │ Dropout │ bfloat16[4,256,4608] │ bfloat16[4,256,4608] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/MlpBlock_0/Dense_1 │ Dense │ bfloat16[4,256,4608] │ bfloat16[4,256,1152] │ bias: float32[1152] │ +│ │ │ │ │ kernel: float32[4608,1152] │ +│ │ │ │ │ │ +│ │ │ │ │ 5,309,568 (21.2 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ DiTBlock_27/MlpBlock_0/Dropout_1 │ Dropout │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ FinalLayer_0 │ FinalLayer │ - bfloat16[4,256,1152] │ bfloat16[4,256,16] │ │ +│ │ │ - float32[1,1152] │ │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ FinalLayer_0/Dense_0 │ Dense │ float32[1,1152] │ bfloat16[1,2304] │ bias: float32[2304] │ +│ │ │ │ │ kernel: float32[1152,2304] │ +│ │ │ │ │ │ +│ │ │ │ │ 2,656,512 (10.6 MB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ FinalLayer_0/LayerNorm_0 │ LayerNorm │ bfloat16[4,256,1152] │ bfloat16[4,256,1152] │ │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ FinalLayer_0/Dense_1 │ Dense │ bfloat16[4,256,1152] │ bfloat16[4,256,16] │ bias: float32[16] │ +│ │ │ │ │ kernel: float32[1152,16] │ +│ │ │ │ │ │ +│ │ │ │ │ 18,448 (73.8 KB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│ Embed_0 │ Embed │ int32[1] │ float32[1,1] │ embedding: float32[256,1] │ +│ │ │ │ │ │ +│ │ │ │ │ 256 (1.0 KB) │ +├──────────────────────────────────┼──────────────────┼────────────────────────┼────────────────────────┼───────────────────────────────┤ +│   │   │   │  Total │ 676,440,848 (2.7 GB)  │ +└──────────────────────────────────┴──────────────────┴────────────────────────┴────────────────────────┴───────────────────────────────┘ +  + Total Parameters: 676,440,848 (2.7 GB)  + + +DiT: Input of shape (4, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (4, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (4, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (1, 1152) dtype float32 +dict_keys(['PatchEmbed_0', 'TimestepEmbedder_0', 'TimestepEmbedder_1', 'LabelEmbedder_0', 'DiTBlock_0', 'DiTBlock_1', 'DiTBlock_2', 'DiTBlock_3', 'DiTBlock_4', 'DiTBlock_5', 'DiTBlock_6', 'DiTBlock_7', 'DiTBlock_8', 'DiTBlock_9', 'DiTBlock_10', 'DiTBlock_11', 'DiTBlock_12', 'DiTBlock_13', 'DiTBlock_14', 'DiTBlock_15', 'DiTBlock_16', 'DiTBlock_17', 'DiTBlock_18', 'DiTBlock_19', 'DiTBlock_20', 'DiTBlock_21', 'DiTBlock_22', 'DiTBlock_23', 'DiTBlock_24', 'DiTBlock_25', 'DiTBlock_26', 'DiTBlock_27', 'FinalLayer_0', 'Embed_0']) +Loaded checkpoint from 353140 seconds ago. + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_16', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_16', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_16', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_17', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_17', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_17', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_18', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_18', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_18', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_19', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_19', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_19', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_20', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_20', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_20', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_21', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_21', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_21', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_22', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_22', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_22', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_23', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_23', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_23', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_24', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_24', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_24', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_25', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_25', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_25', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_26', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_26', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_26', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_27', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_27', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_27', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 2304) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 1152, 2304) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 1152, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 1152) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 1152) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 1, 1, 16, 1152) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 1152, 1152) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_16', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_16', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_16', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_17', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_17', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_17', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_18', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_18', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_18', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_19', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_19', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_19', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_20', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_20', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_20', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_21', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_21', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_21', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_22', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_22', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_22', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_23', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_23', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_23', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_24', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_24', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_24', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_25', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_25', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_25', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_26', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_26', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_26', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_27', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_27', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_27', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 2304) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 1152, 2304) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 1152, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 1152) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 1152) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 1, 1, 16, 1152) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +returning this +sharding NamedSharding(mesh=Mesh('devices': 4, axis_types=(Auto,)), spec=PartitionSpec(), memory_kind=device) +Loaded checkpoint from 353144 seconds ago. + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_16', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_16', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_16', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_17', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_17', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_17', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_18', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_18', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_18', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_19', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_19', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_19', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_20', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_20', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_20', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_21', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_21', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_21', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_22', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_22', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_22', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_23', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_23', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_23', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_24', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_24', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_24', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_25', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_25', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_25', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_26', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_26', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_26', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_27', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_27', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_27', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 2304) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 1152, 2304) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 1152, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 1152) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 1152) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 1, 1, 16, 1152) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +DiT: Input of shape (4, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (4, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (4, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (1, 1152) dtype float32 +dict_keys(['PatchEmbed_0', 'TimestepEmbedder_0', 'TimestepEmbedder_1', 'LabelEmbedder_0', 'DiTBlock_0', 'DiTBlock_1', 'DiTBlock_2', 'DiTBlock_3', 'DiTBlock_4', 'DiTBlock_5', 'DiTBlock_6', 'DiTBlock_7', 'DiTBlock_8', 'DiTBlock_9', 'DiTBlock_10', 'DiTBlock_11', 'DiTBlock_12', 'DiTBlock_13', 'DiTBlock_14', 'DiTBlock_15', 'DiTBlock_16', 'DiTBlock_17', 'DiTBlock_18', 'DiTBlock_19', 'DiTBlock_20', 'DiTBlock_21', 'DiTBlock_22', 'DiTBlock_23', 'DiTBlock_24', 'DiTBlock_25', 'DiTBlock_26', 'DiTBlock_27', 'FinalLayer_0', 'Embed_0']) +Loaded checkpoint from 353169 seconds ago. + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_16', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_16', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_16', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_17', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_17', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_17', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_18', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_18', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_18', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_19', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_19', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_19', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_20', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_20', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_20', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_21', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_21', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_21', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_22', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_22', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_22', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_23', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_23', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_23', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_24', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_24', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_24', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_25', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_25', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_25', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_26', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_26', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_26', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_27', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_27', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_27', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 2304) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 1152, 2304) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 1152, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 1152) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 1152) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 1, 1, 16, 1152) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 1152, 1152) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_0', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_1', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_10', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_11', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_12', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_13', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_14', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_15', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_16', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_16', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_16', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_16', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_16', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_17', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_17', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_17', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_17', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_17', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_18', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_18', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_18', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_18', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_18', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_19', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_19', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_19', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_19', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_19', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_2', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_20', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_20', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_20', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_20', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_20', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_21', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_21', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_21', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_21', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_21', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_22', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_22', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_22', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_22', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_22', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_23', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_23', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_23', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_23', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_23', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_24', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_24', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_24', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_24', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_24', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_25', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_25', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_25', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_25', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_25', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_26', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_26', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_26', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_26', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_26', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_27', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_27', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_27', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_27', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_27', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_3', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_4', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_5', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_6', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_7', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_8', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 6912) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 1152, 6912) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 1152) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 1152, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_0', 'kernel'): (1, 1152, 4608) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'bias'): (1, 1152) +('DiTBlock_9', 'MlpBlock_0', 'Dense_1', 'kernel'): (1, 4608, 1152) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 2304) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 1152, 2304) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 1152, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 1152) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 1152) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 1, 1, 16, 1152) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 1152, 1152) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 1152) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 1152) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 1152, 1152) +returning this +┌────────────────────────────────────────────────┐ +│ │ +│ │ +│ │ +│ │ +│ TPU 0,1,2,3 │ +│ │ +│ │ +│ │ +│ │ +└────────────────────────────────────────────────┘ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ │ +│ │ +│ │ +│ │ +│ TPU 0,1,2,3 │ +│ │ +│ │ +│ │ +│ │ +└──────────────────────────────────────────────────────────────────────────────┘ +doing the else +(512, 256, 256, 3) +encode image shape (128, 256, 256, 3) +Initializing encoder. +Incoming encoder shape (128, 256, 256, 3) +Encoder layer (128, 256, 256, 128) +doing downsample +Encoder layer (128, 128, 128, 128) +doing downsample +Encoder layer (128, 64, 64, 128) +doing downsample +Encoder layer (128, 32, 32, 256) +doing downsample +Encoder layer (128, 16, 16, 256) +Encoder layer (128, 16, 16, 512) +Encoder layer final (128, 16, 16, 512) +Encoder layer final (128, 16, 16, 512) +Final embeddings are size (128, 16, 16, 16) +After quant (128, 16, 16, 16) +Calc FID for CFG 1.0 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +z_vectors shape (128, 16, 16, 16) +Decoder incoming shape (128, 16, 16, 16) +Decoder input (128, 16, 16, 512) +Mid Block Decoder layer (128, 16, 16, 512) +Mid Block Decoder layer (128, 16, 16, 512) +Decoder layer (128, 32, 32, 512) +Decoder layer (128, 64, 64, 256) +Decoder layer (128, 128, 128, 256) +Decoder layer (128, 256, 256, 128) +Decoder layer (128, 256, 256, 128) +FID is 12.178564071655273 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 12.413867950439453 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 13.017363548278809 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.35436725616455 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 17.493553161621094 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 26.160324096679688 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 79.53157806396484 +(512, 256, 256, 3) +Calc FID for CFG 1.0 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 217.19857788085938 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.484971046447754 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.631433010101318 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.9484429359436035 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 6.733639240264893 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 8.813756942749023 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 15.148361206054688 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 60.11519241333008 +(512, 256, 256, 3) +Calc FID for CFG 1.25 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 209.28390502929688 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 4.336448669433594 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 4.419934272766113 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 4.584867477416992 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.025850296020508 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 6.4045891761779785 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 11.002508163452148 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 50.12214660644531 +(512, 256, 256, 3) +Calc FID for CFG 1.5 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 203.58163452148438 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.656896591186523 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.703670978546143 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 5.783472061157227 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 6.0075483322143555 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 6.839598655700684 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.15201187133789 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 46.70877456665039 +(512, 256, 256, 3) +Calc FID for CFG 1.75 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 199.24395751953125 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 7.768148899078369 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 7.769139289855957 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 7.827305793762207 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 7.977083683013916 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 8.471925735473633 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.749156951904297 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 47.24931335449219 +(512, 256, 256, 3) +Calc FID for CFG 2.0 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 196.2218017578125 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.079756736755371 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.092052459716797 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.127752304077148 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.185007095336914 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 10.49016284942627 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 11.903386116027832 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 50.11427307128906 +(512, 256, 256, 3) +Calc FID for CFG 2.25 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 194.159912109375 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 12.28653335571289 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 12.311491012573242 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 12.323273658752441 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 12.371776580810547 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 12.470663070678711 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 13.184517860412598 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 54.432350158691406 +(512, 256, 256, 3) +Calc FID for CFG 2.5 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 192.67124938964844 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.346757888793945 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.351615905761719 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.364439010620117 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.348872184753418 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.292051315307617 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 14.401762008666992 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 60.11811447143555 +(512, 256, 256, 3) +Calc FID for CFG 2.75 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 191.72772216796875 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 128 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 16.10295867919922 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 64 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 16.135299682617188 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 32 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 16.198402404785156 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 16 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 16.09437370300293 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 8 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 15.910114288330078 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 4 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 15.408266067504883 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 2 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 67.09783935546875 +(512, 256, 256, 3) +Calc FID for CFG 3.0 and denoise_timesteps 1 +DiT: Input of shape (512, 16, 16, 16) dtype float32 +DiT: After patch embed, shape is (512, 256, 1152) dtype bfloat16 +DiT: Patch Embed of shape (512, 256, 1152) dtype bfloat16 +DiT: Conditioning of shape (512, 1152) dtype float32 +FID is 191.10202026367188 +wandb: +wandb: 🚀 View run shortcut_imagenet256 at: https://wandb.ai/daniel-z-kaplan/shortcut/runs/shortcut_imagenet256_20251008_211652_345353_10 +wandb: Find logs at: ../../../tmp/tmpqm3credt/wandb/run-20251008_211652-shortcut_imagenet256_20251008_211652_345353_10/logs