diff --git "a/decoupled/decoupled.txt" "b/decoupled/decoupled.txt" new file mode 100644--- /dev/null +++ "b/decoupled/decoupled.txt" @@ -0,0 +1,3321 @@ +Using devices [TpuDevice(id=0, process_index=0, coords=(0,0,0), core_on_chip=0), TpuDevice(id=1, process_index=0, coords=(1,0,0), core_on_chip=0), TpuDevice(id=2, process_index=0, coords=(0,1,0), core_on_chip=0), TpuDevice(id=3, process_index=0, coords=(1,1,0), core_on_chip=0)] +Device count 4 +Global device count 4 +Global Batch: 256 +Node Batch: 256 +Device Batch: 64 +Loading dataset +Loading dataset +DiT: Input of shape (1, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (1, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (1, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (1, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (1, 256, 768) +(1, 768) +(1, 768) +(1, 256, 768) +(1, 768) +(1, 256, 768) + + DiT Summary  +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ path  ┃ module  ┃ inputs  ┃ outputs  ┃ params  ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ │ DiT │ - float32[1,32,32,4] │ bfloat16[1,32,32,4] │ │ +│ │ │ - float32[1] │ │ │ +│ │ │ - float32[1] │ │ │ +│ │ │ - int32[1] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ PatchEmbed_0 │ PatchEmbed │ float32[1,32,32,4] │ bfloat16[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ PatchEmbed_0/Conv_0 │ Conv │ float32[1,32,32,4] │ bfloat16[1,16,16,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2,2,4,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 13,056 (52.2 KB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ PatchEmbed_1 │ PatchEmbed │ float32[1,32,32,4] │ bfloat16[1,256,768] │ │ +├──────────────────────────────���──┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ PatchEmbed_1/Conv_0 │ Conv │ float32[1,32,32,4] │ bfloat16[1,16,16,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2,2,4,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 13,056 (52.2 KB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ TimestepEmbedder_0 │ TimestepEmbedder │ float32[1] │ float32[1,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ TimestepEmbedder_0/Dense_0 │ Dense │ bfloat16[1,256] │ bfloat16[1,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[256,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 197,376 (789.5 KB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ TimestepEmbedder_0/Dense_1 │ Dense │ bfloat16[1,768] │ float32[1,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ TimestepEmbedder_1 │ TimestepEmbedder │ float32[1] │ float32[1,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ TimestepEmbedder_1/Dense_0 │ Dense │ bfloat16[1,256] │ bfloat16[1,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[256,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 197,376 (789.5 KB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ TimestepEmbedder_1/Dense_1 │ Dense │ bfloat16[1,768] │ float32[1,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ LabelEmbedder_0 │ LabelEmbedder │ int32[1] │ bfloat16[1,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ LabelEmbedder_0/Embed_0 │ Embed │ int32[1] │ bfloat16[1,768] │ embedding: float32[1001,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 768,768 (3.1 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0 │ DiTBlock │ - bfloat16[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/Dense_0 │ Dense │ float32[1,768] │ bfloat16[1,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼──────────���────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ VisionRotaryEmbeddingFast_0 │ VisionRotaryEmbeddingFast │ bfloat16[1,256,12,64] │ float32[1,256,12,64] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────��──────────────────────────────┤ +│ DiTBlock_0/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_0/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/Dense_0 │ Dense │ float32[1,768] │ bfloat16[1,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_1/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/Dense_0 │ Dense │ float32[1,768] │ bfloat16[1,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_2/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/Dense_0 │ Dense │ float32[1,768] │ bfloat16[1,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_3/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4 │ DiTBlock │ - bfloat16[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼────────────────────���──┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_4/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼────────────────���──────┼──────────────────────────────┤ +│ DiTBlock_5/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼─────────────────────────────��┤ +│ DiTBlock_5/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_5/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼────────────���─────────────────┤ +│ DiTBlock_6/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_6/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +�� DiTBlock_7/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_7/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_8/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_9/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├──���──────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_10/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_11/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├──────────────────────���──────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_12/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_13/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_14/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15 │ DiTBlock │ - float32[1,256,768] │ float32[1,256,768] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,4608] │ bias: float32[4608] │ +│ │ │ │ │ kernel: float32[768,4608] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,543,552 (14.2 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/Dense_2 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/Dense_3 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/Dense_4 │ Dense │ float32[1,256,768] │ bfloat16[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[768,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 590,592 (2.4 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/SwiGLUFFN_0 │ SwiGLUFFN │ bfloat16[1,256,768] │ float32[1,256,768] │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/SwiGLUFFN_0/Dense_0 │ Dense │ bfloat16[1,256,768] │ float32[1,256,4096] │ bias: float32[4096] │ +│ │ │ │ │ kernel: float32[768,4096] │ +│ │ │ │ │ │ +│ │ │ │ │ 3,149,824 (12.6 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ DiTBlock_15/SwiGLUFFN_0/Dense_1 │ Dense │ float32[1,256,2048] │ float32[1,256,768] │ bias: float32[768] │ +│ │ │ │ │ kernel: float32[2048,768] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,573,632 (6.3 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ FinalLayer_0 │ FinalLayer │ - float32[1,256,768] │ bfloat16[1,256,16] │ │ +│ │ │ - float32[1,256,768] │ │ │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ FinalLayer_0/Dense_0 │ Dense │ float32[1,256,768] │ bfloat16[1,256,1536] │ bias: float32[1536] │ +│ │ │ │ │ kernel: float32[768,1536] │ +│ │ │ │ │ │ +│ │ │ │ │ 1,181,184 (4.7 MB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ FinalLayer_0/Dense_1 │ Dense │ bfloat16[1,256,768] │ bfloat16[1,256,16] │ bias: float32[16] │ +│ │ │ │ │ kernel: float32[768,16] │ +│ │ │ │ │ │ +│ │ │ │ │ 12,304 (49.2 KB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│ Embed_0 │ Embed │ int32[1] │ float32[1,1] │ embedding: float32[256,1] │ +│ │ │ │ │ │ +│ │ │ │ │ 256 (1.0 KB) │ +├─────────────────────────────────┼───────────────────────────┼───────────────────────┼───────────────────────┼──────────────────────────────┤ +│   │   │   │  Total │ 173,634,576 (694.5 MB)  │ +└─────────────────────────────────┴───────────────────────────┴───────────────────────┴───────────────────────┴──────────────────────────────┘ +  + Total Parameters: 173,634,576 (694.5 MB)  + + +DiT: Input of shape (1, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (1, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (1, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (1, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (1, 256, 768) +(1, 768) +(1, 768) +(1, 256, 768) +(1, 768) +(1, 256, 768) +Loaded checkpoint from 26446 seconds ago. + + parameter shapes: +('PatchEmbed_0', 'Conv_0', 'kernel'): (2, 2, 4, 768) +('PatchEmbed_0', 'Conv_0', 'bias'): (768,) +('PatchEmbed_1', 'Conv_0', 'kernel'): (2, 2, 4, 768) +('PatchEmbed_1', 'Conv_0', 'bias'): (768,) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (256, 768) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (768,) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (768, 768) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (768,) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (256, 768) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (768,) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (768, 768) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (768,) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1001, 768) +('DiTBlock_0', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_0', 'Dense_0', 'bias'): (4608,) +('DiTBlock_0', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_0', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_2', 'bias'): (768,) +('DiTBlock_0', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_3', 'bias'): (768,) +('DiTBlock_0', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_4', 'bias'): (768,) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_1', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_1', 'Dense_0', 'bias'): (4608,) +('DiTBlock_1', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_1', 'bias'): (768,) +('DiTBlock_1', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_2', 'bias'): (768,) +('DiTBlock_1', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_3', 'bias'): (768,) +('DiTBlock_1', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_4', 'bias'): (768,) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_2', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_2', 'Dense_0', 'bias'): (4608,) +('DiTBlock_2', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_1', 'bias'): (768,) +('DiTBlock_2', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_2', 'bias'): (768,) +('DiTBlock_2', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_3', 'bias'): (768,) +('DiTBlock_2', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_4', 'bias'): (768,) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_3', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_3', 'Dense_0', 'bias'): (4608,) +('DiTBlock_3', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_1', 'bias'): (768,) +('DiTBlock_3', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_2', 'bias'): (768,) +('DiTBlock_3', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_3', 'bias'): (768,) +('DiTBlock_3', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_4', 'bias'): (768,) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_4', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_4', 'Dense_0', 'bias'): (4608,) +('DiTBlock_4', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_1', 'bias'): (768,) +('DiTBlock_4', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_2', 'bias'): (768,) +('DiTBlock_4', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_3', 'bias'): (768,) +('DiTBlock_4', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_4', 'bias'): (768,) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_5', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_5', 'Dense_0', 'bias'): (4608,) +('DiTBlock_5', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_1', 'bias'): (768,) +('DiTBlock_5', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_2', 'bias'): (768,) +('DiTBlock_5', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_3', 'bias'): (768,) +('DiTBlock_5', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_4', 'bias'): (768,) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_6', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_6', 'Dense_0', 'bias'): (4608,) +('DiTBlock_6', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_1', 'bias'): (768,) +('DiTBlock_6', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_2', 'bias'): (768,) +('DiTBlock_6', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_3', 'bias'): (768,) +('DiTBlock_6', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_4', 'bias'): (768,) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_7', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_7', 'Dense_0', 'bias'): (4608,) +('DiTBlock_7', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_1', 'bias'): (768,) +('DiTBlock_7', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_2', 'bias'): (768,) +('DiTBlock_7', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_3', 'bias'): (768,) +('DiTBlock_7', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_4', 'bias'): (768,) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_8', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_8', 'Dense_0', 'bias'): (4608,) +('DiTBlock_8', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_1', 'bias'): (768,) +('DiTBlock_8', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_2', 'bias'): (768,) +('DiTBlock_8', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_3', 'bias'): (768,) +('DiTBlock_8', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_4', 'bias'): (768,) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_9', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_9', 'Dense_0', 'bias'): (4608,) +('DiTBlock_9', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_1', 'bias'): (768,) +('DiTBlock_9', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_2', 'bias'): (768,) +('DiTBlock_9', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_3', 'bias'): (768,) +('DiTBlock_9', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_4', 'bias'): (768,) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_10', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_10', 'Dense_0', 'bias'): (4608,) +('DiTBlock_10', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_1', 'bias'): (768,) +('DiTBlock_10', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_2', 'bias'): (768,) +('DiTBlock_10', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_3', 'bias'): (768,) +('DiTBlock_10', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_4', 'bias'): (768,) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_11', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_11', 'Dense_0', 'bias'): (4608,) +('DiTBlock_11', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_1', 'bias'): (768,) +('DiTBlock_11', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_2', 'bias'): (768,) +('DiTBlock_11', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_3', 'bias'): (768,) +('DiTBlock_11', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_4', 'bias'): (768,) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_12', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_12', 'Dense_0', 'bias'): (4608,) +('DiTBlock_12', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_1', 'bias'): (768,) +('DiTBlock_12', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_2', 'bias'): (768,) +('DiTBlock_12', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_3', 'bias'): (768,) +('DiTBlock_12', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_4', 'bias'): (768,) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_13', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_13', 'Dense_0', 'bias'): (4608,) +('DiTBlock_13', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_1', 'bias'): (768,) +('DiTBlock_13', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_2', 'bias'): (768,) +('DiTBlock_13', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_3', 'bias'): (768,) +('DiTBlock_13', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_4', 'bias'): (768,) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_14', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_14', 'Dense_0', 'bias'): (4608,) +('DiTBlock_14', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_1', 'bias'): (768,) +('DiTBlock_14', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_2', 'bias'): (768,) +('DiTBlock_14', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_3', 'bias'): (768,) +('DiTBlock_14', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_4', 'bias'): (768,) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_15', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_15', 'Dense_0', 'bias'): (4608,) +('DiTBlock_15', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_1', 'bias'): (768,) +('DiTBlock_15', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_2', 'bias'): (768,) +('DiTBlock_15', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_3', 'bias'): (768,) +('DiTBlock_15', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_4', 'bias'): (768,) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('FinalLayer_0', 'Dense_0', 'kernel'): (768, 1536) +('FinalLayer_0', 'Dense_0', 'bias'): (1536,) +('FinalLayer_0', 'Dense_1', 'kernel'): (768, 16) +('FinalLayer_0', 'Dense_1', 'bias'): (16,) +('Embed_0', 'embedding'): (256, 1) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 1536) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 768, 1536) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 768, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 768) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('PatchEmbed_1', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_1', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 768, 768) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 768, 768) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 1536) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 768, 1536) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 768, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 768) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('PatchEmbed_1', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_1', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 768, 768) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 768, 768) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 1536) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 768, 1536) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 768, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 768) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('PatchEmbed_1', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_1', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 768, 768) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 768, 768) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_0', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_0', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_1', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_1', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_1', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_1', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_10', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_10', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_10', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_10', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_11', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_11', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_11', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_11', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_12', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_12', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_12', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_12', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_13', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_13', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_13', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_13', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_14', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_14', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_14', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_14', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_15', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_15', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_15', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_15', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_2', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_2', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_2', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_2', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_3', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_3', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_3', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_3', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_4', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_4', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_4', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_4', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_5', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_5', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_5', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_5', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_6', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_6', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_6', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_6', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_7', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_7', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_7', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_7', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_8', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_8', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_8', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_8', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('DiTBlock_9', 'Dense_0', 'bias'): (1, 4608) +('DiTBlock_9', 'Dense_0', 'kernel'): (1, 768, 4608) +('DiTBlock_9', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_1', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_2', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_2', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_3', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_3', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'Dense_4', 'bias'): (1, 768) +('DiTBlock_9', 'Dense_4', 'kernel'): (1, 768, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (1, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (1, 768, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (1, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (1, 2048, 768) +('Embed_0', 'embedding'): (1, 256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1, 1536) +('FinalLayer_0', 'Dense_0', 'kernel'): (1, 768, 1536) +('FinalLayer_0', 'Dense_1', 'bias'): (1, 16) +('FinalLayer_0', 'Dense_1', 'kernel'): (1, 768, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1, 1001, 768) +('PatchEmbed_0', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_0', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('PatchEmbed_1', 'Conv_0', 'bias'): (1, 768) +('PatchEmbed_1', 'Conv_0', 'kernel'): (1, 2, 2, 4, 768) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (1, 768, 768) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (1, 256, 768) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (1, 768) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (1, 768, 768) + + parameter shapes: +('DiTBlock_0', 'Dense_0', 'bias'): (4608,) +('DiTBlock_0', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_0', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_2', 'bias'): (768,) +('DiTBlock_0', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_3', 'bias'): (768,) +('DiTBlock_0', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_0', 'Dense_4', 'bias'): (768,) +('DiTBlock_0', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_0', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_1', 'Dense_0', 'bias'): (4608,) +('DiTBlock_1', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_1', 'Dense_1', 'bias'): (768,) +('DiTBlock_1', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_2', 'bias'): (768,) +('DiTBlock_1', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_3', 'bias'): (768,) +('DiTBlock_1', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_1', 'Dense_4', 'bias'): (768,) +('DiTBlock_1', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_1', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_10', 'Dense_0', 'bias'): (4608,) +('DiTBlock_10', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_10', 'Dense_1', 'bias'): (768,) +('DiTBlock_10', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_2', 'bias'): (768,) +('DiTBlock_10', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_3', 'bias'): (768,) +('DiTBlock_10', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_10', 'Dense_4', 'bias'): (768,) +('DiTBlock_10', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_10', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_11', 'Dense_0', 'bias'): (4608,) +('DiTBlock_11', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_11', 'Dense_1', 'bias'): (768,) +('DiTBlock_11', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_2', 'bias'): (768,) +('DiTBlock_11', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_3', 'bias'): (768,) +('DiTBlock_11', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_11', 'Dense_4', 'bias'): (768,) +('DiTBlock_11', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_11', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_12', 'Dense_0', 'bias'): (4608,) +('DiTBlock_12', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_12', 'Dense_1', 'bias'): (768,) +('DiTBlock_12', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_2', 'bias'): (768,) +('DiTBlock_12', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_3', 'bias'): (768,) +('DiTBlock_12', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_12', 'Dense_4', 'bias'): (768,) +('DiTBlock_12', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_12', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_13', 'Dense_0', 'bias'): (4608,) +('DiTBlock_13', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_13', 'Dense_1', 'bias'): (768,) +('DiTBlock_13', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_2', 'bias'): (768,) +('DiTBlock_13', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_3', 'bias'): (768,) +('DiTBlock_13', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_13', 'Dense_4', 'bias'): (768,) +('DiTBlock_13', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_13', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_14', 'Dense_0', 'bias'): (4608,) +('DiTBlock_14', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_14', 'Dense_1', 'bias'): (768,) +('DiTBlock_14', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_2', 'bias'): (768,) +('DiTBlock_14', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_3', 'bias'): (768,) +('DiTBlock_14', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_14', 'Dense_4', 'bias'): (768,) +('DiTBlock_14', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_14', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_15', 'Dense_0', 'bias'): (4608,) +('DiTBlock_15', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_15', 'Dense_1', 'bias'): (768,) +('DiTBlock_15', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_2', 'bias'): (768,) +('DiTBlock_15', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_3', 'bias'): (768,) +('DiTBlock_15', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_15', 'Dense_4', 'bias'): (768,) +('DiTBlock_15', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_15', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_2', 'Dense_0', 'bias'): (4608,) +('DiTBlock_2', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_2', 'Dense_1', 'bias'): (768,) +('DiTBlock_2', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_2', 'bias'): (768,) +('DiTBlock_2', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_3', 'bias'): (768,) +('DiTBlock_2', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_2', 'Dense_4', 'bias'): (768,) +('DiTBlock_2', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_2', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_3', 'Dense_0', 'bias'): (4608,) +('DiTBlock_3', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_3', 'Dense_1', 'bias'): (768,) +('DiTBlock_3', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_2', 'bias'): (768,) +('DiTBlock_3', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_3', 'bias'): (768,) +('DiTBlock_3', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_3', 'Dense_4', 'bias'): (768,) +('DiTBlock_3', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_3', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_4', 'Dense_0', 'bias'): (4608,) +('DiTBlock_4', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_4', 'Dense_1', 'bias'): (768,) +('DiTBlock_4', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_2', 'bias'): (768,) +('DiTBlock_4', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_3', 'bias'): (768,) +('DiTBlock_4', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_4', 'Dense_4', 'bias'): (768,) +('DiTBlock_4', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_4', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_5', 'Dense_0', 'bias'): (4608,) +('DiTBlock_5', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_5', 'Dense_1', 'bias'): (768,) +('DiTBlock_5', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_2', 'bias'): (768,) +('DiTBlock_5', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_3', 'bias'): (768,) +('DiTBlock_5', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_5', 'Dense_4', 'bias'): (768,) +('DiTBlock_5', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_5', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_6', 'Dense_0', 'bias'): (4608,) +('DiTBlock_6', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_6', 'Dense_1', 'bias'): (768,) +('DiTBlock_6', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_2', 'bias'): (768,) +('DiTBlock_6', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_3', 'bias'): (768,) +('DiTBlock_6', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_6', 'Dense_4', 'bias'): (768,) +('DiTBlock_6', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_6', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_7', 'Dense_0', 'bias'): (4608,) +('DiTBlock_7', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_7', 'Dense_1', 'bias'): (768,) +('DiTBlock_7', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_2', 'bias'): (768,) +('DiTBlock_7', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_3', 'bias'): (768,) +('DiTBlock_7', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_7', 'Dense_4', 'bias'): (768,) +('DiTBlock_7', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_7', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_8', 'Dense_0', 'bias'): (4608,) +('DiTBlock_8', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_8', 'Dense_1', 'bias'): (768,) +('DiTBlock_8', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_2', 'bias'): (768,) +('DiTBlock_8', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_3', 'bias'): (768,) +('DiTBlock_8', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_8', 'Dense_4', 'bias'): (768,) +('DiTBlock_8', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_8', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('DiTBlock_9', 'Dense_0', 'bias'): (4608,) +('DiTBlock_9', 'Dense_0', 'kernel'): (768, 4608) +('DiTBlock_9', 'Dense_1', 'bias'): (768,) +('DiTBlock_9', 'Dense_1', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_2', 'bias'): (768,) +('DiTBlock_9', 'Dense_2', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_3', 'bias'): (768,) +('DiTBlock_9', 'Dense_3', 'kernel'): (768, 768) +('DiTBlock_9', 'Dense_4', 'bias'): (768,) +('DiTBlock_9', 'Dense_4', 'kernel'): (768, 768) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'bias'): (4096,) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_0', 'kernel'): (768, 4096) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'bias'): (768,) +('DiTBlock_9', 'SwiGLUFFN_0', 'Dense_1', 'kernel'): (2048, 768) +('Embed_0', 'embedding'): (256, 1) +('FinalLayer_0', 'Dense_0', 'bias'): (1536,) +('FinalLayer_0', 'Dense_0', 'kernel'): (768, 1536) +('FinalLayer_0', 'Dense_1', 'bias'): (16,) +('FinalLayer_0', 'Dense_1', 'kernel'): (768, 16) +('LabelEmbedder_0', 'Embed_0', 'embedding'): (1001, 768) +('PatchEmbed_0', 'Conv_0', 'bias'): (768,) +('PatchEmbed_0', 'Conv_0', 'kernel'): (2, 2, 4, 768) +('PatchEmbed_1', 'Conv_0', 'bias'): (768,) +('PatchEmbed_1', 'Conv_0', 'kernel'): (2, 2, 4, 768) +('TimestepEmbedder_0', 'Dense_0', 'bias'): (768,) +('TimestepEmbedder_0', 'Dense_0', 'kernel'): (256, 768) +('TimestepEmbedder_0', 'Dense_1', 'bias'): (768,) +('TimestepEmbedder_0', 'Dense_1', 'kernel'): (768, 768) +('TimestepEmbedder_1', 'Dense_0', 'bias'): (768,) +('TimestepEmbedder_1', 'Dense_0', 'kernel'): (256, 768) +('TimestepEmbedder_1', 'Dense_1', 'bias'): (768,) +('TimestepEmbedder_1', 'Dense_1', 'kernel'): (768, 768) +┌────────────────────────────────────────────────┐ +│ │ +│ │ +│ │ +│ │ +│ TPU 0,1,2,3 │ +│ │ +│ │ +│ │ +│ │ +└────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────────────┐ +│ │ +│ │ +│ │ +│ │ +│ TPU 0,1,2,3 │ +│ │ +│ │ +│ │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +doing the else +Calc FID for CFG 1.0 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 19.844388961791992 +Calc FID for CFG 1.0 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 7.540131092071533 +Calc FID for CFG 1.0 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 7.326756000518799 +Calc FID for CFG 1.0 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 8.09870719909668 +Calc FID for CFG 1.0 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 8.908642768859863 +Calc FID for CFG 1.0 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.321769714355469 +Calc FID for CFG 1.0 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 13.177682876586914 +Calc FID for CFG 1.0 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 21.720718383789062 +Calc FID for CFG 1.25 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 9.465641021728516 +Calc FID for CFG 1.25 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 5.619251251220703 +Calc FID for CFG 1.25 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 6.133846759796143 +Calc FID for CFG 1.25 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 6.547901153564453 +Calc FID for CFG 1.25 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 7.239920616149902 +Calc FID for CFG 1.25 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 8.199488639831543 +Calc FID for CFG 1.25 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.293099403381348 +Calc FID for CFG 1.25 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 16.286109924316406 +Calc FID for CFG 1.5 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 5.479815483093262 +Calc FID for CFG 1.5 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 6.8870391845703125 +Calc FID for CFG 1.5 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 7.545630931854248 +Calc FID for CFG 1.5 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 7.620797157287598 +Calc FID for CFG 1.5 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 7.973172664642334 +Calc FID for CFG 1.5 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 8.499159812927246 +Calc FID for CFG 1.5 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.150247573852539 +Calc FID for CFG 1.5 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 16.650300979614258 +Calc FID for CFG 1.75 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 4.810676574707031 +Calc FID for CFG 1.75 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 8.879629135131836 +Calc FID for CFG 1.75 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 9.475668907165527 +Calc FID for CFG 1.75 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 9.26155948638916 +Calc FID for CFG 1.75 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 9.309438705444336 +Calc FID for CFG 1.75 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 9.46221923828125 +Calc FID for CFG 1.75 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.858394622802734 +Calc FID for CFG 1.75 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 19.826522827148438 +Calc FID for CFG 2.0 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 5.595621109008789 +Calc FID for CFG 2.0 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.847318649291992 +Calc FID for CFG 2.0 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 11.344917297363281 +Calc FID for CFG 2.0 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.920357704162598 +Calc FID for CFG 2.0 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.659997940063477 +Calc FID for CFG 2.0 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 10.584415435791016 +Calc FID for CFG 2.0 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 12.026324272155762 +Calc FID for CFG 2.0 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 24.808246612548828 +Calc FID for CFG 2.25 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 6.935703277587891 +Calc FID for CFG 2.25 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 12.604620933532715 +Calc FID for CFG 2.25 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 12.945205688476562 +Calc FID for CFG 2.25 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 12.379002571105957 +Calc FID for CFG 2.25 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 11.994935989379883 +Calc FID for CFG 2.25 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 11.68898868560791 +Calc FID for CFG 2.25 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 13.530972480773926 +Calc FID for CFG 2.25 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 30.642826080322266 +Calc FID for CFG 2.5 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 8.419413566589355 +Calc FID for CFG 2.5 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 14.017271041870117 +Calc FID for CFG 2.5 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 14.307957649230957 +Calc FID for CFG 2.5 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 13.642496109008789 +Calc FID for CFG 2.5 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 13.212810516357422 +Calc FID for CFG 2.5 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 12.848617553710938 +Calc FID for CFG 2.5 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 15.422527313232422 +Calc FID for CFG 2.5 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 36.63753890991211 +Calc FID for CFG 2.75 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 9.82433032989502 +Calc FID for CFG 2.75 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 15.356572151184082 +Calc FID for CFG 2.75 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 15.466590881347656 +Calc FID for CFG 2.75 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 14.721940040588379 +Calc FID for CFG 2.75 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 14.276111602783203 +Calc FID for CFG 2.75 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 14.007651329040527 +Calc FID for CFG 2.75 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 17.719642639160156 +Calc FID for CFG 2.75 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 42.4148063659668 +Calc FID for CFG 3.0 and denoise_timesteps 128 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 11.155534744262695 +Calc FID for CFG 3.0 and denoise_timesteps 64 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 16.455982208251953 +Calc FID for CFG 3.0 and denoise_timesteps 32 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 16.489774703979492 +Calc FID for CFG 3.0 and denoise_timesteps 16 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 15.718584060668945 +Calc FID for CFG 3.0 and denoise_timesteps 8 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 15.273191452026367 +Calc FID for CFG 3.0 and denoise_timesteps 4 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 15.260807991027832 +Calc FID for CFG 3.0 and denoise_timesteps 2 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 20.481718063354492 +Calc FID for CFG 3.0 and denoise_timesteps 1 +DiT: Input of shape (256, 32, 32, 4) dtype float32 +DiT: After patch embed, shape is (256, 256, 768) dtype bfloat16 +DiT: Patch Embed of shape (256, 256, 768) dtype bfloat16 +DiT: Conditioning of shape (256, 768) dtype float32 +selfh idden 768 +self heads 12 +hw_swq 16 +xshape (256, 256, 768) +(256, 768) +(256, 768) +(256, 256, 768) +(256, 768) +(256, 256, 768) +FID is 47.77876281738281