{ "format": "split", "components": [ "transformer", "text_encoder", "vae" ], "source": "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP", "notes": { "inpainting": "Transformer patch_embed.proj is a Linear with in_dim=264 (in_channels=33 [16 latent + 16 masked-video latent + 1 mask] * patch_volume=8).", "text_encoder": "T5-v1.1-XXL encoder (24 layers, d_model=4096)." }, "quantized": true, "quantization_bits": 4 }