{ "format": "split", "components": [ "transformer", "text_encoder", "vae" ], "source": "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP", "notes": { "inpainting": "The transformer's patch_embed.proj.weight has 48 input channels (16 latent + 16 masked-latent + 16 mask channels) to support inpainting.", "text_encoder": "The text encoder is the T5-v1.1-XXL encoder (24 layers, d_model=4096)." } }