{ "cross_attention_dim": 1024, "in_channels": 64, "inflation_end": -1, "inflation_start": 0, "mlp_ratio": 4.0, "num_attention_heads": 16, "num_layers": 21, "num_tokens_nominal": 2048, "temporal_context_size": 16, "width": 2048 }