| { | |
| "cfg": { | |
| "decoder": { | |
| "d_model": 768, | |
| "freeze_text_encoder": true, | |
| "input_feat_dim": 256, | |
| "num_decoder_layers": 8, | |
| "num_queries": 256, | |
| "text_encoder": "clip", | |
| "transformer_dim_feedforward": 3072, | |
| "transformer_dropout": 0.1, | |
| "transformer_max_drop_path": 0.0, | |
| "transformer_n_heads": 12, | |
| "transformer_use_checkpointing": true | |
| }, | |
| "encoder": { | |
| "embed_dim": 256, | |
| "input_feat_dim": 1536, | |
| "num_rgb_harmonic_functions": 16, | |
| "ptv3_args": { | |
| "dec_channels": [ | |
| 256, | |
| 256, | |
| 512, | |
| 512 | |
| ], | |
| "enc_channels": [ | |
| 256, | |
| 256, | |
| 512, | |
| 512, | |
| 1024 | |
| ], | |
| "in_channels": 256 | |
| }, | |
| "rgb_proj_dim": 256 | |
| } | |
| } | |
| } |