| { | |
| "cfg": { | |
| "amp": false, | |
| "chunk_size": -1, | |
| "flip_aug": false, | |
| "long_term": { | |
| "buffer_tokens": 2000, | |
| "count_usage": true, | |
| "max_mem_frames": 10, | |
| "max_num_tokens": 10000, | |
| "min_mem_frames": 5, | |
| "num_prototypes": 128 | |
| }, | |
| "max_internal_size": -1, | |
| "max_mem_frames": 5, | |
| "mem_every": 5, | |
| "model": { | |
| "aux_loss": { | |
| "query": { | |
| "enabled": true, | |
| "weight": 0.01 | |
| }, | |
| "sensory": { | |
| "enabled": true, | |
| "weight": 0.01 | |
| } | |
| }, | |
| "embed_dim": 256, | |
| "key_dim": 64, | |
| "mask_decoder": { | |
| "up_dims": [ | |
| 256, | |
| 128, | |
| 128, | |
| 64, | |
| 16 | |
| ] | |
| }, | |
| "mask_encoder": { | |
| "final_dim": 256, | |
| "type": "resnet18" | |
| }, | |
| "object_summarizer": { | |
| "add_pe": true, | |
| "embed_dim": "${model.object_transformer.embed_dim}", | |
| "num_summaries": "${model.object_transformer.num_queries}" | |
| }, | |
| "object_transformer": { | |
| "embed_dim": "${model.embed_dim}", | |
| "ff_dim": 2048, | |
| "num_blocks": 3, | |
| "num_heads": 8, | |
| "num_queries": 16, | |
| "pixel_self_attention": { | |
| "add_pe_to_qkv": [ | |
| true, | |
| true, | |
| false | |
| ] | |
| }, | |
| "query_self_attention": { | |
| "add_pe_to_qkv": [ | |
| true, | |
| true, | |
| false | |
| ] | |
| }, | |
| "read_from_memory": { | |
| "add_pe_to_qkv": [ | |
| true, | |
| true, | |
| false | |
| ] | |
| }, | |
| "read_from_past": { | |
| "add_pe_to_qkv": [ | |
| true, | |
| true, | |
| false | |
| ] | |
| }, | |
| "read_from_pixel": { | |
| "add_pe_to_qkv": [ | |
| true, | |
| true, | |
| false | |
| ], | |
| "input_add_pe": false, | |
| "input_norm": false | |
| }, | |
| "read_from_query": { | |
| "add_pe_to_qkv": [ | |
| true, | |
| true, | |
| false | |
| ], | |
| "output_norm": false | |
| } | |
| }, | |
| "pixel_dim": 256, | |
| "pixel_encoder": { | |
| "ms_dims": [ | |
| 1024, | |
| 512, | |
| 256, | |
| 64, | |
| 3 | |
| ], | |
| "type": "resnet50" | |
| }, | |
| "pixel_mean": [ | |
| 0.485, | |
| 0.456, | |
| 0.406 | |
| ], | |
| "pixel_pe_scale": 32, | |
| "pixel_pe_temperature": 128, | |
| "pixel_std": [ | |
| 0.229, | |
| 0.224, | |
| 0.225 | |
| ], | |
| "pretrained_resnet": false, | |
| "sensory_dim": 256, | |
| "value_dim": 256 | |
| }, | |
| "output_dir": null, | |
| "save_all": true, | |
| "save_aux": false, | |
| "save_scores": false, | |
| "stagger_updates": 5, | |
| "top_k": 30, | |
| "use_all_masks": false, | |
| "use_long_term": false, | |
| "visualize": false, | |
| "weights": "pretrained_models/matanyone.pth" | |
| }, | |
| "single_object": true | |
| } |