Spaces:
Runtime error
Runtime error
| # -------------------------------------------------------- | |
| # X-Decoder -- Generalized Decoding for Pixel, Image, and Language | |
| # Copyright (c) 2022 Microsoft | |
| # Licensed under The MIT License [see LICENSE for details] | |
| # Written by Xueyan Zou (xueyan@cs.wisc.edu) | |
| # -------------------------------------------------------- | |
| # Define pipeline, trainer, and output saving | |
| PIPELINE: XDecoderPipeline | |
| TRAINER: xdecoder | |
| SAVE_DIR: '../../data/output/test' | |
| base_path: "./" | |
| # Resume logic | |
| RESUME: false | |
| WEIGHT: false | |
| RESUME_FROM: '' | |
| EVAL_AT_START: False | |
| # Logging and Debug | |
| WANDB: False | |
| LOG_EVERY: 100 | |
| FIND_UNUSED_PARAMETERS: false | |
| # Speed up training | |
| FP16: false | |
| PORT: '36873' | |
| # misc | |
| LOADER: | |
| JOINT: False | |
| KEY_DATASET: 'coco' | |
| ################## | |
| # Task settings | |
| ################## | |
| VERBOSE: true | |
| MODEL: | |
| NAME: seem_model_v1 | |
| HEAD: xdecoder_head | |
| MASK_ON: false | |
| KEYPOINT_ON: false | |
| LOAD_PROPOSALS: false | |
| DIM_PROJ: 512 | |
| TEXT: | |
| ARCH: vlpencoder | |
| NAME: transformer | |
| TOKENIZER: clip | |
| CONTEXT_LENGTH: 77 # 77 | |
| WIDTH: 512 | |
| HEADS: 8 | |
| LAYERS: 12 # 6 | |
| AUTOGRESSIVE: True | |
| BACKBONE: | |
| NAME: focal | |
| PRETRAINED: '' | |
| LOAD_PRETRAINED: false | |
| FOCAL: | |
| PRETRAIN_IMG_SIZE: 224 | |
| PATCH_SIZE: 4 | |
| EMBED_DIM: 192 | |
| DEPTHS: [2, 2, 18, 2] | |
| FOCAL_LEVELS: [4, 4, 4, 4] | |
| FOCAL_WINDOWS: [3, 3, 3, 3] | |
| DROP_PATH_RATE: 0.3 | |
| MLP_RATIO: 4.0 | |
| DROP_RATE: 0.0 | |
| PATCH_NORM: True | |
| USE_CONV_EMBED: True | |
| SCALING_MODULATOR: True | |
| USE_CHECKPOINT: False | |
| USE_POSTLN: true | |
| USE_POSTLN_IN_MODULATION: false | |
| USE_LAYERSCALE: True | |
| OUT_FEATURES: ["res2", "res3", "res4", "res5"] | |
| OUT_INDICES: [0, 1, 2, 3] | |
| ENCODER: | |
| NAME: transformer_encoder_fpn | |
| IGNORE_VALUE: 255 | |
| NUM_CLASSES: 133 | |
| LOSS_WEIGHT: 1.0 | |
| CONVS_DIM: 512 | |
| MASK_DIM: 512 | |
| NORM: "GN" | |
| IN_FEATURES: ["res2", "res3", "res4", "res5"] | |
| DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] | |
| COMMON_STRIDE: 4 | |
| TRANSFORMER_ENC_LAYERS: 6 | |
| DECODER: | |
| NAME: seem_v1 | |
| TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" | |
| MASK: | |
| ENABLED: True | |
| DETECTION: False | |
| SPATIAL: | |
| ENABLED: True | |
| MAX_ITER: 1 | |
| GROUNDING: | |
| ENABLED: True | |
| MAX_LEN: 5 | |
| TEXT_WEIGHT: 2.0 | |
| CLASS_WEIGHT: 0.5 | |
| RETRIEVAL: | |
| ENABLED: False | |
| LVIS: | |
| ENABLED: True | |
| THRES: 0.7 | |
| OPENIMAGE: | |
| ENABLED: False | |
| NEGATIVE_SAMPLES: 5 | |
| GROUNDING: | |
| ENABLED: False | |
| MAX_LEN: 5 | |
| CAPTION: | |
| ENABLED: False | |
| PHRASE_PROB: 0.5 | |
| SIM_THRES: 0.95 | |
| DEEP_SUPERVISION: True | |
| NO_OBJECT_WEIGHT: 0.1 | |
| GCLASS_WEIGHT: 0.4 | |
| GMASK_WEIGHT: 1.0 | |
| GDICE_WEIGHT: 1.0 | |
| SCLASS_WEIGHT: 0.4 | |
| SMASK_WEIGHT: 1.0 | |
| SDICE_WEIGHT: 1.0 | |
| OCLASS_WEIGHT: 0.4 | |
| OMASK_WEIGHT: 1.0 | |
| ODICE_WEIGHT: 1.0 | |
| CLASS_WEIGHT: 2.0 | |
| MASK_WEIGHT: 5.0 | |
| DICE_WEIGHT: 5.0 | |
| BBOX_WEIGHT: 5.0 | |
| GIOU_WEIGHT: 2.0 | |
| CAPTION_WEIGHT: 2.0 | |
| COST_SPATIAL: | |
| CLASS_WEIGHT: 5.0 | |
| MASK_WEIGHT: 2.0 | |
| DICE_WEIGHT: 2.0 | |
| HIDDEN_DIM: 512 | |
| NUM_OBJECT_QUERIES: 101 | |
| NHEADS: 8 | |
| DROPOUT: 0.0 | |
| DIM_FEEDFORWARD: 2048 | |
| MAX_SPATIAL_LEN: [512, 512, 512, 512] | |
| # ENC_LAYERS: 0 | |
| PRE_NORM: False | |
| ENFORCE_INPUT_PROJ: False | |
| SIZE_DIVISIBILITY: 32 | |
| TRAIN_NUM_POINTS: 12544 | |
| OVERSAMPLE_RATIO: 3.0 | |
| IMPORTANCE_SAMPLE_RATIO: 0.75 | |
| DEC_LAYERS: 10 # 9 decoder layers, plus one for the loss on the learnable queries | |
| TOP_GROUNDING_LAYERS: 10 | |
| TOP_CAPTION_LAYERS: 10 | |
| TOP_SPATIAL_LAYERS: 10 | |
| TOP_OPENIMAGE_LAYERS: 10 | |
| TEST: | |
| SEMANTIC_ON: True | |
| INSTANCE_ON: True | |
| PANOPTIC_ON: True | |
| OVERLAP_THRESHOLD: 0.8 | |
| OBJECT_MASK_THRESHOLD: 0.8 | |
| SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false | |
| # Spatial sampler | |
| STROKE_SAMPLER: | |
| MAX_CANDIDATE: 1 | |
| CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only | |
| CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"] | |
| DILATION: 3 | |
| CIRCLE: | |
| NUM_STROKES: 5 | |
| STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small'] | |
| STROKE_PROB: [0.33, 0.33, 0.33] | |
| SCRIBBLE: | |
| NUM_STROKES: 5 | |
| STROKE_PRESET: ['rand_curve', 'rand_curve_small'] | |
| STROKE_PROB: [0.5, 0.5] | |
| POINT: | |
| NUM_POINTS: 20 | |
| POLYGON: | |
| MAX_POINTS: 9 | |
| EVAL: | |
| MODE: 'best' # best/random/best_random | |
| NEGATIVE: False | |
| MAX_ITER: 20 | |
| IOU_ITER: 1 | |
| GROUNDING: False | |
| # Multi-modal Architecture, order matters | |
| ATTENTION_ARCH: | |
| VARIABLE: | |
| queries: ['object', 'grounding', 'spatial'] | |
| tokens: ['grounding', 'spatial'] | |
| memories: ['spatial'] | |
| SELF_ATTENTION: | |
| queries: | |
| object: ['queries_object'] | |
| grounding: ['queries_grounding', 'tokens_grounding'] | |
| spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial'] | |
| tokens: | |
| grounding: ['queries_grounding', 'tokens_grounding'] | |
| spatial: ['tokens_spatial'] | |
| memories: | |
| spatial: ['memories_spatial'] | |
| CROSS_ATTENTION: | |
| queries: | |
| object: True | |
| grounding: True | |
| spatial: True | |
| memories: | |
| spatial: True | |
| tokens: | |
| grounding: False | |
| spatial: False | |
| MASKING: ['tokens_spatial', 'tokens_grounding'] | |
| DUPLICATION: | |
| queries: | |
| grounding: 'queries_object' | |
| spatial: 'queries_object' | |
| SPATIAL_MEMORIES: 32 | |
| QUERY_NUMBER: 3 | |
| DATASETS: | |
| TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",] | |
| # TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",] | |
| TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well | |
| # TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] | |
| # TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] | |
| # TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] | |
| # TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] | |
| CLASS_CONCAT: false | |
| SIZE_DIVISIBILITY: 32 | |
| PROPOSAL_FILES_TRAIN: [] | |
| INPUT: | |
| PIXEL_MEAN: [123.675, 116.280, 103.530] | |
| PIXEL_STD: [58.395, 57.120, 57.375] | |
| TRAIN: | |
| ASPECT_RATIO_GROUPING: true | |
| BATCH_SIZE_TOTAL: 4 | |
| BATCH_SIZE_PER_GPU: 4 | |
| SHUFFLE: true | |
| TEST: | |
| DETECTIONS_PER_IMAGE: 100 | |
| NAME: coco_eval | |
| IOU_TYPE: ['bbox', 'segm'] | |
| USE_MULTISCALE: false | |
| BATCH_SIZE_TOTAL: 8 | |
| MODEL_FILE: '' | |
| AUG: | |
| ENABLED: False | |
| DATALOADER: | |
| FILTER_EMPTY_ANNOTATIONS: False | |
| NUM_WORKERS: 8 | |
| LOAD_PROPOSALS: False | |
| SAMPLER_TRAIN: "TrainingSampler" | |
| ASPECT_RATIO_GROUPING: True | |
| COCO: | |
| INPUT: | |
| MIN_SIZE_TRAIN: 800 | |
| MAX_SIZE_TRAIN: 1333 | |
| MIN_SIZE_TRAIN_SAMPLING: 'choice' | |
| MIN_SIZE_TEST: 800 | |
| MAX_SIZE_TEST: 1333 | |
| IMAGE_SIZE: 1024 | |
| MIN_SCALE: 0.1 | |
| MAX_SCALE: 2.0 | |
| DATASET_MAPPER_NAME: "coco_interactive" | |
| IGNORE_VALUE: 255 | |
| COLOR_AUG_SSD: False | |
| SIZE_DIVISIBILITY: 32 | |
| RANDOM_FLIP: "horizontal" | |
| MASK_FORMAT: "polygon" | |
| FORMAT: "RGB" | |
| CROP: | |
| ENABLED: True | |
| DATASET: | |
| DATASET: 'coco' | |
| # Validation datasets | |
| ADE20K: | |
| INPUT: | |
| MIN_SIZE_TRAIN: 640 | |
| MIN_SIZE_TRAIN_SAMPLING: "choice" | |
| MIN_SIZE_TEST: 640 | |
| MAX_SIZE_TRAIN: 2560 | |
| MAX_SIZE_TEST: 2560 | |
| MASK_FORMAT: "polygon" | |
| CROP: | |
| ENABLED: True | |
| TYPE: "absolute" | |
| SIZE: (640, 640) | |
| SINGLE_CATEGORY_MAX_AREA: 1.0 | |
| COLOR_AUG_SSD: True | |
| SIZE_DIVISIBILITY: 640 # used in dataset mapper | |
| DATASET_MAPPER_NAME: "mask_former_panoptic" | |
| FORMAT: "RGB" | |
| DATASET: | |
| DATASET: 'ade' | |
| SBD: | |
| INPUT: | |
| MIN_SIZE_TEST: 800 | |
| MAX_SIZE_TEST: 1333 | |
| DATALOADER: | |
| FILTER_EMPTY_ANNOTATIONS: False | |
| NUM_WORKERS: 0 | |
| LOAD_PROPOSALS: False | |
| SAMPLER_TRAIN: "TrainingSampler" | |
| ASPECT_RATIO_GROUPING: False | |
| TEST: | |
| BATCH_SIZE_TOTAL: 1 | |
| VOC: | |
| INPUT: | |
| MIN_SIZE_TEST: 800 | |
| MAX_SIZE_TEST: 1333 | |
| DATALOADER: | |
| FILTER_EMPTY_ANNOTATIONS: False | |
| NUM_WORKERS: 0 | |
| LOAD_PROPOSALS: False | |
| SAMPLER_TRAIN: "TrainingSampler" | |
| ASPECT_RATIO_GROUPING: False | |
| TEST: | |
| BATCH_SIZE_TOTAL: 8 | |
| DAVIS: | |
| INPUT: | |
| MIN_SIZE_TEST: 800 | |
| MAX_SIZE_TEST: 1333 | |
| DATALOADER: | |
| FILTER_EMPTY_ANNOTATIONS: False | |
| NUM_WORKERS: 0 | |
| LOAD_PROPOSALS: False | |
| SAMPLER_TRAIN: "TrainingSampler" | |
| ASPECT_RATIO_GROUPING: False | |
| TEST: | |
| BATCH_SIZE_TOTAL: 8 | |
| VOS: | |
| INPUT: | |
| MIN_SIZE_TEST: 800 | |
| MAX_SIZE_TEST: 1333 | |
| DATALOADER: | |
| FILTER_EMPTY_ANNOTATIONS: False | |
| NUM_WORKERS: 0 | |
| LOAD_PROPOSALS: False | |
| SAMPLER_TRAIN: "TrainingSampler" | |
| ASPECT_RATIO_GROUPING: False | |
| TEST: | |
| BATCH_SIZE_TOTAL: 1 | |
| REF: | |
| INPUT: | |
| PIXEL_MEAN: [123.675, 116.280, 103.530] | |
| PIXEL_STD: [58.395, 57.120, 57.375] | |
| MIN_SIZE_TEST: 512 | |
| MAX_SIZE_TEST: 1024 | |
| FORMAT: "RGB" | |
| SPATIAL: False | |
| DATALOADER: | |
| FILTER_EMPTY_ANNOTATIONS: False | |
| NUM_WORKERS: 4 | |
| LOAD_PROPOSALS: False | |
| SAMPLER_TRAIN: "TrainingSampler" | |
| ASPECT_RATIO_GROUPING: False | |
| TEST: | |
| BATCH_SIZE_TOTAL: 8 | |
| # Detectron2 training config for optimizer and lr scheduler | |
| SOLVER: | |
| BASE_LR: 0.0001 | |
| STEPS: [0.88889, 0.96296] | |
| MAX_ITER: 1 | |
| GAMMA: 0.1 | |
| WARMUP_FACTOR: 1.0 | |
| WARMUP_ITERS: 10 | |
| WARMUP_METHOD: "linear" | |
| WEIGHT_DECAY: 0.05 | |
| OPTIMIZER: "ADAMW" | |
| LR_SCHEDULER_NAME: "WarmupMultiStepLR" | |
| LR_MULTIPLIER: | |
| backbone: 0.1 | |
| lang_encoder: 0.1 | |
| FIX_PARAM: | |
| backbone: True | |
| lang_encoder: True | |
| pixel_decoder: True | |
| WEIGHT_DECAY_NORM: 0.0 | |
| WEIGHT_DECAY_EMBED: 0.0 | |
| CLIP_GRADIENTS: | |
| ENABLED: True | |
| CLIP_TYPE: "full_model" | |
| CLIP_VALUE: 5.0 # 0.01 | |
| NORM_TYPE: 2.0 | |
| MAX_NUM_EPOCHS: 50 |