| seed: 42 |
| metric_to_track: mR |
| dtype: float32 |
| output_dir: ./checkpoints/PSG/react++_yolo12l |
| glove_dir: datasets |
| verbose: INFO |
| paths_catalog: '' |
| paths_data: '' |
| input: |
| img_size: [640, 640] |
|
|
| pixel_mean: |
| - 102.9801 |
| - 115.9465 |
| - 122.7717 |
| pixel_std: |
| - 1.0 |
| - 1.0 |
| - 1.0 |
| to_bgr255: true |
| flip_prob_train: 0.5 |
| padding: true |
| brightness: 0.15 |
| contrast: 0.15 |
| saturation: 0.1 |
| hue: 0.0 |
| vertical_flip_prob_train: 0.0 |
| datasets: |
| name: "PSG" |
| type: "coco" |
| data_dir: "datasets/PSG/coco_format" |
| dataloader: |
| num_workers: 8 |
| size_divisibility: 32 |
| aspect_ratio_grouping: true |
| model: |
| flip_aug: false |
| rpn_only: false |
| mask_on: false |
| attribute_on: false |
| relation_on: true |
| device: cuda |
| meta_architecture: GeneralizedYOLO |
| cls_agnostic_bbox_reg: false |
| weight: '' |
| pretrained_detector_ckpt: ./checkpoints/BACKBONES/last.pt |
| text_embedding: glove.6B |
| box_head: false |
| backbone: |
| type: yolo |
| extra_config: '' |
| freeze_conv_body_at: 2 |
| nms_thresh: 0.001 |
| freeze: true |
| freeze_at: 10 |
| fpn: |
| use_gn: false |
| use_relu: false |
| group_norm: |
| dim_per_gp: -1 |
| num_groups: 32 |
| epsilon: 1.0e-05 |
| yolo: |
| weights: '' |
| size: yolo12l |
| img_size: 640 |
| out_channels: |
| - 256 |
| - 512 |
| - 512 |
| rpn: |
| use_fpn: false |
| rpn_mid_channel: 512 |
| anchor_sizes: |
| - 32 |
| - 64 |
| - 128 |
| - 256 |
| - 512 |
| anchor_stride: |
| - 16 |
| aspect_ratios: |
| - 0.5 |
| - 1.0 |
| - 2.0 |
| straddle_thresh: 0 |
| fg_iou_threshold: 0.7 |
| bg_iou_threshold: 0.3 |
| batch_size_per_image: 256 |
| positive_fraction: 0.5 |
| pre_nms_top_n_train: 12000 |
| pre_nms_top_n_test: 6000 |
| post_nms_top_n_train: 2000 |
| post_nms_top_n_test: 1000 |
| min_size: 0 |
| fpn_post_nms_top_n_train: 2000 |
| fpn_post_nms_top_n_test: 2000 |
| fpn_post_nms_per_batch: true |
| rpn_head: SingleConvRPNHead |
| roi_heads: |
| fg_iou_threshold: 0.35 |
| bg_iou_threshold: 0.3 |
| bbox_reg_weights: |
| - 10.0 |
| - 10.0 |
| - 5.0 |
| - 5.0 |
| batch_size_per_image: 256 |
| positive_fraction: 0.25 |
| score_thresh: 0.01 |
| nms: 0.5 |
| post_nms_per_cls_topn: 300 |
| nms_filter_duplicates: false |
| detections_per_img: 100 |
| roi_box_head: |
| feature_extractor: DAMPBoxFeatureExtractor |
| predictor: FastRCNNPredictor |
| pooler_resolution: 14 |
| pooler_sampling_ratio: 0 |
| pooler_scales: |
| - 0.0625 |
| mlp_head_dim: 256 |
| use_gn: false |
| dilation: 1 |
| conv_head_dim: 256 |
| num_stacked_convs: 4 |
| num_classes: 134 |
| patch_size: 32 |
| feat_idx_multiscale: true |
| feat_idx_neighbors: 1 |
| roi_attribute_head: |
| feature_extractor: FPN2MLPFeatureExtractor |
| predictor: FPNPredictor |
| share_box_feature_extractor: true |
| use_binary_loss: true |
| attribute_loss_weight: 0.1 |
| num_attributes: 201 |
| max_attributes: 10 |
| attribute_bgfg_sample: true |
| attribute_bgfg_ratio: 3 |
| pos_weight: 5.0 |
| roi_mask_head: |
| feature_extractor: ResNet50Conv5ROIFeatureExtractor |
| predictor: MaskRCNNC4Predictor |
| pooler_resolution: 14 |
| pooler_sampling_ratio: 0 |
| pooler_scales: |
| - 0.0625 |
| mlp_head_dim: 1024 |
| conv_layers: |
| - 256 |
| - 256 |
| - 256 |
| - 256 |
| resolution: 14 |
| share_box_feature_extractor: true |
| postprocess_masks: false |
| postprocess_masks_threshold: 0.5 |
| dilation: 1 |
| use_gn: false |
| roi_relation_head: |
| predictor: REACTPlusPlusPredictor |
| feature_extractor: P5SceneContextExtractor |
| use_union_features: true |
| use_spatial_features: true |
| use_union_features_inference: true |
| union_dropout: 0.0 |
| max_pairs_inference: 0 |
| textual_features_only: false |
| visual_features_only: false |
| logit_adjustment: false |
| logit_adjustment_tau: 0.3 |
| pooling_all_levels: true |
| batch_size_per_image: 512 |
| positive_fraction: 0.35 |
| use_gt_box: false |
| use_gt_object_label: false |
| embed_dim: 200 |
| context_dropout_rate: 0.2 |
| context_hidden_dim: 512 |
| context_pooling_dim: 4096 |
| context_obj_layer: 1 |
| context_rel_layer: 1 |
| mlp_head_dim: 512 |
| loss: |
| loss_type: BalancedLogitAdjustedLoss |
| beta: 0.999 |
| gamma: 0.0 |
| alpha: 0.15 |
| fg_boost: 2.0 |
| fg_weight: 1.0 |
| label_smoothing_epsilon: 0.01 |
| logit_adjustment_tau: 0.5 |
| bg_discount: 0.3 |
| ccl_weight: 0.1 |
| decisive_margin: 2.0 |
| poly_epsilon: 0.0 |
| label_smoothing: 0.1 |
| sampler_aux_loss_weight: 0.1 |
| attn_entropy_weight: 0.01 |
| offset_reg_weight: 0.005 |
| containment_loss_weight: 0.02 |
| num_classes: 57 |
| decoder_depth: 1 |
| transformer_depth: 1 |
| num_rel_layers: 2 |
| use_scene_context: true |
| use_geo_bias: true |
| use_cls_emb: true |
| use_geo_enc: true |
| max_pairs_per_img: 512 |
| num_queries: 64 |
| use_cross_attention: true |
| attn_type: standard |
| geometric_loss_weight: 0.0 |
| num_sample_points: 6 |
| num_sample_heads: 6 |
| feature_strategy: multi_scale |
| use_rmsnorm: true |
| use_swiglu: true |
| clip_rel_path: '' |
| react_loss_weights: |
| l21_loss: 1.0 |
| dist_loss2: 0.1 |
| loss_dis: 0.5 |
| transformer: |
| dropout_rate: 0.1 |
| obj_layer: 4 |
| rel_layer: 2 |
| num_head: 8 |
| inner_dim: 2048 |
| key_dim: 64 |
| val_dim: 64 |
| squat_module: |
| pre_norm: false |
| num_decoder: 3 |
| rho: 0.35 |
| beta: 0.7 |
| pretrain_mask: false |
| pretrain_mask_epoch: 1 |
| causal: |
| effect_analysis: false |
| fusion_type: sum |
| context_layer: motifs |
| separate_spatial: false |
| effect_type: none |
| spatial_for_vision: false |
| label_smoothing_loss: false |
| use_frequency_bias: false |
| require_box_overlap: false |
| num_sample_per_gt_rel: 8 |
| add_gtbox_to_proposal_in_train: false |
| classifier: linear |
| predict_use_vision: false |
| use_bg_discounting: false |
| bg_discounting_threshold: 0.1 |
| resnets: |
| num_groups: 1 |
| width_per_group: 64 |
| stride_in_1x1: true |
| trans_func: BottleneckWithFixedBatchNorm |
| stem_func: StemWithFixedBatchNorm |
| res5_dilation: 1 |
| backbone_out_channels: 1024 |
| res2_out_channels: 256 |
| stem_out_channels: 64 |
| solver: |
| max_iter: 0 |
| max_epoch: 10 |
| base_lr: 0.0001 |
| bias_lr_factor: 1 |
| momentum: 0.9 |
| weight_decay: 0.05 |
| weight_decay_bias: 0.0 |
| clip_norm: 5.0 |
| gamma: 0.5 |
| steps: |
| - 41000 |
| - 50000 |
| warmup_factor: 0.1 |
| warmup_epochs: 1 |
| warmup_method: linear |
| checkpoint_period: 250 |
| grad_norm_clip: 1.0 |
| print_grad_freq: 250 |
| to_val: true |
| pre_val: true |
| val_period: 250 |
| update_schedule_during_load: false |
| ims_per_batch: 8 |
| optimizer: ADAMW |
| slow_ratio: 10.0 |
| deform_offset_slow_ratio: 1.0 |
| muon_scaling: 0.2 |
| adamw_scaling: 0.8 |
| schedule: |
| type: WarmupCosineAnnealingIterLR |
| patience: 2 |
| threshold: 0.0001 |
| cooldown: 1 |
| factor: 0.5 |
| max_decay_step: 7 |
| eta_min: 5.0e-07 |
| plateau_epochs: 5 |
| accum_steps: 4 |
| test: |
| expected_results: [] |
| expected_results_sigma_tol: 4 |
| ims_per_batch: 1 |
| detections_per_img: 100 |
| informative: false |
| bbox_aug: |
| enabled: false |
| h_flip: false |
| scales: [] |
| max_size: 4000 |
| scale_h_flip: false |
| save_proposals: false |
| relation: |
| multiple_preds: false |
| iou_threshold: 0.5 |
| require_overlap: false |
| later_nms_prediction_thres: 0.5 |
| sync_gather: true |
| allow_load_from_cache: false |
| top_k: 100 |
| custum_eval: false |
| custum_path: '' |
| global_setting: |
| basic_encoder: Cross-Attention |
| gcl_setting: |
| group_split_mode: divide4 |
| knowledge_transfer_mode: KL_logit_TopDown |
| no_relation_restrain: false |
| zero_label_padding_mode: false |
| knowledge_loss_coefficient: 1.0 |
|
|