# maelic / REACT-pp-VG150
#
# Training / evaluation configuration: REACTPlusPlusPredictor relation head on
# the VG150 dataset with a YOLO (yolo12m) detector backbone.
# The leading "+" diff markers and the bare repository header were removed so
# this document loads as plain YAML; all keys and values are unchanged.
---
# Run-level settings.
seed: 42
metric_to_track: mR
dtype: float16
output_dir: ./checkpoints/VG/react++_yolo12m
glove_dir: datasets
verbose: INFO
paths_catalog: ''
paths_data: ''

# Image preprocessing and train-time augmentation.
input:
  img_size:
  - 640
  - 640
  pixel_mean:
  - 102.9801
  - 115.9465
  - 122.7717
  pixel_std:
  - 1.0
  - 1.0
  - 1.0
  to_bgr255: true
  flip_prob_train: 0.5
  padding: true
  # Colour jitter and vertical flip are all set to 0.0 here.
  brightness: 0.0
  contrast: 0.0
  saturation: 0.0
  hue: 0.0
  vertical_flip_prob_train: 0.0

datasets:
  name: "VG150"
  type: "coco"
  data_dir: "datasets/VG150/VG150_coco_format/"

dataloader:
  num_workers: 8
  size_divisibility: 32
  aspect_ratio_grouping: true

model:
  flip_aug: false
  rpn_only: false
  # mask_on / attribute_on are false; the roi_mask_head and roi_attribute_head
  # sections below are presumably unused in this run -- TODO confirm.
  mask_on: false
  attribute_on: false
  relation_on: true
  device: cuda
  meta_architecture: GeneralizedYOLO
  cls_agnostic_bbox_reg: false
  weight: ''
  # NOTE(review): absolute path inside one user's home directory; it will not
  # resolve on another machine. Override it (or use a repo-relative path)
  # before reusing this config elsewhere.
  pretrained_detector_ckpt: /home/maelicneau/Documents/SGG-Benchmark/checkpoints/BACKBONES/VG150_yolo12m/weights/best.pt
  text_embedding: glove.6B
  box_head: false
  backbone:
    type: yolo
    extra_config: ''
    freeze_conv_body_at: 2
    nms_thresh: 0.001
    freeze: true
    freeze_at: 10
  fpn:
    use_gn: false
    use_relu: false
  group_norm:
    dim_per_gp: -1
    num_groups: 32
    epsilon: 1.0e-05
  yolo:
    weights: ''
    size: yolo12m
    img_size: 640
    out_channels:
    - 256
    - 512
    - 512
  # NOTE(review): backbone.type is "yolo"; the rpn section (and the resnets
  # section further down) presumably carry framework defaults that this
  # architecture does not read -- verify against the model builder.
  rpn:
    use_fpn: false
    rpn_mid_channel: 512
    anchor_sizes:
    - 32
    - 64
    - 128
    - 256
    - 512
    anchor_stride:
    - 16
    aspect_ratios:
    - 0.5
    - 1.0
    - 2.0
    straddle_thresh: 0
    fg_iou_threshold: 0.7
    bg_iou_threshold: 0.3
    batch_size_per_image: 256
    positive_fraction: 0.5
    pre_nms_top_n_train: 12000
    pre_nms_top_n_test: 6000
    post_nms_top_n_train: 2000
    post_nms_top_n_test: 1000
    min_size: 0
    fpn_post_nms_top_n_train: 2000
    fpn_post_nms_top_n_test: 2000
    fpn_post_nms_per_batch: true
    rpn_head: SingleConvRPNHead
  roi_heads:
    fg_iou_threshold: 0.5
    bg_iou_threshold: 0.3
    bbox_reg_weights:
    - 10.0
    - 10.0
    - 5.0
    - 5.0
    batch_size_per_image: 256
    positive_fraction: 0.25
    score_thresh: 0.01
    nms: 0.5
    post_nms_per_cls_topn: 300
    nms_filter_duplicates: false
    detections_per_img: 100
  roi_box_head:
    feature_extractor: DAMPBoxFeatureExtractor
    predictor: FastRCNNPredictor
    pooler_resolution: 14
    pooler_sampling_ratio: 0
    pooler_scales:
    - 0.0625
    mlp_head_dim: 256
    use_gn: false
    dilation: 1
    conv_head_dim: 256
    num_stacked_convs: 4
    num_classes: 151
    patch_size: 32
    feat_idx_multiscale: true
    feat_idx_neighbors: 1
  roi_attribute_head:
    feature_extractor: FPN2MLPFeatureExtractor
    predictor: FPNPredictor
    share_box_feature_extractor: true
    use_binary_loss: true
    attribute_loss_weight: 0.1
    num_attributes: 201
    max_attributes: 10
    attribute_bgfg_sample: true
    attribute_bgfg_ratio: 3
    pos_weight: 5.0
  roi_mask_head:
    feature_extractor: ResNet50Conv5ROIFeatureExtractor
    predictor: MaskRCNNC4Predictor
    pooler_resolution: 14
    pooler_sampling_ratio: 0
    pooler_scales:
    - 0.0625
    mlp_head_dim: 1024
    conv_layers:
    - 256
    - 256
    - 256
    - 256
    resolution: 14
    share_box_feature_extractor: true
    postprocess_masks: false
    postprocess_masks_threshold: 0.5
    dilation: 1
    use_gn: false
  # Relation head: the part of the model enabled by relation_on above.
  roi_relation_head:
    predictor: REACTPlusPlusPredictor
    feature_extractor: P5SceneContextExtractor
    use_union_features: true
    use_spatial_features: true
    use_union_features_inference: true
    union_dropout: 0.0
    max_pairs_inference: 0
    textual_features_only: false
    visual_features_only: false
    # NOTE(review): logit_adjustment is false here, and a second
    # logit_adjustment_tau (0.5) exists under loss below; the two keys hold
    # different values -- confirm which one the selected loss reads.
    logit_adjustment: false
    logit_adjustment_tau: 0.3
    pooling_all_levels: true
    batch_size_per_image: 512
    positive_fraction: 0.35
    # Both use_gt_* flags are false: neither ground-truth boxes nor
    # ground-truth object labels are requested.
    use_gt_box: false
    use_gt_object_label: false
    embed_dim: 200
    context_dropout_rate: 0.2
    context_hidden_dim: 512
    context_pooling_dim: 4096
    context_obj_layer: 1
    context_rel_layer: 1
    mlp_head_dim: 512
    loss:
      loss_type: BalancedLogitAdjustedLoss
      beta: 0.999
      gamma: 0.0
      alpha: 0.25
      fg_boost: 2.0
      fg_weight: 1.0
      # NOTE(review): label_smoothing_epsilon (0.01) and label_smoothing (0.1)
      # are separate keys with different values -- confirm which applies.
      label_smoothing_epsilon: 0.01
      logit_adjustment_tau: 0.5
      bg_discount: 0.4
      ccl_weight: 0.1
      decisive_margin: 2.0
      poly_epsilon: 0.0
      label_smoothing: 0.1
      sampler_aux_loss_weight: 0.1
      attn_entropy_weight: 0.01
      offset_reg_weight: 0.005
      containment_loss_weight: 0.02
    num_classes: 51
    decoder_depth: 1
    transformer_depth: 1
    num_rel_layers: 2
    use_scene_context: true
    use_geo_bias: true
    use_cls_emb: true
    use_geo_enc: true
    max_pairs_per_img: 512
    num_queries: 64
    use_cross_attention: true
    attn_type: standard
    geometric_loss_weight: 0.0
    num_sample_points: 6
    num_sample_heads: 6
    feature_strategy: multi_scale
    use_rmsnorm: true
    use_swiglu: true
    clip_rel_path: ''
    react_loss_weights:
      l21_loss: 1.0
      dist_loss2: 0.1
      loss_dis: 0.5
    # transformer / squat_module / causal: presumably options for other
    # predictors than the one selected above -- TODO confirm.
    transformer:
      dropout_rate: 0.1
      obj_layer: 4
      rel_layer: 2
      num_head: 8
      inner_dim: 2048
      key_dim: 64
      val_dim: 64
    squat_module:
      pre_norm: false
      num_decoder: 3
      rho: 0.35
      beta: 0.7
      pretrain_mask: false
      pretrain_mask_epoch: 1
    causal:
      effect_analysis: false
      fusion_type: sum
      context_layer: motifs
      separate_spatial: false
      effect_type: none
      spatial_for_vision: false
    label_smoothing_loss: false
    use_frequency_bias: false
    require_box_overlap: false
    num_sample_per_gt_rel: 8
    add_gtbox_to_proposal_in_train: true
    classifier: linear
    predict_use_vision: false
    use_bg_discounting: false
    bg_discounting_threshold: 0.1
  resnets:
    num_groups: 1
    width_per_group: 64
    stride_in_1x1: true
    trans_func: BottleneckWithFixedBatchNorm
    stem_func: StemWithFixedBatchNorm
    res5_dilation: 1
    backbone_out_channels: 1024
    res2_out_channels: 256
    stem_out_channels: 64

# Optimisation settings.
solver:
  # NOTE(review): max_iter is 0 while max_epoch is 20; presumably the run
  # length is epoch-driven -- confirm against the trainer.
  max_iter: 0
  max_epoch: 20
  base_lr: 0.0002
  bias_lr_factor: 1
  momentum: 0.9
  weight_decay: 0.05
  weight_decay_bias: 0.0
  # NOTE(review): clip_norm (5.0) and grad_norm_clip (1.0) are both set;
  # confirm which one the trainer applies.
  clip_norm: 5.0
  gamma: 0.5
  # NOTE(review): schedule.type below is WarmupCosineAnnealingIterLR; these
  # step milestones are presumably only read by a step-based schedule.
  steps:
  - 41000
  - 50000
  warmup_factor: 0.1
  warmup_epochs: 2
  warmup_method: linear
  checkpoint_period: 500
  grad_norm_clip: 1.0
  print_grad_freq: 500
  to_val: true
  pre_val: true
  val_period: 500
  update_schedule_during_load: false
  ims_per_batch: 16
  optimizer: ADAMW
  slow_ratio: 10.0
  deform_offset_slow_ratio: 1.0
  muon_scaling: 0.2
  adamw_scaling: 0.8
  schedule:
    type: WarmupCosineAnnealingIterLR
    patience: 2
    threshold: 0.0001
    cooldown: 1
    factor: 0.5
    max_decay_step: 7
    eta_min: 5.0e-07
    plateau_epochs: 5
  accum_steps: 2

# Evaluation settings.
test:
  expected_results: []
  expected_results_sigma_tol: 4
  ims_per_batch: 1
  detections_per_img: 100
  informative: false
  bbox_aug:
    enabled: false
    h_flip: false
    scales: []
    max_size: 4000
    scale_h_flip: false
  save_proposals: false
  relation:
    multiple_preds: false
    iou_threshold: 0.5
    require_overlap: false
    later_nms_prediction_thres: 0.5
    sync_gather: true
  allow_load_from_cache: false
  top_k: 100
  # NOTE(review): "custum" spelling is kept as-is; it is presumably the key
  # name the consuming code looks up -- do not rename without checking.
  custum_eval: false
  custum_path: ''

global_setting:
  basic_encoder: Cross-Attention
  gcl_setting:
    group_split_mode: divide4
    knowledge_transfer_mode: KL_logit_TopDown
    no_relation_restrain: false
    zero_label_padding_mode: false
    knowledge_loss_coefficient: 1.0