# Source: REACTPlusPlus_PSG / yolo12l / config.yml
# Author avatar: maelic
# Commit message: Upload yolo12l/config.yml with huggingface_hub
# Commit: a5ee45b (verified)
# Top-level run settings. These keys appear before the first section header,
# so they are not nested under any section.
seed: 42
# NOTE(review): `mR` is presumably "mean recall" — confirm against the trainer.
metric_to_track: mR
dtype: float32
output_dir: ./checkpoints/PSG/react++_yolo12l
glove_dir: datasets
verbose: INFO
# NOTE(review): both paths are empty strings — presumably "use the loader's
# built-in default"; confirm against the consumer.
paths_catalog: ''
paths_data: ''
# Image input / augmentation settings.
# Children are indented under `input` so they are scoped to this section
# instead of being parsed as top-level keys (which leaves `input` null).
input:
  img_size: [640, 640]  # [W, H]
  # NOTE(review): with `to_bgr255: true` these look like per-channel means in
  # BGR order on a 0-255 scale — confirm against the preprocessing code.
  pixel_mean:
    - 102.9801
    - 115.9465
    - 122.7717
  pixel_std:
    - 1.0
    - 1.0
    - 1.0
  to_bgr255: true
  flip_prob_train: 0.5
  padding: true
  brightness: 0.15
  contrast: 0.15
  saturation: 0.1
  hue: 0.0
  vertical_flip_prob_train: 0.0
# Dataset selection.
# Children are indented under `datasets`; generic keys such as `name` and
# `type` must not leak to the top level.
datasets:
  name: "PSG"
  type: "coco"
  data_dir: "datasets/PSG/coco_format"
# Data-loading settings, nested under `dataloader`.
dataloader:
  num_workers: 8
  size_divisibility: 32
  aspect_ratio_grouping: true
# Model definition.
# The nesting of this section has been reconstructed: every sub-section header
# (a key with no value) owns the keys that follow it, which removes the
# duplicate keys a flat layout produces (`use_gn`, `num_classes`,
# `fg_iou_threshold`, `pooler_resolution`, `beta`, ...). Boundaries that could
# not be determined with certainty are marked NOTE(review).
model:
  flip_aug: false
  rpn_only: false
  mask_on: false
  attribute_on: false
  relation_on: true
  device: cuda
  meta_architecture: GeneralizedYOLO
  cls_agnostic_bbox_reg: false
  weight: ''
  pretrained_detector_ckpt: ./checkpoints/BACKBONES/last.pt
  text_embedding: glove.6B
  box_head: false

  backbone:
    type: yolo
    extra_config: ''
    freeze_conv_body_at: 2
    nms_thresh: 0.001
    freeze: true
    freeze_at: 10

  fpn:
    use_gn: false
    use_relu: false

  group_norm:
    dim_per_gp: -1
    num_groups: 32
    epsilon: 1.0e-05

  yolo:
    weights: ''
    size: yolo12l
    img_size: 640
    out_channels:
      - 256
      - 512
      - 512

  rpn:
    use_fpn: false
    rpn_mid_channel: 512
    anchor_sizes:
      - 32
      - 64
      - 128
      - 256
      - 512
    anchor_stride:
      - 16
    aspect_ratios:
      - 0.5
      - 1.0
      - 2.0
    straddle_thresh: 0
    fg_iou_threshold: 0.7
    bg_iou_threshold: 0.3
    batch_size_per_image: 256
    positive_fraction: 0.5
    pre_nms_top_n_train: 12000
    pre_nms_top_n_test: 6000
    post_nms_top_n_train: 2000
    post_nms_top_n_test: 1000
    min_size: 0
    fpn_post_nms_top_n_train: 2000
    fpn_post_nms_top_n_test: 2000
    fpn_post_nms_per_batch: true
    rpn_head: SingleConvRPNHead

  roi_heads:
    fg_iou_threshold: 0.35
    bg_iou_threshold: 0.3
    bbox_reg_weights:
      - 10.0
      - 10.0
      - 5.0
      - 5.0
    batch_size_per_image: 256
    positive_fraction: 0.25
    score_thresh: 0.01
    nms: 0.5
    post_nms_per_cls_topn: 300
    nms_filter_duplicates: false
    detections_per_img: 100

  roi_box_head:
    feature_extractor: DAMPBoxFeatureExtractor
    predictor: FastRCNNPredictor
    pooler_resolution: 14
    pooler_sampling_ratio: 0
    pooler_scales:
      - 0.0625
    mlp_head_dim: 256
    use_gn: false
    dilation: 1
    conv_head_dim: 256
    num_stacked_convs: 4
    num_classes: 134
    # NOTE(review): the next three keys are assumed to belong to
    # `roi_box_head` (they directly follow its keys); confirm against the
    # config schema.
    patch_size: 32
    feat_idx_multiscale: true
    feat_idx_neighbors: 1

  # NOTE(review): `attribute_on` is false above, so this section is presumably
  # unused defaults — confirm.
  roi_attribute_head:
    feature_extractor: FPN2MLPFeatureExtractor
    predictor: FPNPredictor
    share_box_feature_extractor: true
    use_binary_loss: true
    attribute_loss_weight: 0.1
    num_attributes: 201
    max_attributes: 10
    attribute_bgfg_sample: true
    attribute_bgfg_ratio: 3
    pos_weight: 5.0

  # NOTE(review): `mask_on` is false above, so this section is presumably
  # unused defaults — confirm.
  roi_mask_head:
    feature_extractor: ResNet50Conv5ROIFeatureExtractor
    predictor: MaskRCNNC4Predictor
    pooler_resolution: 14
    pooler_sampling_ratio: 0
    pooler_scales:
      - 0.0625
    mlp_head_dim: 1024
    conv_layers:
      - 256
      - 256
      - 256
      - 256
    resolution: 14
    share_box_feature_extractor: true
    postprocess_masks: false
    postprocess_masks_threshold: 0.5
    dilation: 1
    use_gn: false

  roi_relation_head:
    predictor: REACTPlusPlusPredictor
    feature_extractor: P5SceneContextExtractor
    use_union_features: true
    use_spatial_features: true
    use_union_features_inference: true
    union_dropout: 0.0
    max_pairs_inference: 0
    textual_features_only: false
    visual_features_only: false
    logit_adjustment: false
    logit_adjustment_tau: 0.3
    pooling_all_levels: true
    batch_size_per_image: 512
    positive_fraction: 0.35
    use_gt_box: false
    use_gt_object_label: false
    embed_dim: 200
    context_dropout_rate: 0.2
    context_hidden_dim: 512
    context_pooling_dim: 4096
    context_obj_layer: 1
    context_rel_layer: 1
    mlp_head_dim: 512

    # `logit_adjustment_tau` below repeats a key of the parent section with a
    # different value, so it must live in this nested mapping.
    loss:
      loss_type: BalancedLogitAdjustedLoss
      beta: 0.999
      gamma: 0.0
      alpha: 0.15
      fg_boost: 2.0
      fg_weight: 1.0
      label_smoothing_epsilon: 0.01
      logit_adjustment_tau: 0.5
      bg_discount: 0.3
      ccl_weight: 0.1
      decisive_margin: 2.0
      poly_epsilon: 0.0
      # NOTE(review): the exact end of `loss` is not recoverable from the
      # flattened file. The five keys below are loss terms and are kept here;
      # if the schema defines them on `roi_relation_head` instead, dedent
      # them by one level.
      label_smoothing: 0.1
      sampler_aux_loss_weight: 0.1
      attn_entropy_weight: 0.01
      offset_reg_weight: 0.005
      containment_loss_weight: 0.02

    num_classes: 57
    decoder_depth: 1
    transformer_depth: 1
    num_rel_layers: 2
    use_scene_context: true
    use_geo_bias: true
    use_cls_emb: true
    use_geo_enc: true
    max_pairs_per_img: 512
    num_queries: 64
    use_cross_attention: true
    attn_type: standard
    geometric_loss_weight: 0.0
    num_sample_points: 6
    num_sample_heads: 6
    feature_strategy: multi_scale
    use_rmsnorm: true
    use_swiglu: true
    clip_rel_path: ''

    react_loss_weights:
      l21_loss: 1.0
      dist_loss2: 0.1
      loss_dis: 0.5

    transformer:
      dropout_rate: 0.1
      obj_layer: 4
      rel_layer: 2
      num_head: 8
      inner_dim: 2048
      key_dim: 64
      val_dim: 64

    squat_module:
      pre_norm: false
      num_decoder: 3
      rho: 0.35
      beta: 0.7
      pretrain_mask: false
      pretrain_mask_epoch: 1

    causal:
      effect_analysis: false
      fusion_type: sum
      context_layer: motifs
      separate_spatial: false
      effect_type: none
      spatial_for_vision: false

    label_smoothing_loss: false
    use_frequency_bias: false
    require_box_overlap: false
    num_sample_per_gt_rel: 8
    add_gtbox_to_proposal_in_train: false
    classifier: linear
    predict_use_vision: false
    use_bg_discounting: false
    bg_discounting_threshold: 0.1

  resnets:
    num_groups: 1
    width_per_group: 64
    stride_in_1x1: true
    trans_func: BottleneckWithFixedBatchNorm
    stem_func: StemWithFixedBatchNorm
    res5_dilation: 1
    backbone_out_channels: 1024
    res2_out_channels: 256
    stem_out_channels: 64
# Optimisation settings.
# Children are indented under `solver`; the scheduler options are grouped
# under the nested `schedule` mapping.
solver:
  # NOTE(review): `max_iter: 0` alongside `max_epoch: 10` presumably means the
  # run length is epoch-based — confirm against the trainer.
  max_iter: 0
  max_epoch: 10
  base_lr: 0.0001
  bias_lr_factor: 1
  momentum: 0.9
  weight_decay: 0.05
  weight_decay_bias: 0.0
  clip_norm: 5.0
  gamma: 0.5
  steps:
    - 41000
    - 50000
  warmup_factor: 0.1
  warmup_epochs: 1
  warmup_method: linear
  checkpoint_period: 250
  grad_norm_clip: 1.0
  print_grad_freq: 250
  to_val: true
  pre_val: true
  val_period: 250
  update_schedule_during_load: false
  ims_per_batch: 8
  optimizer: ADAMW
  slow_ratio: 10.0
  deform_offset_slow_ratio: 1.0
  muon_scaling: 0.2
  adamw_scaling: 0.8

  schedule:
    type: WarmupCosineAnnealingIterLR
    patience: 2
    threshold: 0.0001
    cooldown: 1
    factor: 0.5
    max_decay_step: 7
    # NOTE(review): the end of `schedule` is not recoverable from the
    # flattened file. `eta_min` and `plateau_epochs` are scheduler parameters
    # and are kept here; dedent them if the schema defines them on `solver`.
    eta_min: 5.0e-07
    plateau_epochs: 5

  # NOTE(review): assumed to be a solver-level key (gradient accumulation is
  # not a scheduler option) — confirm.
  accum_steps: 4
# Evaluation / inference settings.
# Children are indented under `test`, with `bbox_aug` and `relation` as nested
# mappings, so keys like `ims_per_batch` and `detections_per_img` stay scoped
# to this section.
test:
  expected_results: []
  expected_results_sigma_tol: 4
  ims_per_batch: 1
  detections_per_img: 100
  informative: false

  bbox_aug:
    enabled: false
    h_flip: false
    scales: []
    max_size: 4000
    scale_h_flip: false

  save_proposals: false

  relation:
    multiple_preds: false
    iou_threshold: 0.5
    require_overlap: false
    later_nms_prediction_thres: 0.5
    sync_gather: true

  allow_load_from_cache: false
  top_k: 100
  # NOTE(review): `custum` looks like a misspelling of `custom`, but it is the
  # key name as written; renaming it would require the consumer to change too.
  custum_eval: false
  custum_path: ''
# Global settings, with the GCL options grouped under the nested
# `gcl_setting` mapping.
global_setting:
  basic_encoder: Cross-Attention
  gcl_setting:
    group_split_mode: divide4
    knowledge_transfer_mode: KL_logit_TopDown
    no_relation_restrain: false
    zero_label_padding_mode: false
    knowledge_loss_coefficient: 1.0