# Upload yolo12l/config.yml with huggingface_hub
#
# a5ee45b verified about 1 month ago
#
# 7.54 kB

	seed: 42
	metric_to_track: mR
	dtype: float32
	output_dir: ./checkpoints/PSG/react++_yolo12l
	glove_dir: datasets
	verbose: INFO
	paths_catalog: ''
	paths_data: ''
	input:
	img_size: [640, 640] # [W, H]

	pixel_mean:
	- 102.9801
	- 115.9465
	- 122.7717
	pixel_std:
	- 1.0
	- 1.0
	- 1.0
	to_bgr255: true
	flip_prob_train: 0.5
	padding: true
	brightness: 0.15
	contrast: 0.15
	saturation: 0.1
	hue: 0.0
	vertical_flip_prob_train: 0.0
	datasets:
	name: "PSG"
	type: "coco"
	data_dir: "datasets/PSG/coco_format"
	dataloader:
	num_workers: 8
	size_divisibility: 32
	aspect_ratio_grouping: true
	model:
	flip_aug: false
	rpn_only: false
	mask_on: false
	attribute_on: false
	relation_on: true
	device: cuda
	meta_architecture: GeneralizedYOLO
	cls_agnostic_bbox_reg: false
	weight: ''
	pretrained_detector_ckpt: ./checkpoints/BACKBONES/last.pt
	text_embedding: glove.6B
	box_head: false
	backbone:
	type: yolo
	extra_config: ''
	freeze_conv_body_at: 2
	nms_thresh: 0.001
	freeze: true
	freeze_at: 10
	fpn:
	use_gn: false
	use_relu: false
	group_norm:
	dim_per_gp: -1
	num_groups: 32
	epsilon: 1.0e-05
	yolo:
	weights: ''
	size: yolo12l
	img_size: 640
	out_channels:
	- 256
	- 512
	- 512
	rpn:
	use_fpn: false
	rpn_mid_channel: 512
	anchor_sizes:
	- 32
	- 64
	- 128
	- 256
	- 512
	anchor_stride:
	- 16
	aspect_ratios:
	- 0.5
	- 1.0
	- 2.0
	straddle_thresh: 0
	fg_iou_threshold: 0.7
	bg_iou_threshold: 0.3
	batch_size_per_image: 256
	positive_fraction: 0.5
	pre_nms_top_n_train: 12000
	pre_nms_top_n_test: 6000
	post_nms_top_n_train: 2000
	post_nms_top_n_test: 1000
	min_size: 0
	fpn_post_nms_top_n_train: 2000
	fpn_post_nms_top_n_test: 2000
	fpn_post_nms_per_batch: true
	rpn_head: SingleConvRPNHead
	roi_heads:
	fg_iou_threshold: 0.35
	bg_iou_threshold: 0.3
	bbox_reg_weights:
	- 10.0
	- 10.0
	- 5.0
	- 5.0
	batch_size_per_image: 256
	positive_fraction: 0.25
	score_thresh: 0.01
	nms: 0.5
	post_nms_per_cls_topn: 300
	nms_filter_duplicates: false
	detections_per_img: 100
	roi_box_head:
	feature_extractor: DAMPBoxFeatureExtractor
	predictor: FastRCNNPredictor
	pooler_resolution: 14
	pooler_sampling_ratio: 0
	pooler_scales:
	- 0.0625
	mlp_head_dim: 256
	use_gn: false
	dilation: 1
	conv_head_dim: 256
	num_stacked_convs: 4
	num_classes: 134
	patch_size: 32
	feat_idx_multiscale: true
	feat_idx_neighbors: 1
	roi_attribute_head:
	feature_extractor: FPN2MLPFeatureExtractor
	predictor: FPNPredictor
	share_box_feature_extractor: true
	use_binary_loss: true
	attribute_loss_weight: 0.1
	num_attributes: 201
	max_attributes: 10
	attribute_bgfg_sample: true
	attribute_bgfg_ratio: 3
	pos_weight: 5.0
	roi_mask_head:
	feature_extractor: ResNet50Conv5ROIFeatureExtractor
	predictor: MaskRCNNC4Predictor
	pooler_resolution: 14
	pooler_sampling_ratio: 0
	pooler_scales:
	- 0.0625
	mlp_head_dim: 1024
	conv_layers:
	- 256
	- 256
	- 256
	- 256
	resolution: 14
	share_box_feature_extractor: true
	postprocess_masks: false
	postprocess_masks_threshold: 0.5
	dilation: 1
	use_gn: false
	roi_relation_head:
	predictor: REACTPlusPlusPredictor
	feature_extractor: P5SceneContextExtractor
	use_union_features: true
	use_spatial_features: true
	use_union_features_inference: true
	union_dropout: 0.0
	max_pairs_inference: 0
	textual_features_only: false
	visual_features_only: false
	logit_adjustment: false
	logit_adjustment_tau: 0.3
	pooling_all_levels: true
	batch_size_per_image: 512
	positive_fraction: 0.35
	use_gt_box: false
	use_gt_object_label: false
	embed_dim: 200
	context_dropout_rate: 0.2
	context_hidden_dim: 512
	context_pooling_dim: 4096
	context_obj_layer: 1
	context_rel_layer: 1
	mlp_head_dim: 512
	loss:
	loss_type: BalancedLogitAdjustedLoss
	beta: 0.999
	gamma: 0.0
	alpha: 0.15
	fg_boost: 2.0
	fg_weight: 1.0
	label_smoothing_epsilon: 0.01
	logit_adjustment_tau: 0.5
	bg_discount: 0.3
	ccl_weight: 0.1
	decisive_margin: 2.0
	poly_epsilon: 0.0
	label_smoothing: 0.1
	sampler_aux_loss_weight: 0.1
	attn_entropy_weight: 0.01
	offset_reg_weight: 0.005
	containment_loss_weight: 0.02
	num_classes: 57
	decoder_depth: 1
	transformer_depth: 1
	num_rel_layers: 2
	use_scene_context: true
	use_geo_bias: true
	use_cls_emb: true
	use_geo_enc: true
	max_pairs_per_img: 512
	num_queries: 64
	use_cross_attention: true
	attn_type: standard
	geometric_loss_weight: 0.0
	num_sample_points: 6
	num_sample_heads: 6
	feature_strategy: multi_scale
	use_rmsnorm: true
	use_swiglu: true
	clip_rel_path: ''
	react_loss_weights:
	l21_loss: 1.0
	dist_loss2: 0.1
	loss_dis: 0.5
	transformer:
	dropout_rate: 0.1
	obj_layer: 4
	rel_layer: 2
	num_head: 8
	inner_dim: 2048
	key_dim: 64
	val_dim: 64
	squat_module:
	pre_norm: false
	num_decoder: 3
	rho: 0.35
	beta: 0.7
	pretrain_mask: false
	pretrain_mask_epoch: 1
	causal:
	effect_analysis: false
	fusion_type: sum
	context_layer: motifs
	separate_spatial: false
	effect_type: none
	spatial_for_vision: false
	label_smoothing_loss: false
	use_frequency_bias: false
	require_box_overlap: false
	num_sample_per_gt_rel: 8
	add_gtbox_to_proposal_in_train: false
	classifier: linear
	predict_use_vision: false
	use_bg_discounting: false
	bg_discounting_threshold: 0.1
	resnets:
	num_groups: 1
	width_per_group: 64
	stride_in_1x1: true
	trans_func: BottleneckWithFixedBatchNorm
	stem_func: StemWithFixedBatchNorm
	res5_dilation: 1
	backbone_out_channels: 1024
	res2_out_channels: 256
	stem_out_channels: 64
	solver:
	max_iter: 0
	max_epoch: 10
	base_lr: 0.0001
	bias_lr_factor: 1
	momentum: 0.9
	weight_decay: 0.05
	weight_decay_bias: 0.0
	clip_norm: 5.0
	gamma: 0.5
	steps:
	- 41000
	- 50000
	warmup_factor: 0.1
	warmup_epochs: 1
	warmup_method: linear
	checkpoint_period: 250
	grad_norm_clip: 1.0
	print_grad_freq: 250
	to_val: true
	pre_val: true
	val_period: 250
	update_schedule_during_load: false
	ims_per_batch: 8
	optimizer: ADAMW
	slow_ratio: 10.0
	deform_offset_slow_ratio: 1.0
	muon_scaling: 0.2
	adamw_scaling: 0.8
	schedule:
	type: WarmupCosineAnnealingIterLR
	patience: 2
	threshold: 0.0001
	cooldown: 1
	factor: 0.5
	max_decay_step: 7
	eta_min: 5.0e-07
	plateau_epochs: 5
	accum_steps: 4
	test:
	expected_results: []
	expected_results_sigma_tol: 4
	ims_per_batch: 1
	detections_per_img: 100
	informative: false
	bbox_aug:
	enabled: false
	h_flip: false
	scales: []
	max_size: 4000
	scale_h_flip: false
	save_proposals: false
	relation:
	multiple_preds: false
	iou_threshold: 0.5
	require_overlap: false
	later_nms_prediction_thres: 0.5
	sync_gather: true
	allow_load_from_cache: false
	top_k: 100
	custum_eval: false
	custum_path: ''
	global_setting:
	basic_encoder: Cross-Attention
	gcl_setting:
	group_split_mode: divide4
	knowledge_transfer_mode: KL_logit_TopDown
	no_relation_restrain: false
	zero_label_padding_mode: false
	knowledge_loss_coefficient: 1.0