| | import argparse |
| | import time |
| | import yaml |
| | import os |
| | import numpy as np |
| |
|
def _str2bool(value):
    """Parse a command-line token into a ``bool`` (for argparse ``type=``).

    ``type=bool`` would turn every non-empty string, including ``'False'``
    and ``'0'``, into ``True``; this helper interprets the usual spellings.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was with ``bool('')``.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


def _build_parser():
    """Create the ``ArgumentParser`` holding every command-line option."""
    parser = argparse.ArgumentParser()

    # ---- run configuration ----
    parser.add_argument('--cfg_path', type=str, required=True, help='config file')
    parser.add_argument('--id', type=str, default='',
                        help='id of this run. Results and logs will saved in this folder ./save/id')
    parser.add_argument('--gpu_id', type=str, nargs='+', default=[])
    parser.add_argument('--disable_tqdm', action='store_true')
    parser.add_argument('--seed', type=int, default=777)
    parser.add_argument('--random_seed', action='store_true',
                        help='choose a random seed from {1,...,1000}')
    parser.add_argument('--disable_cudnn', type=int, default=0,
                        help='disable cudnn may solve some unknown bugs')
    parser.add_argument('--debug', action='store_true',
                        help='using mini-dataset for fast debugging')
    parser.add_argument('--device', default='cuda', choices=['cpu', 'cuda'],
                        help='device to use for training / testing')
    parser.add_argument('--map', action='store_true', default=False,
                        help='map a100 data path to 3090 data path')

    # ---- input data and evaluation files ----
    parser.add_argument('--train_caption_file', type=str,
                        default='data/anet/captiondata/train_modified.json', help='')
    parser.add_argument('--invalid_video_json', type=str, nargs='+', default=[])
    parser.add_argument('--val_caption_file', type=str,
                        default='data/anet/captiondata/val_1.json')
    parser.add_argument('--visual_feature_folder', type=str, default='data/anet/resnet_bn')
    parser.add_argument('--text_feature_folder', type=str, default=None)
    # NOTE(review): the default is a plain string although nargs='+' produces
    # a list when the option is given explicitly; kept unchanged because the
    # consumers of this value are not visible here.
    parser.add_argument('--gt_file_for_auc', type=str, nargs='+',
                        default='data/anet/captiondata/val_all.json')
    parser.add_argument('--gt_file_for_eval', type=str, nargs='+',
                        default=['data/anet/captiondata/val_1.json',
                                 'data/anet/captiondata/val_2.json'])
    parser.add_argument('--gt_file_for_para_eval', type=str, nargs='+',
                        default=['data/anet/captiondata/para/anet_entities_val_1_para.json',
                                 'data/anet/captiondata/para/anet_entities_val_2_para.json'])
    parser.add_argument('--dict_file', type=str,
                        default='data/anet/vocabulary_activitynet.json', help='')
    parser.add_argument('--criteria_for_best_ckpt', type=str, default='overall',
                        choices=['dvc', 'pc', 'overall'],
                        help='for dense video captioning, use soda_c + METEOR as the criteria; '
                             'for paragraph captioning, choose the best para_METEOR+para_CIDEr+para_BLEU4; '
                             'for overall, select BLEU4 + METEOR + F1_score')

    # ---- visual features ----
    parser.add_argument('--visual_feature_type', type=str, default='c3d',
                        choices=['c3d', 'resnet_bn', 'resnet', 'UniVL', 'CLIP', 'CLIP-ViP'])
    parser.add_argument('--feature_dim', type=int, default=500,
                        help='dim of frame-level feature vector')

    # ---- resuming and pretrained weights ----
    parser.add_argument('--start_from', type=str, default='',
                        help='id of the run with incompleted training')
    parser.add_argument('--start_from_mode', type=str, choices=['best', 'last'], default="last")
    parser.add_argument('--pretrain', type=str, choices=['full', 'encoder', 'decoder'])
    parser.add_argument('--pretrain_path', type=str, default='', help='path of .pth')

    # ---- data loading ----
    parser.add_argument('--nthreads', type=int, default=4)
    parser.add_argument('--data_norm', type=int, default=0)
    parser.add_argument('--data_rescale', type=int, default=1)

    # ---- sampling ----
    parser.add_argument('--feature_sample_rate', type=int, default=1)
    parser.add_argument('--train_proposal_sample_num', type=int, default=24,
                        help='number of sampled proposals (or proposal sequence), a bigger value may be better')
    parser.add_argument('--gt_proposal_sample_num', type=int, default=30)
    parser.add_argument('--ft_gt_percent', type=float, default=1.0,
                        help='the percentage of gt samples used in pbox+gt setting. 1.0 means using all gt samples in yc2/tasty.')
    parser.add_argument('--pre_percent', type=float, default=1.0,
                        help='the percentage of gt samples used in pbox+gt setting. 1.0 means using all gt samples in yc2/tasty.')

    # ---- caption decoder ----
    parser.add_argument('--vocab_size', type=int, default=5747)
    parser.add_argument('--wordRNN_input_feats_type', type=str, default='C',
                        choices=['C', 'E', 'C+E'],
                        help='C:clip-level features, E: event-level features, C+E: both')
    parser.add_argument('--caption_decoder_type', type=str, default="light",
                        choices=['none', 'light', 'standard'])
    parser.add_argument('--rnn_size', type=int, default=512,
                        help='size of the rnn in number of hidden nodes in each layer')
    parser.add_argument('--num_layers', type=int, default=1,
                        help='number of layers in the RNN')
    parser.add_argument('--input_encoding_size', type=int, default=512,
                        help='the encoding size of each token in the vocabulary')
    parser.add_argument('--att_hid_size', type=int, default=512,
                        help='the hidden size of the attention MLP')
    parser.add_argument('--drop_prob', type=float, default=0.5,
                        help='strength of dropout in the Language Model RNN')
    parser.add_argument('--max_caption_len', type=int, default=30, help='')

    # ---- queries and matching costs ----
    parser.add_argument('--hidden_dim', type=int, default=512)
    parser.add_argument('--num_queries', type=int, default=100)
    parser.add_argument('--hidden_dropout_prob', type=float, default=0.5)
    parser.add_argument('--layer_norm_eps', type=float, default=1e-12)
    parser.add_argument('--caption_cost_type', type=str, default='loss')
    parser.add_argument('--set_cost_caption', type=float, default=0)
    parser.add_argument('--set_cost_class', type=float, default=1)
    parser.add_argument('--set_cost_bbox', type=float, default=5)
    parser.add_argument('--set_cost_giou', type=float, default=2)
    parser.add_argument('--cost_alpha', type=float, default=0.25)
    parser.add_argument('--cost_gamma', type=float, default=2)

    # ---- loss coefficients and transformer sizes ----
    parser.add_argument('--bbox_loss_coef', default=5, type=float)
    parser.add_argument('--giou_loss_coef', default=2, type=float)
    parser.add_argument('--count_loss_coef', default=0, type=float)
    parser.add_argument('--caption_loss_coef', default=0, type=float)
    parser.add_argument('--eos_coef', default=0.1, type=float,
                        help="Relative classification weight of the no-object class")
    parser.add_argument('--num_classes', type=int, default=1)
    parser.add_argument('--dec_layers', type=int, default=6)
    parser.add_argument('--enc_layers', type=int, default=6)
    parser.add_argument('--transformer_ff_dim', type=int, default=2048)
    parser.add_argument('--transformer_dropout_prob', type=float, default=0.1)
    parser.add_argument('--frame_embedding_num', type=int, default=100)
    parser.add_argument('--sample_method', type=str, default='nearest',
                        choices=['nearest', 'linear'])
    parser.add_argument('--fix_xcw', type=int, default=0)

    # ---- anchors ----
    parser.add_argument('--use_anchor', default=False, action='store_true')
    parser.add_argument('--random_anchor_init', default=True, action='store_false')
    parser.add_argument('--prior_anchor_duration_init', default=True, action='store_false')

    # ---- matcher and text encoder ----
    parser.add_argument('--matcher_type', type=str, default='default',
                        choices=['default', 'DTW', 'Sim'])
    parser.add_argument('--pretrained_language_model', type=str, default='UniVL',
                        choices=['UniVL', 'CLIP', 'CLIP-ViP'],
                        help='Pretrained hugging face model')
    parser.add_argument('--text_hidden_dim', type=int, default=768,
                        help='hidden dim of text encoder')
    parser.add_argument('--max_text_input_len', type=int, default=32, help='')
    parser.add_argument('--max_pos_num', type=int, default=500)
    parser.add_argument('--huggingface_cache_dir', type=str, default='.cache')
    # A one-element tuple needs the trailing comma; ('frozen') is just the
    # string 'frozen', which makes argparse validate against its characters
    # or substrings instead of the whole word.
    parser.add_argument('--text_encoder_learning_strategy', type=str, default='frozen',
                        choices=('frozen',))

    # ---- pseudo boxes ----
    parser.add_argument('--use_pseudo_box', default=False, action='store_true')
    parser.add_argument('--pseudo_box_type', type=str, default='similarity',
                        choices=['align', 'similarity', 'weight_sim', 'weight_index', 'modeframe'])

    parser.add_argument('--top_frames', type=int, default=15)
    parser.add_argument('--window_size', type=int, default=2)
    parser.add_argument('--statistic_mode', type=str, default='median',
                        choices=['mode', 'median'])
    parser.add_argument('--width_ratio', type=float, default=-1)
    parser.add_argument('--beta', type=float, default=1, help="weight for overlap loss")
    parser.add_argument('--width_th', type=float, default=0.5, help="threshold for width")
    parser.add_argument('--iteration', type=int, default=3,
                        help="iteration for pseudo box generation")

    parser.add_argument('--pseudo_box_aug', default=False, action='store_true')
    parser.add_argument('--pseudo_box_aug_num', type=int, default=5)
    parser.add_argument('--pseudo_box_aug_ratio', type=float, default=0.1)
    parser.add_argument('--pseudo_box_aug_mode', default='random',
                        choices=['random', 'uniform'])
    parser.add_argument('--refine_pseudo_box', default=False, action='store_true')
    parser.add_argument('--use_additional_score_layer', default=False, action='store_true')
    parser.add_argument('--use_additional_cap_layer', default=False, action='store_true')
    parser.add_argument('--merge_k_boxes', type=int, default=3)
    parser.add_argument('--merge_criterion', type=str,
                        choices=['cap_topk', 'ins_topk', 'ins_cap_topk'], default='cap_topk')
    # The two modes must be separate list items; a single fused string
    # 'weighted_sum, interpolate' rejects both values on the command line.
    parser.add_argument('--merge_mode', type=str,
                        choices=['weighted_sum', 'interpolate'], default='weighted_sum')
    parser.add_argument('--refine_pseudo_stage_num', type=int, default=2)
    parser.add_argument('--use_query_box_for_refine', default=False, action='store_true')
    parser.add_argument('--norm_ins_score', default='sigmoid', choices=['sigmoid', 'softmax'])
    parser.add_argument('--cap_prob_clip', default=False, action='store_true')
    parser.add_argument('--use_neg_pseudo_box', default=False, action='store_true')
    parser.add_argument('--num_neg_box', default=10, type=int)
    parser.add_argument('--weighted_mil_loss', default=False, action='store_true')
    parser.add_argument('--focal_mil', default=False, action='store_true')
    parser.add_argument('--disable_rematch', default=False, action='store_true')
    parser.add_argument('--start_refine_epoch', default=-1, type=int)

    # ---- alignment ----
    parser.add_argument('--align_keep_percentile', type=float, default=0.1)
    parser.add_argument('--align_top_band_size', type=int, default=0)
    parser.add_argument('--align_drop_z', type=int, default=0)
    parser.add_argument('--align_one_to_many', default=False, action='store_true')
    parser.add_argument('--align_many_to_one', default=False, action='store_true')
    parser.add_argument('--align_contiguous', default=False, action='store_true')

    parser.add_argument('--set_cost_sim', type=float, default=1.0)

    # ---- contrastive learning ----
    parser.add_argument('--enable_contrastive', default=False, action='store_true',
                        help='enable contrastive learning')
    parser.add_argument('--disable_contrastive_projection', default=False, action='store_true',
                        help='disable contrastive projection layers')
    parser.add_argument('--contrastive_hidden_size', type=int, default=128,
                        help='Contrastive hidden size')
    parser.add_argument('--contrastive_loss_start_coef', type=float, default=0.1,
                        help='Weight of contrastive loss')
    parser.add_argument('--contrastive_loss_temperature', type=float, default=0.1,
                        help='Temperature of cl temperature')
    # _str2bool instead of bool so that "--enable_cross_video_cl False"
    # really yields False.
    parser.add_argument('--enable_cross_video_cl', type=_str2bool, default=True,
                        help='Enable cross video contrastive loss')
    # NOTE(review): store_true combined with default=True means these two
    # flags are True whether or not they are passed; only a value coming from
    # the YAML config can turn them off. Left unchanged to keep the CLI stable.
    parser.add_argument('--enable_e2t_cl', default=True, action='store_true',
                        help=' enable event-to-text contrastive')
    parser.add_argument('--enable_bg_for_cl', default=True, action='store_true',
                        help=' add a class for background events')
    parser.add_argument('--set_cost_cl', type=float, default=0.0)
    parser.add_argument('--cl_schedule_val', type=float, nargs='+', default=[0, 0.1])
    parser.add_argument('--cl_schedule_time', type=int, nargs='+', default=[0, 2])

    parser.add_argument('--prior_manner', type=str, default='all', choices=['add', 'all'])

    # ---- optimization ----
    parser.add_argument('--training_scheme', type=str, default='all',
                        choices=['cap_head_only', 'no_cap_head', 'all'])
    parser.add_argument('--epoch', type=int, default=25)
    parser.add_argument('--batch_size', type=int, default=1, help='batch_size')
    parser.add_argument('--batch_size_for_eval', type=int, default=1, help='')
    parser.add_argument('--grad_clip', type=float, default=100.,
                        help='clip gradients at this value')
    parser.add_argument('--optimizer_type', type=str, default='adam')
    parser.add_argument('--weight_decay', type=float, default=0, help='weight_decay')

    parser.add_argument('--lr', type=float, default=1e-4,
                        help='1e-4 for resnet feature and 5e-5 for C3D feature')
    parser.add_argument('--learning_rate_decay_start', type=float, default=8)
    parser.add_argument('--learning_rate_decay_every', type=float, default=3)
    parser.add_argument('--learning_rate_decay_rate', type=float, default=0.5)

    # ---- checkpointing ----
    parser.add_argument('--min_epoch_when_save', type=int, default=-1)
    parser.add_argument('--save_checkpoint_every', type=int, default=1)
    parser.add_argument('--save_all_checkpoint', action='store_true')
    parser.add_argument('--save_dir', type=str,
                        default='/mnt/data/pjlab-3090-sport/wuhao/logs/dibs',
                        help='directory to store checkpointed models')

    # ---- per-group learning rates ----
    parser.add_argument('--lr_backbone_names', default=["None"], type=str, nargs='+')
    parser.add_argument('--lr_backbone', default=2e-5, type=float)
    parser.add_argument('--lr_proj', default=0, type=int)
    parser.add_argument('--lr_linear_proj_names',
                        default=['reference_points', 'sampling_offsets'], type=str, nargs='+')
    parser.add_argument('--lr_linear_proj_mult', default=0.1, type=float)

    parser.add_argument('--with_box_refine', default=False, action='store_true')
    parser.add_argument('--transformer_input_type', default='queries',
                        choices=['gt_proposals', 'prior_proposals', 'learnt_proposals', 'queries'])

    # ---- backbone and positional embedding ----
    parser.add_argument('--backbone', default=None, type=str,
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--dilation', action='store_true',
                        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    parser.add_argument('--position_embedding', default='sine', type=str,
                        choices=('sine', 'learned'),
                        help="Type of positional embedding to use on top of the image features")
    parser.add_argument('--position_embedding_scale', default=2 * np.pi, type=float,
                        help="position / size * scale")
    parser.add_argument('--num_feature_levels', default=4, type=int,
                        help='number of feature levels')

    # ---- attention heads ----
    parser.add_argument('--nheads', default=8, type=int,
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--dec_n_points', default=4, type=int)
    parser.add_argument('--enc_n_points', default=4, type=int)

    parser.add_argument('--share_caption_head', type=int, default=1)

    parser.add_argument('--cap_nheads', default=8, type=int)
    parser.add_argument('--cap_dec_n_points', default=4, type=int)
    parser.add_argument('--cap_num_feature_levels', default=4, type=int)
    parser.add_argument('--disable_mid_caption_heads', action='store_true')

    # ---- losses ----
    parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false',
                        help="Disables auxiliary decoding losses (loss at each layer)")

    parser.add_argument('--cls_loss_coef', default=2, type=float)
    parser.add_argument('--self_iou_loss_coef', default=0.0, type=float)
    parser.add_argument('--ref_rank_loss_coef', default=0.1, type=float)
    parser.add_argument('--mil_loss_coef', default=1.0, type=float)
    parser.add_argument('--focal_alpha', default=0.25, type=float)
    parser.add_argument('--focal_gamma', default=2., type=float)

    parser.add_argument('--max_eseq_length', default=10, type=int)
    parser.add_argument('--lloss_gau_mask', default=1, type=int)
    parser.add_argument('--lloss_beta', default=1, type=float)

    # ---- scheduled sampling ----
    parser.add_argument('--scheduled_sampling_start', type=int, default=-1,
                        help='at what iteration to start decay gt probability')
    parser.add_argument('--basic_ss_prob', type=float, default=0, help='initial ss prob')
    parser.add_argument('--scheduled_sampling_increase_every', type=int, default=2,
                        help='every how many iterations thereafter to gt probability')
    parser.add_argument('--scheduled_sampling_increase_prob', type=float, default=0.05,
                        help='How much to update the prob')
    parser.add_argument('--scheduled_sampling_max_prob', type=float, default=0.25,
                        help='Maximum scheduled sampling prob.')

    parser.add_argument('--ec_alpha', type=float, default=0.3)
    return parser


def parse_opts():
    """Parse ``sys.argv``, overlay the YAML config and return the namespace.

    Steps, in order:
      1. Parse the command line.
      2. If ``--cfg_path`` is non-empty, merge that YAML file into the
         namespace via ``import_cfg``; YAML values overwrite parsed values.
      3. With ``--random_seed``, draw a seed and append ``_seed<N>`` to the
         run id, redrawing while ``save_dir/<id>`` already exists.
      4. With ``--debug``, replace the id by a timestamped ``debug_`` id,
         force ``save_checkpoint_every`` to 1 and set ``shuffle`` to 0.
      5. Check that no caption loss/cost is requested when the caption
         decoder is ``'none'``.

    Returns:
        argparse.Namespace: the final options.

    Raises:
        SystemExit: raised by argparse on invalid command-line input.
        AssertionError: if caption loss/cost is non-zero without a decoder.
    """
    args = _build_parser().parse_args()

    if args.cfg_path:
        # vars(args) is the namespace's own dict, so the update is in place.
        import_cfg(args.cfg_path, vars(args))

    if args.random_seed:
        import random
        while True:
            # int(random.random() * 1000) yields values in 0..999.
            seed = int(random.random() * 1000)
            new_id = args.id + '_seed{}'.format(seed)
            if not os.path.exists(os.path.join(args.save_dir, new_id)):
                break
        args.id = new_id
        args.seed = seed

    if args.debug:
        args.id = 'debug_' + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
        args.save_checkpoint_every = 1
        args.shuffle = 0

    if args.caption_decoder_type == 'none':
        assert args.caption_loss_coef == 0, \
            'caption_loss_coef must be 0 when caption_decoder_type is none'
        assert args.set_cost_caption == 0, \
            'set_cost_caption must be 0 when caption_decoder_type is none'

    print("args.id: {}".format(args.id))
    return args
| |
|
def import_cfg(cfg_path, args):
    """Merge the YAML config at ``cfg_path`` into ``args`` in place.

    If the file contains a ``base_cfg_path`` key, that file is merged first
    (recursively), so values in the current file override those of its base.

    Args:
        cfg_path: path of the YAML file to read.
        args: mutable mapping updated with the loaded keys (the caller in
            this file passes ``vars(namespace)``).
    """
    with open(cfg_path, 'r') as handle:
        # NOTE(review): FullLoader can construct more than plain data types;
        # prefer yaml.safe_load if config files may come from untrusted sources.
        yml = yaml.load(handle, Loader=yaml.FullLoader)

    # An empty YAML document loads as None; treat it as "no overrides"
    # instead of failing on the membership test below.
    if yml is None:
        yml = {}

    if 'base_cfg_path' in yml:
        import_cfg(yml['base_cfg_path'], args)
    args.update(yml)
if __name__ == '__main__':
    # Script entry point: parse the options and print the resulting namespace.
    print(parse_opts())