Exclibur commited on
Commit
7191c6b
·
verified ·
1 Parent(s): 1c42358

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. backup/cfgs/howto-anet_anet_clip_topk20_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  2. backup/cfgs/howto-anet_anet_clip_topk20_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml +19 -0
  3. backup/cfgs/howto-anet_anet_clip_topk20_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  4. backup/cfgs/howto-anet_anet_clip_topk20_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml +19 -0
  5. backup/cfgs/howto-anet_anet_clip_topk30_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  6. backup/cfgs/howto-anet_anet_clip_topk30_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml +19 -0
  7. backup/cfgs/howto-anet_anet_clip_topk30_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  8. backup/cfgs/howto-anet_anet_clip_topk30_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml +19 -0
  9. backup/cfgs/howto-anet_anet_clip_topk40_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  10. backup/cfgs/howto-anet_anet_clip_topk40_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml +19 -0
  11. backup/cfgs/howto-anet_anet_clip_topk40_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  12. backup/cfgs/howto-anet_anet_clip_topk40_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml +19 -0
  13. backup/cfgs_base/howto/base_howto-anet_anet_mixlm.yml +64 -0
  14. backup/cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml +61 -0
  15. backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_CLIP_mixlm.yml +42 -0
  16. backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_CLIP_refine_mixlm.yml +46 -0
  17. backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_UniVL_mixlm.yml +42 -0
  18. backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_UniVL_refine_mixlm.yml +46 -0
  19. backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_mixlm.yml +42 -0
  20. backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_mixlm.yml +46 -0
  21. backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_UniVL_mixlm.yml +42 -0
  22. backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_UniVL_refine_mixlm.yml +46 -0
  23. backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_CLIP_mixlm.yml +42 -0
  24. backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_CLIP_refine_mixlm.yml +46 -0
  25. backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_UniVL_mixlm.yml +42 -0
  26. backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_UniVL_refine_mixlm.yml +46 -0
  27. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_CLIP_mixlm.yml +42 -0
  28. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_CLIP_refine_mixlm.yml +46 -0
  29. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_CLIP_refine_mixlm_v0.yml +46 -0
  30. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_mixlm.yml +42 -0
  31. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_puyu.yml +1 -1
  32. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_refine_mixlm.yml +46 -0
  33. backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_refine_puyu.yml +1 -1
  34. backup/cfgs_base/howto/howto_yc2_ori_(sim_op_order_v2)_UniVL_refine.yml +1 -1
  35. backup/cfgs_ft_gt/howto-anet_anet_clip_topk30_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  36. backup/cfgs_ft_gt/howto-anet_anet_clip_topk30_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap_rand2.yml +19 -0
  37. backup/cfgs_ft_gt/howto-yc2_yc2_univl_topk25_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap.yml +19 -0
  38. backup/cfgs_ft_gt/howto-yc2_yc2_univl_topk25_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap_rand2.yml +19 -0
  39. backup/change_config_add.py +8 -6
  40. backup/misc/__pycache__/utils.cpython-38.pyc +0 -0
  41. backup/misc/utils.py +7 -2
  42. backup/opts.py +1 -0
  43. backup/pdvc/__pycache__/pdvc.cpython-38.pyc +0 -0
  44. backup/pdvc/__pycache__/video_segmentation.cpython-38.pyc +0 -0
  45. backup/pdvc/pdvc.py +2 -0
  46. backup/pdvc/video_segmentation.py +59 -0
  47. backup/train.py +102 -11
  48. backup/train_fewshot.py +1 -1
  49. backup/train_ft2_gt.py +6 -5
  50. backup/train_pre_ft_gt.py +2 -2
backup/cfgs/howto-anet_anet_clip_topk20_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 20
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk20_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_puyu.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 20
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk20_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 20
13
+ width_ratio: 2
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk20_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_puyu.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 20
13
+ width_ratio: 2
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk30_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 30
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk30_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_puyu.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 30
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk30_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 30
13
+ width_ratio: 2
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk30_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_puyu.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 30
13
+ width_ratio: 2
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk40_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 40
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk40_r1_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_puyu.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 40
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk40_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 40
13
+ width_ratio: 2
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs/howto-anet_anet_clip_topk40_r2_iter3_th2_refine_aug(8,0.02)_top3_2stage_inscap_puyu.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_puyu.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 40
13
+ width_ratio: 2
14
+ iteration: 3
15
+ width_th: 2
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs_base/howto/base_howto-anet_anet_mixlm.yml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: anet
2
+
3
+ visual_feature_type: c3d
4
+ visual_feature_folder: 'data/anet/features/c3d'
5
+ feature_dim: 500
6
+ invalid_video_json: []
7
+ train_proposal_file: data/generated_proposals/dbg_trainval_top100.json
8
+ eval_proposal_file: data/generated_proposals/dbg_trainval_top100.json
9
+ gt_file_for_eval: ['data/anet/captiondata/val_1.json', 'data/anet/captiondata/val_2.json']
10
+ gt_file_for_para_eval: ['data/anet/captiondata/para/anet_entities_val_1_para.json', 'data/anet/captiondata/para/anet_entities_val_2_para.json']
11
+ train_caption_file: ['data/howto/captiondata/howto100m_train_mixlm.json', 'data/anet/captiondata/train_modified.json']
12
+ val_caption_file: 'data/anet/captiondata/val_1.json'
13
+
14
+ max_caption_len: 50
15
+
16
+ dict_file: data/howto/vocabulary_howto_rate2_mixlm_anet.json
17
+ vocab_size: 18884
18
+ # dict_file_for_sim: data/howto/vocabulary_howto_rate5.json
19
+ # vocab_size: 8531
20
+
21
+
22
+ train_proposal_type: gt
23
+ train_proposal_sample_num: 30
24
+ sample_method: nearest
25
+
26
+ epoch: 10
27
+ batch_size: 1
28
+ lr: 0.00005
29
+ learning_rate_decay_start: 8
30
+ learning_rate_decay_every: 3
31
+ learning_rate_decay_rate: 0.5
32
+ weight_decay: 0.0001
33
+ save_all_checkpoint: 0
34
+
35
+ num_queries: 100
36
+ dec_layers: 2
37
+ enc_layers: 2
38
+ transformer_ff_dim: 512
39
+ transformer_dropout_prob: 0.1
40
+ frame_embedding_num: 100
41
+ caption_decoder_type: light
42
+ att_hid_size: 0
43
+
44
+ with_box_refine: 1
45
+
46
+ fix_xcw: 1
47
+ set_cost_caption: 0
48
+ set_cost_giou: 4
49
+ set_cost_bbox: 0
50
+ set_cost_class: 2
51
+ self_iou_loss_coef: 0
52
+ #cost_alpha: 0.5
53
+ #cost_gamma: 1
54
+ #focal_alpha: 0.5
55
+ #focal_gamma: 1
56
+ caption_loss_coef: 2
57
+ giou_loss_coef: 4
58
+ bbox_loss_coef: 0
59
+ cls_loss_coef: 2
60
+ count_loss_coef: 0.5
61
+ max_eseq_length: 10
62
+ lloss_cross_entropy: 0
63
+ lloss_focal_loss: 0
64
+ lloss_gau_mask: 1
backup/cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: yc2_tsn_pdvcl
2
+
3
+ visual_feature_type: ['resnet', 'bn']
4
+ visual_feature_folder: ['data/yc2/features/resnet_bn/', 'data/yc2/features/resnet_bn/']
5
+ feature_dim: 3072
6
+ invalid_video_json: []
7
+ train_caption_file: ['data/howto/captiondata/howto100m_train_mixlm.json', 'data/yc2/captiondata/yc2_train.json']
8
+ val_caption_file: 'data/yc2/captiondata/yc2_val.json'
9
+ gt_file_for_eval: ['data/yc2/captiondata/yc2_val.json']
10
+ gt_file_for_para_eval: ['data/yc2/captiondata/para/para_yc2_val.json']
11
+ max_caption_len: 50
12
+
13
+ dict_file: data/howto/vocabulary_howto_rate2_mixlm_yc2.json
14
+ vocab_size: 17447
15
+ # dict_file_for_sim: data/howto/vocabulary_howto_rate5.json
16
+ # vocab_size: 8531
17
+
18
+
19
+ train_proposal_type: gt
20
+ train_proposal_sample_num: 30
21
+ sample_method: nearest
22
+
23
+ epoch: 10
24
+ batch_size: 1
25
+ lr: 0.00005
26
+ learning_rate_decay_start: 8
27
+ learning_rate_decay_every: 3
28
+ learning_rate_decay_rate: 0.5
29
+ weight_decay: 0.0001
30
+ save_all_checkpoint: 0
31
+
32
+ num_queries: 100
33
+ dec_layers: 2
34
+ enc_layers: 2
35
+ transformer_ff_dim: 512
36
+ transformer_dropout_prob: 0.1
37
+ frame_embedding_num: 200
38
+ caption_decoder_type: light
39
+ att_hid_size: 0
40
+
41
+ with_box_refine: 1
42
+
43
+ fix_xcw: 1
44
+ set_cost_caption: 0
45
+ set_cost_giou: 4
46
+ set_cost_bbox: 0
47
+ set_cost_class: 2
48
+ self_iou_loss_coef: 0
49
+ #cost_alpha: 0.5
50
+ #cost_gamma: 1
51
+ #focal_alpha: 0.5
52
+ #focal_gamma: 1
53
+ caption_loss_coef: 2
54
+ giou_loss_coef: 4
55
+ bbox_loss_coef: 0
56
+ cls_loss_coef: 2
57
+ count_loss_coef: 0.5
58
+ max_eseq_length: 20
59
+ lloss_cross_entropy: 0
60
+ lloss_focal_loss: 0
61
+ lloss_gau_mask: 1
backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_CLIP_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 1
26
+ pretrained_language_model: CLIP
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_CLIP_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 2
25
+ top_frames: 35
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 1
30
+ pretrained_language_model: CLIP
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_UniVL_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 1
26
+ pretrained_language_model: UniVL
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_anc_(sim_op_order_v2)_UniVL_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 2
25
+ top_frames: 35
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 1
30
+ pretrained_language_model: UniVL
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 0
26
+ pretrained_language_model: CLIP
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/CLIP_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 2
25
+ top_frames: 35
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 0
30
+ pretrained_language_model: CLIP
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_UniVL_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 0
26
+ pretrained_language_model: UniVL
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_UniVL_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-anet_anet_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet/UniVL_feature/text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 2
25
+ top_frames: 35
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 0
30
+ pretrained_language_model: UniVL
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_CLIP_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 1
26
+ pretrained_language_model: CLIP
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_CLIP_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 3
25
+ top_frames: 10
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 1
30
+ pretrained_language_model: CLIP
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_UniVL_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 1
26
+ pretrained_language_model: UniVL
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_anc_(sim_op_order_v2)_UniVL_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 3
25
+ top_frames: 10
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 1
30
+ pretrained_language_model: UniVL
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_CLIP_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 0
26
+ pretrained_language_model: CLIP
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_CLIP_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 3
25
+ top_frames: 10
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 0
30
+ pretrained_language_model: CLIP
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_CLIP_refine_mixlm_v0.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['CLIP']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
6
+ text_feature_folder: ['/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/video/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 3
25
+ top_frames: 10
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 12
28
+
29
+ use_anchor: 0
30
+ pretrained_language_model: CLIP
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_mixlm.yml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 0
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 0
17
+ refine_pseudo_stage_num: 1
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ use_query_box_for_refine: 0
23
+ gt_proposal_sample_num: 12
24
+
25
+ use_anchor: 0
26
+ pretrained_language_model: UniVL
27
+ disable_contrastive_projection: 1
28
+
29
+ caption_decoder_type: standard
30
+ cap_nheads: 1
31
+ cap_dec_n_points: 4
32
+ cap_num_feature_levels: 4
33
+ soft_attention: 1
34
+ att_hid_size: 512
35
+
36
+ num_queries: 100
37
+
38
+ ec_alpha: 1.0
39
+
40
+ self_iou_loss_coef: 0.0
41
+ ref_rank_loss_coef: 0.0
42
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_puyu.yml CHANGED
@@ -3,7 +3,7 @@ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_puyu.yml
3
 
4
  visual_feature_type: ['UniVL']
5
  visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
- text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
  visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
  text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
  feature_dim: 768
 
3
 
4
  visual_feature_type: ['UniVL']
5
  visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
+ text_feature_folder: ['/mnt/data/pjlab-3090-sport/wuhao/features/howto100m/univl_features/text_puyu', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
  visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
  text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
  feature_dim: 768
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_refine_mixlm.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_mixlm.yml
3
+
4
+ visual_feature_type: ['UniVL']
5
+ visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
+ text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
+ visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
+ text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
+ feature_dim: 768
10
+ hidden_dim: 512
11
+
12
+ use_pseudo_box: 1
13
+ pseudo_box_aug: 1
14
+ pseudo_box_aug_num: 5
15
+ pseudo_box_aug_ratio: 0.3
16
+ refine_pseudo_box: 1
17
+ refine_pseudo_stage_num: 2
18
+ merge_k_boxes: 3
19
+ pseudo_box_type: similarity_op_order_v2
20
+ iteration: 3
21
+ width_th: 2
22
+ statistic_mode: mode
23
+ width_ratio: 1
24
+ window_size: 3
25
+ top_frames: 10
26
+ use_query_box_for_refine: 0
27
+ gt_proposal_sample_num: 30
28
+
29
+ use_anchor: 0
30
+ pretrained_language_model: UniVL
31
+ disable_contrastive_projection: 1
32
+
33
+ caption_decoder_type: standard
34
+ cap_nheads: 1
35
+ cap_dec_n_points: 4
36
+ cap_num_feature_levels: 4
37
+ soft_attention: 1
38
+ att_hid_size: 512
39
+
40
+ num_queries: 100
41
+
42
+ ec_alpha: 1.0
43
+
44
+ self_iou_loss_coef: 0.0
45
+ ref_rank_loss_coef: 0.0
46
+ contrastive_loss_start_coef: 0.0
backup/cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_refine_puyu.yml CHANGED
@@ -3,7 +3,7 @@ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2_puyu.yml
3
 
4
  visual_feature_type: ['UniVL']
5
  visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
- text_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
  visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
  text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
  feature_dim: 768
 
3
 
4
  visual_feature_type: ['UniVL']
5
  visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
6
+ text_feature_folder: ['/mnt/data/pjlab-3090-sport/wuhao/features/howto100m/univl_features/text_puyu', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
7
  visual_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_visual/']
8
  text_feature_folder_val: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text/']
9
  feature_dim: 768
backup/cfgs_base/howto/howto_yc2_ori_(sim_op_order_v2)_UniVL_refine.yml CHANGED
@@ -1,5 +1,5 @@
1
  id: refine_aug(5,0.3)_top3_1stage
2
- base_cfg_path: cfgs_base/howto/base_howto_yc2.yml
3
 
4
  visual_feature_type: ['UniVL']
5
  visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual']
 
1
  id: refine_aug(5,0.3)_top3_1stage
2
+ base_cfg_path: cfgs_base/howto/base_howto-yc2_yc2.yml
3
 
4
  visual_feature_type: ['UniVL']
5
  visual_feature_folder: ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/visual']
backup/cfgs_ft_gt/howto-anet_anet_clip_topk30_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 30
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 1
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs_ft_gt/howto-anet_anet_clip_topk30_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap_rand2.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-anet_anet_ori_(sim_op_order_v2)_CLIP_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 30
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 1
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs_ft_gt/howto-yc2_yc2_univl_topk25_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 25
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 1
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/cfgs_ft_gt/howto-yc2_yc2_univl_topk25_r1_iter3_th1_refine_aug(8,0.02)_top3_2stage_inscap_rand2.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id: ''
2
+ base_cfg_path: cfgs_base/howto/howto-yc2_yc2_ori_(sim_op_order_v2)_UniVL_refine.yml
3
+
4
+
5
+ pseudo_box_aug_num: 8
6
+ pseudo_box_aug_ratio: 0.02
7
+ pseudo_box_aug_mode: random_range
8
+ refine_pseudo_box: 1
9
+ refine_pseudo_stage_num: 2
10
+ merge_k_boxes: 3
11
+ pseudo_box_type: similarity_op_order_v2
12
+ top_frames: 25
13
+ width_ratio: 1
14
+ iteration: 3
15
+ width_th: 1
16
+ use_query_box_for_refine: 0
17
+ gt_proposal_sample_num: 20
18
+ mil_loss_coef: 0
19
+ merge_criterion: ins_cap_topk
backup/change_config_add.py CHANGED
@@ -12,10 +12,12 @@ args = parser.parse_args()
12
 
13
 
14
  # Define the folder containing YAML files
15
- folder_path = 'cfgs_ref'
16
  # folder_path = 'cfgs_base/anet'
17
  # folder_path = 'cfgs'
18
- file_filter = 'yc2'
 
 
19
 
20
 
21
 
@@ -24,18 +26,18 @@ file_filter = 'yc2'
24
  # find_string = '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj'
25
  # find_string = 'data/yc2/captiondata/yc2'
26
  # find_string = "/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text"
27
- find_string = "UniVL_refine"
28
  # find_string = "pdvc_mode: 0"
29
 
30
  # replace_string = '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/Tasty/features/clip/visual'
31
  # replace_string = '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/Tasty/features/clip/text'
32
  # replace_string = 'data/tasty/captiondata/tasty'
33
  # replace_string = "cfgs_base/tasty/tasty_tsn_pdvcl.yml"
34
- replace_string = "CLIP_refine"
35
  # replace_string = "pdvc_mode: 1"
36
 
37
- old_name = 'univl'
38
- new_name = 'clip'
39
 
40
  def replace_yaml(yaml_file_path, new_file_path, old_string, new_string):
41
  # Read the YAML file as text
 
12
 
13
 
14
  # Define the folder containing YAML files
15
+ # folder_path = 'cfgs_ref'
16
  # folder_path = 'cfgs_base/anet'
17
  # folder_path = 'cfgs'
18
+ folder_path = 'cfgs_yc2_ft_perc'
19
+
20
+ file_filter = ''
21
 
22
 
23
 
 
26
  # find_string = '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/clip_features/text_proj'
27
  # find_string = 'data/yc2/captiondata/yc2'
28
  # find_string = "/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2/UniVL_features/UniVL_text"
29
+ find_string = "ft_gt_percent: 0.25"
30
  # find_string = "pdvc_mode: 0"
31
 
32
  # replace_string = '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/Tasty/features/clip/visual'
33
  # replace_string = '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/Tasty/features/clip/text'
34
  # replace_string = 'data/tasty/captiondata/tasty'
35
  # replace_string = "cfgs_base/tasty/tasty_tsn_pdvcl.yml"
36
+ replace_string = "ft_gt_percent: 0.75"
37
  # replace_string = "pdvc_mode: 1"
38
 
39
+ old_name = 'perc0.25'
40
+ new_name = 'perc0.75'
41
 
42
  def replace_yaml(yaml_file_path, new_file_path, old_string, new_string):
43
  # Read the YAML file as text
backup/misc/__pycache__/utils.cpython-38.pyc CHANGED
Binary files a/backup/misc/__pycache__/utils.cpython-38.pyc and b/backup/misc/__pycache__/utils.cpython-38.pyc differ
 
backup/misc/utils.py CHANGED
@@ -241,7 +241,7 @@ def build_folder(opt):
241
  save_foldername += '_C-layer'
242
  if 'puyu' in opt.train_caption_file[0]:
243
  save_foldername += '_puyu'
244
- elif 'mix' in opt.train_caption_file[0]:
245
  save_foldername += '_mixlm'
246
 
247
  if opt.id != '':
@@ -281,8 +281,13 @@ def build_folder(opt):
281
  return save_folder
282
 
283
 
284
- def backup_envir(save_folder):
 
 
285
  backup_folders = ['cfgs_base', 'cfgs', 'misc', 'pdvc']
 
 
 
286
  backup_files = glob.glob('./*.py')
287
  for folder in backup_folders:
288
  shutil.copytree(folder, os.path.join(save_folder, 'backup', folder))
 
241
  save_foldername += '_C-layer'
242
  if 'puyu' in opt.train_caption_file[0]:
243
  save_foldername += '_puyu'
244
+ elif 'mixlm' in opt.train_caption_file[0]:
245
  save_foldername += '_mixlm'
246
 
247
  if opt.id != '':
 
281
  return save_folder
282
 
283
 
284
+ def backup_envir(save_folder, opt):
285
+ cfg_path = opt.cfg_path
286
+ dir_path = os.path.dirname(cfg_path)
287
  backup_folders = ['cfgs_base', 'cfgs', 'misc', 'pdvc']
288
+ if dir_path not in backup_folders:
289
+ backup_folders.append(dir_path)
290
+
291
  backup_files = glob.glob('./*.py')
292
  for folder in backup_folders:
293
  shutil.copytree(folder, os.path.join(save_folder, 'backup', folder))
backup/opts.py CHANGED
@@ -269,6 +269,7 @@ def parse_opts():
269
 
270
  # reranking
271
  parser.add_argument('--ec_alpha', type=float, default=0.3)
 
272
  args = parser.parse_args()
273
 
274
  if args.cfg_path:
 
269
 
270
  # reranking
271
  parser.add_argument('--ec_alpha', type=float, default=0.3)
272
+ parser.add_argument('--test', action='store_true', default=False)
273
  args = parser.parse_args()
274
 
275
  if args.cfg_path:
backup/pdvc/__pycache__/pdvc.cpython-38.pyc CHANGED
Binary files a/backup/pdvc/__pycache__/pdvc.cpython-38.pyc and b/backup/pdvc/__pycache__/pdvc.cpython-38.pyc differ
 
backup/pdvc/__pycache__/video_segmentation.cpython-38.pyc CHANGED
Binary files a/backup/pdvc/__pycache__/video_segmentation.cpython-38.pyc and b/backup/pdvc/__pycache__/video_segmentation.cpython-38.pyc differ
 
backup/pdvc/pdvc.py CHANGED
@@ -316,6 +316,8 @@ class PDVC(nn.Module):
316
  video_step_alignment = [align_frame_into_steps_op_v1(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), topk=self.opt.top_frames, scale=self.opt.width_ratio, beta=1, order=True, num_iterations=self.opt.iteration) for i in range(N)]
317
  elif self.opt.pseudo_box_type == 'similarity_op_order_v2':
318
  video_step_alignment = [align_frame_into_steps_op_order_v2(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), topk=self.opt.top_frames, threshold=self.opt.width_th, ratio=self.opt.width_ratio, iteration=self.opt.iteration) for i in range(N)]
 
 
319
  elif self.opt.pseudo_box_type == 'weight_sim':
320
  if self.opt.width_ratio < 0:
321
  video_step_alignment = [step_retrieval_weight_sim(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), \
 
316
  video_step_alignment = [align_frame_into_steps_op_v1(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), topk=self.opt.top_frames, scale=self.opt.width_ratio, beta=1, order=True, num_iterations=self.opt.iteration) for i in range(N)]
317
  elif self.opt.pseudo_box_type == 'similarity_op_order_v2':
318
  video_step_alignment = [align_frame_into_steps_op_order_v2(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), topk=self.opt.top_frames, threshold=self.opt.width_th, ratio=self.opt.width_ratio, iteration=self.opt.iteration) for i in range(N)]
319
+ elif self.opt.pseudo_box_type == 'similarity_op_v2':
320
+ video_step_alignment = [align_frame_into_steps_op_v2(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), topk=self.opt.top_frames, threshold=self.opt.width_th, ratio=self.opt.width_ratio, iteration=self.opt.iteration) for i in range(N)]
321
  elif self.opt.pseudo_box_type == 'weight_sim':
322
  if self.opt.width_ratio < 0:
323
  video_step_alignment = [step_retrieval_weight_sim(dt['video_tensor'][i], raw_text_embed[i].to(memory.device), \
backup/pdvc/video_segmentation.py CHANGED
@@ -632,6 +632,65 @@ def align_frame_into_steps_op_order_v2(frame_features, step_features, topk=15, t
632
 
633
  return (best_bbox, min_loss)
634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
 
637
  # pseudo box 4: based on fixed window. the result is bad. give up
 
632
 
633
  return (best_bbox, min_loss)
634
 
635
+ def align_frame_into_steps_op_v2(frame_features, step_features, topk=15, threshold=0.5, ratio=1, iteration=3):
636
+ # breakpoint()
637
+ if step_features.shape[0] == 0:
638
+ return -np.ones(frame_features.shape[0])
639
+
640
+ sim = compute_sim(step_features, frame_features, config_eval_l2norm).cpu()
641
+ sorted_index = torch.argsort(-sim, dim=1)
642
+ top_indices_list_global = [sorted_index[i][:topk] for i in range(sim.shape[0])]
643
+ top_values_list_global = [sim[i][top_indices_list_global[i]] for i in range(sim.shape[0])]
644
+
645
+
646
+ uniform_boxes = uniform_window(frame_features.shape[0], step_features.shape[0])
647
+
648
+ iter_bbox_loss = {}
649
+ for iter in range(iteration):
650
+ # if iter == 0:
651
+ # refined_uniform_boxes = expand_window(uniform_boxes, frame_features.shape[0], step_features.shape[0], ratio)
652
+ # else:
653
+ # refined_uniform_boxes = expand_window(bbox, frame_features.shape[0], step_features.shape[0], ratio) # last bbox
654
+
655
+
656
+ # global: from all frames, local: from refined uniform boxes
657
+
658
+ # top_indices_list_local = [sorted_index[i][(sorted_index[i] >= refined_uniform_boxes[i][0]) & (sorted_index[i] <= refined_uniform_boxes[i][1])][:topk] for i in range(sim.shape[0])]
659
+ # top_values_list_local = [sim[i][top_indices_list_local[i]] for i in range(sim.shape[0])]
660
+
661
+ # size_local = [len(top_indices_list_local[i]) for i in range(sim.shape[0])]
662
+ # if sum(size_local) < (topk-2) * len(size_local):
663
+ # top_indices_list = top_indices_list_global
664
+ # top_values_list = top_values_list_global
665
+ # else:
666
+ # top_indices_list = top_indices_list_local
667
+ # top_values_list = top_values_list_local
668
+
669
+ # top_indices_list = [top_indices_list_global[i] if len(top_indices_list_local[i]) < topk else top_indices_list_local[i] for i in range(sim.shape[0])]
670
+
671
+ bbox = []
672
+ for i in range(len(top_indices_list_global)):
673
+ filtered_indices = compute_filtered_indices(top_indices_list_global[i].tolist(), top_values_list_global[i].tolist(), threshold)
674
+ if len(filtered_indices) == 0:
675
+ filtered_indices = compute_filtered_indices(top_indices_list_global[i].tolist(), top_indices_list_global[i].tolist(), threshold)
676
+ if len(filtered_indices) == 0:
677
+ bbox.append(uniform_boxes[i])
678
+ continue
679
+ bbox.append([min(filtered_indices), max(filtered_indices)])
680
+
681
+ # compute bbox loss
682
+ bbox_loss_list = [compute_bbox_loss(top_indices_list_global[i], bbox[i], top_values_list_global[i]) for i in range(len(top_indices_list_global))]
683
+ bbox_loss = sum(bbox_loss_list)
684
+ iter_bbox_loss[iter] = {'loss': bbox_loss, 'bbox': bbox}
685
+
686
+ # select the minimum bbox loss and bbox as output
687
+ min_loss_iter = min(iter_bbox_loss.keys(), key=lambda k: iter_bbox_loss[k]['loss'])
688
+ min_loss = iter_bbox_loss[min_loss_iter]['loss']
689
+ best_bbox = iter_bbox_loss[min_loss_iter]['bbox']
690
+
691
+
692
+ return (best_bbox, min_loss)
693
+
694
 
695
 
696
  # pseudo box 4: based on fixed window. the result is bad. give up
backup/train.py CHANGED
@@ -48,8 +48,8 @@ def construct_save_path(opt, save_folder="/mnt/data/pjlab-3090-sport/wuhao/code/
48
  if len(opt.train_caption_file) == 2:
49
  if 'puyu' in opt.train_caption_file[0]:
50
  elements.append('howto_puyu')
51
- elif 'mix' in opt.train_caption_file[0]:
52
- elements.append('howto_mix')
53
  else:
54
  elements.append('howto_llama2')
55
  elements.append('howto')
@@ -65,8 +65,8 @@ def construct_save_path(opt, save_folder="/mnt/data/pjlab-3090-sport/wuhao/code/
65
  elif 'howto' in opt.train_caption_file:
66
  if 'puyu' in opt.train_caption_file:
67
  elements.append('howto_puyu')
68
- elif 'mix' in opt.train_caption_file:
69
- elements.append('howto_mix')
70
  else:
71
  elements.append('howto_llama2')
72
  # elements.append('howto')
@@ -114,8 +114,97 @@ def train(opt):
114
  logger = create_logger(save_folder, 'train.log')
115
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  if not opt.start_from:
118
- backup_envir(save_folder)
119
  logger.info('backup evironment completed !')
120
 
121
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
@@ -136,6 +225,8 @@ def train(opt):
136
  if prev_opt.get(opt_name) != vars(opt).get(opt_name):
137
  logger.info('Change opt {} : {} --> {}'.format(opt_name, prev_opt.get(opt_name),
138
  vars(opt).get(opt_name)))
 
 
139
  if len(opt.visual_feature_folder) == 2:
140
  train_dataset_1 = PropSeqDataset(opt.train_caption_file[0],
141
  [opt.visual_feature_folder[0]],
@@ -195,7 +286,6 @@ def train(opt):
195
  model.train()
196
 
197
  # try to load saved pbox
198
- saved_path = construct_save_path(opt)
199
  if os.path.exists(saved_path):
200
  try:
201
  with open(saved_path, 'r') as f:
@@ -322,10 +412,11 @@ def train(opt):
322
  # if dt['video_key'][0] != 'LGArj9Do0xc':
323
  # continue
324
  # # for fast debugging
325
- # if trained_samples > 5:
326
- # break
327
- # else:
328
- # trained_samples += 1
 
329
  # if trained_samples < 1714:
330
  # trained_samples += 1
331
  # continue
@@ -486,7 +577,7 @@ def train(opt):
486
 
487
  epoch += 1
488
 
489
- if epoch == 1 and model.pseudo_boxes is not None and 'hyper' not in opt.train_caption_file[0]:
490
  # save the pseudo boxes
491
  pbox_save_path = construct_save_path(opt)
492
  if not os.path.exists(pbox_save_path):
 
48
  if len(opt.train_caption_file) == 2:
49
  if 'puyu' in opt.train_caption_file[0]:
50
  elements.append('howto_puyu')
51
+ elif 'mixlm' in opt.train_caption_file[0]:
52
+ elements.append('howto_mixlm')
53
  else:
54
  elements.append('howto_llama2')
55
  elements.append('howto')
 
65
  elif 'howto' in opt.train_caption_file:
66
  if 'puyu' in opt.train_caption_file:
67
  elements.append('howto_puyu')
68
+ elif 'mixlm' in opt.train_caption_file:
69
+ elements.append('howto_mixlm')
70
  else:
71
  elements.append('howto_llama2')
72
  # elements.append('howto')
 
114
  logger = create_logger(save_folder, 'train.log')
115
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
116
 
117
+ # if use mixlm model
118
+ saved_path = construct_save_path(opt)
119
+
120
+ if 'mixlm' in saved_path:
121
+ # text_feature_folder_mixlm = os.path.join(save_folder, 'text_feature')
122
+ mixlm_pbox_path = construct_save_path(opt, save_folder='test').replace('.json', '').replace('test/', '')
123
+ text_feature_folder_mixlm = os.path.join('/mnt/data/Gvlab/wuhao/code/tmp', 'mix_text_feature', mixlm_pbox_path)
124
+ os.makedirs(text_feature_folder_mixlm, exist_ok=True)
125
+ if 'clip' in save_folder or 'CLIP' in save_folder:
126
+ text_feature_folder_llama2 = map_path('/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip/text_proj')
127
+ text_feature_folder_puyu = '/mnt/data/Gvlab/wuhao/code/clip_frame_feature_extraction/features/howto100m/clip'
128
+ elif 'univl' in save_folder or 'UniVL' in save_folder or 'Uni' in save_folder:
129
+ text_feature_folder_llama2 = map_path('/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/UniVL/text')
130
+ text_feature_folder_puyu = '/mnt/data/pjlab-3090-sport/wuhao/features/howto100m/univl_features/text_puyu'
131
+
132
+ if not os.path.exists(saved_path):
133
+ llama2_pbox_path = saved_path.replace('mixlm', 'llama2')
134
+ puyu_pbox_path = saved_path.replace('mixlm', 'puyu')
135
+ with open(llama2_pbox_path, 'r') as f:
136
+ llama2_pbox = json.load(f)
137
+ with open(puyu_pbox_path, 'r') as f:
138
+ puyu_pbox = json.load(f)
139
+
140
+ mixlm_pbox = {}
141
+ for video_key in llama2_pbox.keys():
142
+ if llama2_pbox.get(video_key) is None and puyu_pbox.get(video_key) is None:
143
+ mixlm_pbox[video_key] = None
144
+ elif llama2_pbox.get(video_key) is None:
145
+ mixlm_pbox[video_key] = {'box': puyu_pbox[video_key]['box'], 'loss': puyu_pbox[video_key]['loss'], 'llm': 'puyu'}
146
+ elif puyu_pbox.get(video_key) is None:
147
+ mixlm_pbox[video_key] = {'box': llama2_pbox[video_key]['box'], 'loss': llama2_pbox[video_key]['loss'], 'llm': 'llama2'}
148
+ else:
149
+ if llama2_pbox[video_key]['loss'] < puyu_pbox[video_key]['loss']:
150
+ mixlm_pbox[video_key] = {'box': llama2_pbox[video_key]['box'], 'loss': llama2_pbox[video_key]['loss'], 'llm': 'llama2'}
151
+ else:
152
+ mixlm_pbox[video_key] = {'box': puyu_pbox[video_key]['box'], 'loss': puyu_pbox[video_key]['loss'], 'llm': 'puyu'}
153
+ with open(saved_path, 'w') as f:
154
+ json.dump(mixlm_pbox, f)
155
+
156
+ with open(saved_path, 'r') as f:
157
+ mixlm_pbox = json.load(f)
158
+ with open('data/howto/captiondata/howto100m_train_puyu.json', 'r') as f:
159
+ meta_puyu = json.load(f)
160
+ with open('data/howto/captiondata/howto100m_train.json', 'r') as f:
161
+ meta_llama2 = json.load(f)
162
+
163
+ meta_mixlm = {}
164
+ for video_key in mixlm_pbox.keys():
165
+ if mixlm_pbox.get(video_key) is not None and (meta_llama2.get(video_key) is not None or meta_puyu.get(video_key) is not None):
166
+ if mixlm_pbox[video_key]['llm'] == 'llama2':
167
+ meta_mixlm[video_key] = meta_llama2[video_key]
168
+ llama2_feature_path = os.path.join(text_feature_folder_llama2, video_key + '.npy')
169
+ if not os.path.exists(llama2_feature_path):
170
+ continue
171
+ # if os.path.exists(llama2_feature_path):
172
+ # os.unlink(llama2_feature_path)
173
+ # if not os.path.exists(llama2_feature_path):
174
+ # os.symlink(llama2_feature_path, os.path.join(text_feature_folder_mixlm, video_key + '.npy'))
175
+ soft_link_path = os.path.join(text_feature_folder_mixlm, video_key + '.npy')
176
+ # if os.path.exists(soft_link_path):
177
+ # os.unlink(soft_link_path)
178
+ if not os.path.exists(soft_link_path):
179
+ # print(os.path.exists(soft_link_path), os.path.exists(llama2_feature_path))
180
+ os.symlink(llama2_feature_path, soft_link_path)
181
+ # text_feature = np.load(llama2_feature_path)
182
+ # if text_feature.shape[0] != len(meta_llama2[video_key]['sentences']):
183
+ # print(f"{video_key} has {text_feature.shape[0]} sentences, but {len(meta_llama2[video_key]['sentences'])} sentences found in meta file")
184
+ else:
185
+ meta_mixlm[video_key] = meta_puyu[video_key]
186
+ puyu_feature_path = os.path.join(text_feature_folder_puyu, video_key + '.npy')
187
+ if not os.path.exists(puyu_feature_path):
188
+ continue
189
+
190
+ soft_link_path = os.path.join(text_feature_folder_mixlm, video_key + '.npy')
191
+
192
+ # if os.path.exists(soft_link_path):
193
+ # os.unlink(soft_link_path)
194
+ if not os.path.exists(soft_link_path):
195
+ os.symlink(puyu_feature_path, soft_link_path)
196
+ # text_feature = np.load(puyu_feature_path)
197
+ # if text_feature.shape[0] != len(meta_puyu[video_key]['sentences']):
198
+ # print(f"{video_key} has {text_feature.shape[0]} sentences, but {len(meta_puyu[video_key]['sentences'])} sentences found in meta file")
199
+ with open(os.path.join(save_folder, 'train_caption_mixlm.json'), 'w') as f:
200
+ json.dump(meta_mixlm, f)
201
+ opt.train_caption_file[0] = os.path.join(save_folder, 'train_caption_mixlm.json')
202
+ opt.text_feature_folder[0] = text_feature_folder_mixlm
203
+ # pass
204
+
205
+
206
  if not opt.start_from:
207
+ backup_envir(save_folder, opt)
208
  logger.info('backup evironment completed !')
209
 
210
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
 
225
  if prev_opt.get(opt_name) != vars(opt).get(opt_name):
226
  logger.info('Change opt {} : {} --> {}'.format(opt_name, prev_opt.get(opt_name),
227
  vars(opt).get(opt_name)))
228
+ print(opt.text_feature_folder)
229
+ print(opt.train_caption_file)
230
  if len(opt.visual_feature_folder) == 2:
231
  train_dataset_1 = PropSeqDataset(opt.train_caption_file[0],
232
  [opt.visual_feature_folder[0]],
 
286
  model.train()
287
 
288
  # try to load saved pbox
 
289
  if os.path.exists(saved_path):
290
  try:
291
  with open(saved_path, 'r') as f:
 
412
  # if dt['video_key'][0] != 'LGArj9Do0xc':
413
  # continue
414
  # # for fast debugging
415
+ if opt.test:
416
+ if trained_samples > 5:
417
+ break
418
+ else:
419
+ trained_samples += 1
420
  # if trained_samples < 1714:
421
  # trained_samples += 1
422
  # continue
 
577
 
578
  epoch += 1
579
 
580
+ if epoch == 1 and model.pseudo_boxes is not None and 'mixlm' not in opt.train_caption_file[0]:
581
  # save the pseudo boxes
582
  pbox_save_path = construct_save_path(opt)
583
  if not os.path.exists(pbox_save_path):
backup/train_fewshot.py CHANGED
@@ -68,7 +68,7 @@ def train(opt):
68
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
69
 
70
  if not opt.start_from:
71
- backup_envir(save_folder)
72
  logger.info('backup evironment completed !')
73
 
74
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
 
68
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
69
 
70
  if not opt.start_from:
71
+ backup_envir(save_folder, opt)
72
  logger.info('backup evironment completed !')
73
 
74
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
backup/train_ft2_gt.py CHANGED
@@ -147,7 +147,7 @@ def train(opt):
147
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
148
 
149
  if not opt.start_from:
150
- backup_envir(save_folder)
151
  logger.info('backup evironment completed !')
152
 
153
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
@@ -190,16 +190,17 @@ def train(opt):
190
  # train_dataset.translator = train_dataset_1.translator
191
 
192
  else:
193
- print('the script only support two dataset for pretrain and target task respectively')
194
- exit(1)
195
  train_dataset_target = PropSeqDataset(opt.train_caption_file,
196
  opt.visual_feature_folder,
197
  opt.text_feature_folder,
198
  opt.dict_file, True, 'gt',
199
  opt)
200
- train_loader_target = DataLoader(train_dataset_target, batch_size=opt.batch_size,
 
201
  shuffle=True, num_workers=opt.nthreads, collate_fn=collate_fn, worker_init_fn=_init_fn)
202
- train_dataloaders = [train_loader_target]
203
 
204
  # val_dataset = PropSeqDataset(opt.val_caption_file,
205
  # opt.visual_feature_folder,
 
147
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
148
 
149
  if not opt.start_from:
150
+ backup_envir(save_folder, opt)
151
  logger.info('backup evironment completed !')
152
 
153
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
 
190
  # train_dataset.translator = train_dataset_1.translator
191
 
192
  else:
193
+ # print('the script only support two dataset for pretrain and target task respectively')
194
+ # exit(1)
195
  train_dataset_target = PropSeqDataset(opt.train_caption_file,
196
  opt.visual_feature_folder,
197
  opt.text_feature_folder,
198
  opt.dict_file, True, 'gt',
199
  opt)
200
+ subset_data = PercentageSubsetDataset(train_dataset_target, opt.ft_gt_percent)
201
+ train_loader_target = DataLoader(subset_data, batch_size=opt.batch_size,
202
  shuffle=True, num_workers=opt.nthreads, collate_fn=collate_fn, worker_init_fn=_init_fn)
203
+ # train_dataloaders = [train_loader_target]
204
 
205
  # val_dataset = PropSeqDataset(opt.val_caption_file,
206
  # opt.visual_feature_folder,
backup/train_pre_ft_gt.py CHANGED
@@ -45,7 +45,7 @@ import copy
45
  a100_folder = ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/Tasty/features', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Tasty/UniVL_feature', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features']
46
  r3090_folder = ['/mnt/data/Gvlab/wuhao/features/yc2', '/mnt/data/Gvlab/wuhao/features/tasty', '/mnt/data/Gvlab/wuhao/features/tasty/univl', '/mnt/data/Gvlab/wuhao/features/anet', '/mnt/data/Gvlab/wuhao/features/howto100m']
47
 
48
- pretrain_data_mode = 'single' # 'mix' or 'seq' or 'single'
49
 
50
  # /cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features -> /mnt/data/Gvlab/wuhao/features/howto100m
51
  # /cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip -> /mnt/data/Gvlab/wuhao/features/howto100m/clip_features
@@ -122,7 +122,7 @@ def train(opt):
122
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
123
 
124
  if not opt.start_from:
125
- backup_envir(save_folder)
126
  logger.info('backup evironment completed !')
127
 
128
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}
 
45
  a100_folder = ['/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/youcook2', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/Tasty/features', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Tasty/UniVL_feature', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/huabin/dataset/Anet', '/cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features']
46
  r3090_folder = ['/mnt/data/Gvlab/wuhao/features/yc2', '/mnt/data/Gvlab/wuhao/features/tasty', '/mnt/data/Gvlab/wuhao/features/tasty/univl', '/mnt/data/Gvlab/wuhao/features/anet', '/mnt/data/Gvlab/wuhao/features/howto100m']
47
 
48
+ pretrain_data_mode = 'mix' # 'mix' or 'seq' or 'single'
49
 
50
  # /cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features -> /mnt/data/Gvlab/wuhao/features/howto100m
51
  # /cpfs01/shared/Gvlab-A100/Gvlab-A100_hdd/wuhao/howto100m/features/clip -> /mnt/data/Gvlab/wuhao/features/howto100m/clip_features
 
122
  tf_writer = SummaryWriter(os.path.join(save_folder, 'tf_summary'))
123
 
124
  if not opt.start_from:
125
+ backup_envir(save_folder, opt)
126
  logger.info('backup evironment completed !')
127
 
128
  saved_info = {'best': {}, 'last': {}, 'history': {}, 'eval_history': {}}