File size: 2,493 Bytes
81bdb87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Algorithm section: names the class to use and the options passed alongside it.
# NOTE(review): `_target_` looks like the Hydra/OmegaConf instantiate convention
# (dotted import path of the class to build) — confirm against the config loader.
algorithm:
  _target_: gengaze.algorithms.GRPO
  # NOTE(review): group size of 1 with std normalization off; presumably a
  # placeholder for this run, since trainer.train_gaze is false — confirm.
  group_size: 1
  std_normalize: false
  # Written as the integer 1, not 1.0.
  discount_factor: 1
  has_loss_on_eos_tokens: false
  optimize_task_loss_prediction: false
# Dataset section: video-folder dataset class plus clip sampling options.
dataset:
  _target_: gengaze.datasets.video_folder.VideoFolder
  # One plain string holding five comma-separated directories (no spaces), not a
  # YAML sequence. Presumably split on ',' by the dataset class — verify.
  # NOTE(review): these are absolute paths under one user's home directory, so
  # the config is machine-specific as written.
  root: /home/baifengs/baifengs/data/gengaze/100DoH_res448_250K,/home/baifengs/baifengs/data/gengaze/Ego4D_res448_250K,/home/baifengs/baifengs/data/gengaze/InternVid_res448_250K,/home/baifengs/baifengs/data/gengaze/scanning_SAM_res448_50K,/home/baifengs/baifengs/data/gengaze/scanning_idl_res448_50K
  # NOTE(review): presumably the number of frames per clip — confirm.
  clip_len: 16
  frame_sample_rate: 1
  # Both splits are explicitly null, i.e. no ground-truth gazing-position paths
  # are supplied in this config.
  gt_gazing_pos_paths:
    train: null
    val: null
  random_sample_frame: false
# Model section: random-gaze model class and its sampling configuration.
model:
  _target_: gengaze.models.video_random_gaze.VideoRandomGaze
  # `sample_strategy` names one of the sibling sub-blocks below (`fixed`,
  # `uniform`, `exponential`). Presumably only the named one is read and the
  # others are retained defaults — verify against the model code.
  gazing_ratio_config:
    sample_strategy: exponential
    fixed:
      gazing_ratio: 0.5
    uniform:
      gazing_ratio_min: 0
      gazing_ratio_max: 1
    exponential:
      gazing_ratio_min: 0.02
      gazing_ratio_max: 0.15
      # NOTE(review): presumably the rate parameter of the exponential
      # distribution — confirm.
      lambda: 10
  gazing_ratio_each_frame_config:
    sample_strategy: dirichlet
    dirichlet:
      # Comma-separated plain string (not a YAML list); the first entry (10) is
      # larger than the remaining entries (3).
      # NOTE(review): presumably one concentration value per frame — check the
      # entry count against dataset.clip_len.
      alpha: 10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
  gazing_prob_each_scale_config:
    sample_strategy: dirichlet
    dirichlet:
      # A single scalar here, unlike the per-entry string used above.
      alpha: 0.5
  # '+'-separated plain string; same value as `task.scales` in this file.
  scales: 32+64+112+224
  # NOTE(review): 265 = 4 + 16 + 49 + 196, which is what the four scales above
  # would give with 16-pixel patches (2^2 + 4^2 + 7^2 + 14^2) — confirm that is
  # the intended derivation before changing `scales`.
  num_vision_tokens_each_frame: 265
  # NOTE(review): named `frame_sampling_rate` here but `frame_sample_rate` under
  # `dataset`; both are 1. Presumably they should stay in sync — confirm.
  frame_sampling_rate: 1
# Task section: video MAE reconstruction task class and its model/loss options.
task:
  _target_: gengaze.tasks.video_mae_reconstruction.VideoMAEReconstruction
  # NOTE(review): looks like a Hugging Face Hub model identifier — confirm.
  recon_model: facebook/vit-mae-large
  recon_sample_rate: 0.125
  attn_mode: flash_attention_2
  recon_model_config:
    scale_embed: true
    max_num_frames: 256
    time_embed: true
    causal: true
    # '+'-separated plain strings: three loss names and three weights.
    # NOTE(review): presumably matched by position (l1 -> 1, dinov2_reg -> 0.3,
    # siglip2 -> 0.3) — verify against the task code.
    loss_type: l1+dinov2_reg+siglip2
    loss_weights: 1+0.3+0.3
    # Explicit null: no extra options given for the l1 term.
    l1_loss_config: null
    dinov2_reg_loss_config:
      model: facebook/dinov2-with-registers-base
    siglip2_loss_config:
      model: google/siglip2-base-patch16-224
  # Same '+'-separated value as `model.scales` in this file.
  scales: 32+64+112+224
# Trainer section: optimization, logging/checkpoint naming, and schedule options.
trainer:
  _target_: gengaze.trainer.Trainer
  # NOTE(review): 512 overall vs. at most 8 per GPU; presumably the trainer
  # bridges the gap with multiple GPUs and/or gradient accumulation — confirm.
  batch_size: 512
  per_gpu_max_batch_size: 8
  lr: 0.0002
  min_lr: 1.0e-05
  lr_schedule: linear_w_warmup
  optimizer: adam
  # In this config only the task flag is enabled; gaze training and `train_w_ntp`
  # are both off.
  train_gaze: false
  train_task: true
  train_w_ntp: false
  val_nsteps: 2000
  n_epochs: 50
  logdir: exps/
  # Prefix and suffix are explicit empty strings.
  exp_name_prefix: ''
  # Kept quoted so it loads as a string; unquoted, a digits-and-underscore value
  # may be read as an integer by YAML 1.1 loaders.
  # NOTE(review): looks like a YYMMDD_HHMM timestamp — confirm.
  exp_name: '250819_1751'
  exp_name_suffix: ''
  resume: auto
  # Explicit nulls: no gaze/task weights are specified here.
  gaze_weights: null
  task_weights: null
  seed: 666
  val_only: false
  # `mode` names one of the sibling sub-blocks (`exp`, `neg_cosine`). Presumably
  # only the named one is read — verify against the trainer code.
  temp_schedule_args:
    mode: exp
    exp:
      temp_start: 10000.0
      temp_end: 1.0
    neg_cosine:
      temp_min: 1.0
      temp_max: 10000.0
      num_period: 1
  val_args:
    # NOTE(review): integer 0 — unclear from this file whether it is a count or
    # a disabled flag; confirm against the trainer code.
    sample_gaze_for_reconstruction_oracle: 0