{
  "vla_name": "VITRA_Paligemma",
  "task_name": "pretrain",
  "model": "vitra_paligemma2",
  "fwd_pred_next_n": 16,
  "seed": 42,
  "batch_size": 64,
  "output_root": "/data/vla_checkpoint/vitra_vla_3b/checkpoints",
  "log_root": "/data/vla_checkpoint/vitra_vla_3b/logs",
  "cache_root": "/data/vla_checkpoint/vitra_vla_3b/cache/vitra_paligemma2",
  "model_load_path": null,
  "resume": true,
  "wandb_project": "vitra_paligemma2_humanpretrain",
  "wandb_entity": "",
  "save_steps": 5000,
  "total_batch_size": 512,
  "use_bf16": true,
  "use_fov": true,
  "untied_cognition_token": true,
  "use_state": "DiT",
  "loss_type": "human",
  "train_setup": {
    "freeze_option": "freeze_vision_encoder"
  },
  "state_encoder": {
    "state_dim": 212
  },
  "action_model": {
    "model_type": "DiT-B",
    "token_size": 2304,
    "action_dim": 192,
    "hidden_size": 1024
  },
  "vlm": {
    "type": "PaliGemmaForConditionalGeneration",
    "name": "paligemma",
    "pretrained_model_name_or_path": "google/paligemma2-3b-mix-224"
  },
  "trainer": {
    "sharding_strategy": "shard-grad-op",
    "strategy": "fsdp_paligemma_with_checkpointing",
    "lr_scheduler_type": "backbone-freeze-warmup",
    "gradient_clip_val": 1.0,
    "learning_rate": 1e-05,
    "weight_decay": 0.1,
    "max_epochs": 100000,
    "max_steps": 2000000,
    "reduce_in_full_precision": true,
    "enable_mixed_precision_training": false,
    "enable_gradient_checkpointing": true,
    "action_model_learning_rate": 0.0001,
    "llm_freeze_step": 5000,
    "warmup_ratio": null
  },
  "train_dataset": {
    "data_root_dir": "/data/VITRA_1M",
    "augmentation": true,
    "set_none_ratio": 0.0,
    "data_mix": "magic_mix",
    "num_workers": 18,
    "prefetch_factor": null,
    "flip_augmentation": 1.0,
    "action_type": "angle",
    "use_rel": false,
    "clip_len": 2000,
    "normalization": true,
    "state_mask_prob": 0.1
  },
  "repeated_diffusion_steps": 8,
  "config": "vitra/configs/human_pretrain.json",
  "data_mix": null,
  "debug": false,
  "num_workers": null,
  "prefetch_factor": null
}