File size: 2,606 Bytes
9cea7bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Top-level run settings.
# run_id and run_root_dir joined with "/" give the same path as output_dir at
# the end of this file (checkpoints/SimplerEnv).
run_id: SimplerEnv
run_root_dir: checkpoints
# Random seed value; presumably applied to all RNGs by the trainer - confirm.
seed: 42
# List of enabled trackers; only "json" is enabled here.
trackers:
- json
is_debug: false
# Model definition: a Qwen-VL backbone, an action model and a V-JEPA2-based
# sub-model, grouped under the framework name below.
framework:
  name: VLA_JEPA
  # Vision-language backbone, loaded from a local checkpoint directory.
  qwenvl:
    base_vlm: /home/dataset-local/models/Qwen3-VL-2B-Instruct
    attn_implementation: flash_attention_2
    # Same value as action_model.diffusion_model_cfg.cross_attention_dim.
    vl_hidden_dim: 2048
  # Action model settings.
  action_model:
    action_model_type: DiT-B
    # action_hidden_dim, hidden_size and diffusion_model_cfg.output_dim all
    # carry the same value (1024); presumably they must stay in sync - confirm.
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 8
    # NOTE(review): action_horizon (7) equals past_action_window_size (0) +
    # future_action_window_size (6) + 1, and also matches
    # datasets.vla_data.action_horizon; confirm the consumer relies on this.
    future_action_window_size: 6
    action_horizon: 7
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    # Noise / timestep sampling parameters (semantics defined by the consumer).
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    # Same value as vj2_model.num_embodied_action_tokens_per_instruction (32).
    num_target_vision_tokens: 32
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
  # V-JEPA2-based sub-model, with its encoder loaded from a local checkpoint
  # directory.
  vj2_model:
    base_encoder: /home/dataset-local/models/vjepa2-vitl-fpc64-256
    depth: 12
    num_heads: 8
    # Token strings are quoted because they contain YAML indicator characters
    # ("|", "{", "}"); the loaded string values are unchanged.
    # The "{}" looks like a format placeholder - presumably filled with an
    # index by the consumer; confirm.
    special_action_token: '<|action_{}|>'
    num_action_tokens_per_timestep: 8
    embodied_action_token: '<|embodied_action|>'
    num_embodied_action_tokens_per_instruction: 32
    num_frames: 8
  reduce_in_full_precision: true
# Training data settings.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    data_root_dir: /home/dataset-local/datasets/LeRobot/OXE_LEROBOT_DATASET
    data_mix: bridge_rt_1
    action_type: delta_ee
    # Prompt template with {instruction}, {actions} and {e_actions}
    # placeholders. Written as a folded block scalar with the final newline
    # stripped, so it loads as one single-line string.
    CoT_prompt: >-
      Your task is {instruction}. Infer the temporal dynamics from frames
      {actions} and produce the corresponding policy actions {e_actions}.
    # Two separate resolution settings; presumably one for still images and one
    # for video frames - confirm against the data loader.
    resolution_size: 224
    video_resolution_size: 256
    per_device_batch_size: 32
    load_all_data_for_training: true
    # Same value as framework.action_model.action_horizon (7).
    action_horizon: 7
    # NOTE(review): state input is disabled here although
    # framework.action_model.state_dim is set (8); confirm this is intended.
    with_state: false
# Optimisation and training-loop settings.
trainer:
  # NOTE(review): both an epoch count and a step budget are set; which one
  # ends training is decided by the consumer - confirm.
  epochs: 100
  max_train_steps: 30000
  # NOTE(review): num_warmup_steps (5000) and warmup_ratio (0.1, set further
  # down) disagree: 0.1 * max_train_steps = 3000. Confirm which one is used.
  num_warmup_steps: 5000
  save_interval: 10000
  eval_interval: 100
  # Learning rates: a base value plus named overrides. The override keys
  # presumably correspond to model sub-modules - verify against the trainer.
  learning_rate:
    base: 3.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
    vj_predictor: 0.0005
  lr_scheduler_type: cosine_with_min_lr
  # NOTE(review): min_lr equals learning_rate.qwen_vl_interface (1.0e-05);
  # confirm how the scheduler applies this floor to each parameter group.
  scheduler_specific_kwargs:
    min_lr: 1.0e-05
  # Empty string; presumably means that no modules are frozen - confirm.
  freeze_modules: ''
  loss_scale:
    vla: 1.0
    vlm: 0.1
  # NOTE(review): max_grad_norm and gradient_clipping (below) are both 1.0 and
  # appear to overlap; confirm which key the trainer reads.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # NOTE(review): this weight_decay (0.0) differs from optimizer.weight_decay
  # (1.0e-08) below; confirm which one takes effect.
  weight_decay: 0.0
  logging_frequency: 10
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  # Resume is disabled; resume_epoch and resume_step are left unset (null).
  is_resume: false
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
# Matches run_root_dir and run_id from the top of this file joined with "/".
output_dir: checkpoints/SimplerEnv