File size: 6,560 Bytes
bfceca3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e775a3
bfceca3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e775a3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# Dataset settings for the `libero_cosmos_policy` dataset (backend: robomimic).
data:
  name: libero_cosmos_policy
  type: mg
  backend: robomimic
  paths: []
  task_suite_name: null
  # One observation key (`image`), read from the source key `agentview_rgb_jpeg`.
  observations_keys: [image]
  observation_source_keys:
    image: agentview_rgb_jpeg
  action_dim: 7
  frame_stack: 1
  horizon: 1
  temporal_index_mode: clip
  use_proprio: false
  proprio_source: robot_states
  # Key name says channel-height-width order: 3 x 224 x 224.
  image_chw: [3, 224, 224]
  image_value_range: zero_to_one
  image_transport_dtype: uint8
  duration_focus: null
  # Remote dataset repository (presumably the Hugging Face Hub, given repo_id/repo_type).
  hf:
    repo_id: nvidia/LIBERO-Cosmos-Policy
    repo_type: dataset
    # Quoted so the `*` glob characters are always read as literal text.
    allow_patterns: ['success_only/*_regen/*.hdf5']
    local_files_only: false
  # Same horizon (20) and target key as `probes.sequence` elsewhere in this file.
  action_sequence_targets:
    enabled: true
    horizons: [20]
    target_key: gt_action_seq_max
# Training settings: top-level values, stage switches, per-stage blocks, checkpointing.
# NOTE(review): the top-level values look like defaults that the stage1/stage2/stage3
# blocks override - confirm against the config loader.
train:
  # Each stage block below repeats `lr` and `weight_decay`; only this one names the type.
  optimizer:
    type: adamw
    lr: 0.0001
    weight_decay: 1.0e-06
  batch_size: 256
  num_workers: 4
  prefetch_factor: null
  log_interval: 100
  total_steps: 5000
  eval_interval: 500
  use_aug: true
  aug_mode: default
  use_amp: true
  device_transfer_non_blocking: true
  seed: 42
  # Cosine schedule; warmup_steps (500) is 10% of the top-level total_steps (5000).
  scheduler:
    type: cosine
    warmup_steps: 500
    num_cycles: 0.5
    min_lr_scale: 0.0
  # Disabled here; the remaining keys only matter if `enabled` is switched to true.
  decoder_dataloader:
    enabled: false
    batch_size: 64
    num_workers: 4
    prefetch_factor: null
    shuffle: true
  # Stage switches: only stage1 is true in this config.
  stages:
    stage1: true
    stage2: false
    stage3: false
  # Differs from the top-level values in num_workers (16 vs 4), prefetch_factor
  # (2 vs null) and total_steps (20000 vs 5000).
  stage1:
    batch_size: 256
    num_workers: 16
    prefetch_factor: 2
    log_interval: 100
    total_steps: 20000
    eval_interval: 500
    optimizer:
      lr: 0.0001
      weight_decay: 1.0e-06
  # Same numeric values as the top level, plus `latent_target` and `action_probe`.
  stage2:
    batch_size: 256
    num_workers: 4
    prefetch_factor: null
    log_interval: 100
    total_steps: 5000
    eval_interval: 500
    latent_target: auto
    optimizer:
      lr: 0.0001
      weight_decay: 1.0e-06
    action_probe:
      enabled: true
  # Same numeric values as the top level, plus final-eval and trainable-scope keys.
  stage3:
    batch_size: 256
    num_workers: 4
    prefetch_factor: null
    log_interval: 100
    total_steps: 5000
    eval_interval: 500
    final_eval_num_episodes: 100
    log_eval_video: false
    optimizer:
      lr: 0.0001
      weight_decay: 1.0e-06
    trainable_scope: all
    non_decoder_lr_scale: 1.0
  # Checkpoint saving and per-stage loading.
  checkpoint:
    enabled: true
    base_dir: null
    save_interval: 1000
    save_last: true
    latest_only: true
    # stage1 uses mode `none`; stage2 and stage3 use mode `handoff`. All paths are null.
    load:
      stage1:
        mode: none
        path: null
      stage2:
        mode: handoff
        path: null
        teacher_path: null
      stage3:
        mode: handoff
        path: null
# Model settings (`type: dino_lam`): idm, fdm, optional hyperbolic latent, image encoder.
model:
  # idm and fdm share action_dim, token_dim, model_dim, num_action_tokens, num_blocks,
  # num_heads and dropout; only idm sets `latent_dim`.
  idm:
    type: token_idm
    action_dim: 128
    token_dim: 768
    model_dim: 512
    latent_dim: 32
    num_action_tokens: 4
    num_blocks: 4
    num_heads: 8
    dropout: 0.0
  fdm:
    type: token_fdm_duration
    action_dim: 128
    token_dim: 768
    model_dim: 512
    num_action_tokens: 4
    num_blocks: 4
    num_heads: 8
    dropout: 0.0
  # Disabled here; the remaining keys only matter if `enabled` is switched to true.
  hyperbolic_latent:
    enabled: false
    backend: geoopt
    manifold: poincare
    curvature: 1.0
    learnable_curvature: false
    fdm_input_mode: logmap0
    prelift_mode: none
    prelift_scale: 1.0
    lift_max_norm: 5.0
    tangent_max_norm: 5.0
    eps: 1.0e-06
  # Image encoder; output_dim (768) equals the idm/fdm token_dim above.
  encoders:
    type: dino
    image_key: image
    model_id: facebook/dinov2-base
    input_value_range: zero_to_one
    freeze_backbone: true
    drop_cls_token: true
    output_dim: 768
    # Per-channel normalization statistics (three values each).
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  type: dino_lam
  # Both set to `future`, matching the same two keys under `objective`.
  idm_input: future
  fdm_target: future
  pixel_decoders: null
  latent_action_decoders: null
# Training-objective settings.
objective:
  # Both set to `future`, the same values as `model.fdm_target` / `model.idm_input`.
  fdm_target: future
  idm_input: future
  stage_overrides: {}
  # Multiscale is enabled; its `consistency` sub-block is disabled.
  multiscale:
    enabled: true
    consistency:
      enabled: false
      weight: 10.0
      num_pairs: 4
      sample_source: all_horizons
      teacher_mode: direct_teacher
    prediction_mode: direct_duration
    allow_plain_fdm_oneshot: true
    weight_mode: uniform
    weights: {}
  # max_offset (20) equals the horizon used by the action-sequence targets and probes.
  temporal:
    max_offset: 20
    anchor_mode: fixed
    extra_random_count: 4
  # Disabled here. Every weight is 0.0 except radius_progress_weight (1.0).
  branch_order:
    enabled: false
    radial_weight: 0.0
    local_radial_margin_weight: 0.0
    local_radial_margin_alpha: 0.05
    branch_weight: 0.0
    z0_origin_weight: 0.0
    prefix_weight: 0.0
    radius_progress_weight: 1.0
    radius_progress_mode: offset_margin
    radius_progress_alpha: 0.02
    branch_margin_deg: 10.0
    eps: 1.0e-06
  # Disabled here.
  latent_plan:
    enabled: false
    total_horizon: 20
# Evaluation settings (`type: robomimic`).
eval:
  type: robomimic
  data_path: []
  reset_mode: env_reset
  num_eval_episodes: 20
  max_steps: 500
  record_video: true
  # No checkpoint path is set in this file.
  checkpoint_path: null
  checkpoint_strict: true
  # NOTE(review): with this set to true, the null keys below presumably fall back to
  # the checkpoint's own config - confirm against the evaluation code.
  use_checkpoint_cfg: true
  video_output_path: null
  obs_keys: null
  use_proprio: null
  use_object: false
  resize_hw: null
  image_value_range: null
  # Evaluation seed is 0, whereas `train.seed` is 42.
  seed: 0
# Agent settings (`type: latent_action`): observation encoders and policy heads.
agent:
  # Encoder group (`type: group`) with an image encoder and a proprio encoder.
  encoders:
    image:
      in_channels: 3
      output_dim: 512
      output_mode: global
      type: resnet18
      pretrained: false
    proprio:
      input_dim: 9
      hidden_dim: 128
      output_dim: 64
    type: group
    # NOTE(review): these names differ from `data.observations_keys` (`image`);
    # confirm which key set the agent actually reads.
    modalities: [agentview_image, robot0_eye_in_hand_image]
    proj_dim: 128
  policies:
    decoder:
      type: mlp
      hidden_dims: [256, 256]
    # This `hidden_dims` sits on `policies` itself, separate from `decoder.hidden_dims`.
    hidden_dims: [512, 512, 256, 64]
    action_dim: 256
    emb_dim: 384
    # Same value (7) as `data.action_dim`.
    gt_action_dim: 7
  type: latent_action
# Probe settings: global switches, sequence-target settings, then one entry per probe.
probes:
  enabled: true
  every: 10
  steps_per_call: 1
  # Same horizon (20) and target key as `data.action_sequence_targets`.
  sequence:
    enabled: true
    horizons: [20]
    target_key: gt_action_seq_max
  # Each entry is a regression probe on input `z_t` with mse loss, lr 0.001 and an
  # MLP with hidden sizes 128 and 64. The two state probes are disabled; the two
  # action probes are enabled.
  list:
    z_to_s_t:
      name: z_to_s_t
      enabled: false
      shuffle: true
      type: regression
      input: z_t
      target: s_t
      loss: mse
      lr: 0.001
      every: 10
      mlp:
        hidden_dims: [128, 64]
    z_to_s_tp:
      name: z_to_s_tp
      enabled: false
      shuffle: true
      type: regression
      input: z_t
      target: s_tp
      loss: mse
      lr: 0.001
      every: 10
      mlp:
        hidden_dims: [128, 64]
    # The only probe with `every: 1`; the other three use 10.
    z_to_action_t:
      name: z_to_action_t
      probe_type: z_to_action
      enabled: true
      shuffle: true
      type: regression
      input: z_t
      target: gt_action
      loss: mse
      lr: 0.001
      every: 1
      mlp:
        hidden_dims: [128, 64]
    z_to_action_seq_h20:
      name: z_to_action_seq_h20
      probe_type: z_to_action_sequence
      enabled: true
      shuffle: true
      type: regression
      input: z_t
      target: gt_action_seq_max
      sequence_horizon: 20
      loss: mse
      lr: 0.001
      every: 10
      mlp:
        hidden_dims: [128, 64]
      # 140 = 20 * 7. NOTE(review): presumably sequence_horizon times the 7-dim
      # ground-truth action - confirm against the probe implementation.
      output_dim: 140
# Experiment-logging settings: project, run name and tags.
logger:
  project: latent_action
  run_name: latent_action_training
  tags: [debug]
# Relative path; what it is resolved against is not shown in this file.
output_root: outputs/token_fdm_duration/dino_suite_all_k20_extra4