Wr3ck1Am committed on
Commit
ba3fb02
·
verified ·
1 Parent(s): f2dc91e

Add files using upload-large-folder tool

Browse files
README.md CHANGED
@@ -54,13 +54,13 @@ restructure the loop around an outer epoch loop.
54
  ## Files
55
 
56
  ```
57
- config.json — full run config (resolved CLI + defaults)
58
  config.yaml — same, yaml flavor
59
  dataset_statistics.json — action mean/std over training split (REQUIRED for inference unnorm)
60
  run-metrics.jsonl — early run metadata
61
  memoryvla_realpushmultit_lora_bs64_v1.jsonl — per-step train metrics
62
  checkpoints/
63
- step-001000-epoch-00-loss=0.1240.pt — 32 GB, FSDP full-shard consolidated, "only_trainable" save (LoRA + modules_to_save)
64
  step-002000-epoch-01-loss=0.0893.pt
65
  step-003000-epoch-01-loss=0.0755.pt
66
  step-004000-epoch-02-loss=0.0635.pt
@@ -68,10 +68,41 @@ checkpoints/
68
  step-006000-epoch-03-loss=0.0703.pt
69
  ```
70
 
71
- Each ckpt is ~32 GB; load via the MemoryVLA `load_vla(...)` path with the
72
- same prismatic base config and apply the same LoRA wrap before loading state
73
- dict. See `train_memoryvla_realpushmultit.py` for the resume code path
74
- (`--is_resume True --resume_step <step> --resume_epoch <epoch>`).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  ## Reproduce
77
 
 
54
  ## Files
55
 
56
  ```
57
+ config.json — full run config (resolved CLI + defaults; base_vlm fixed to prism-dinosiglip-224px+7b)
58
  config.yaml — same, yaml flavor
59
  dataset_statistics.json — action mean/std over training split (REQUIRED for inference unnorm)
60
  run-metrics.jsonl — early run metadata
61
  memoryvla_realpushmultit_lora_bs64_v1.jsonl — per-step train metrics
62
  checkpoints/
63
+ step-001000-epoch-00-loss=0.1240.pt — 32 GB, merged: LoRA deltas folded into base weights, flat state_dict keys
64
  step-002000-epoch-01-loss=0.0893.pt
65
  step-003000-epoch-01-loss=0.0755.pt
66
  step-004000-epoch-02-loss=0.0635.pt
 
68
  step-006000-epoch-03-loss=0.0703.pt
69
  ```
70
 
71
+ ## Loading
72
+
73
+ Each ckpt has been **merged**: LoRA adapter weights (PEFT LLaMA + SigLIP,
74
+ our LoRALinear on DiT-L qkv / CogMem cross / GateFusion, custom MHA-LoRA on
75
+ DiT per_attn) are folded into the corresponding base weights with the
76
+ scaling factor `α/r` applied, then the wrap keys (`base_layer.weight`,
77
+ `lora_A`, `lora_B`, `base_model.model.` prefix) are dropped. The resulting
78
+ state-dict matches a fresh, non-LoRA-wrapped MemoryVLA model 1-for-1, so
79
+ `load_vla(...)` loads cleanly with `strict=True` and rollout / inference
80
+ needs no extra code:
81
+
82
+ ```python
83
+ import sys, pathlib, torch
84
+ sys.path.insert(0, str(pathlib.Path("third_party/MemoryVLA").resolve()))
85
+ from vla import load_vla
86
+
87
+ vla = load_vla(
88
+ "checkpoints/step-006000-epoch-03-loss=0.0703.pt",
89
+ load_for_training=False,
90
+ action_model_type="DiT-L",
91
+ future_action_window_size=15,
92
+ past_action_window_size=0, action_dim=7,
93
+ mem_length=16, retrieval_layers=2, per_token_size=256,
94
+ fusion_type="gate", consolidate_type="tome",
95
+ ).to("cuda").to(torch.bfloat16).eval()
96
+ ```
97
+
98
+ To **resume training** from one of these, set `--is_resume True
99
+ --resume_step <step> --resume_epoch <epoch>`. `apply_memoryvla_lora` then
100
+ wraps the model again with fresh (zero-initialised) adapters; the merged
101
+ base carries the prior training's knowledge and new LoRA learns on top.
102
+
103
+ The original unmerged ckpts are not preserved (the merge is exact and
104
+ losslessly invertible only with the matching adapter shapes — see
105
+ `scripts/merge_lora_ckpt.py` for the merge logic).
106
 
107
  ## Reproduce
108
 
checkpoints/step-001000-epoch-00-loss=0.1240.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0b99d7d0d95961373f5a9b6ee59cbf183583f4aa57c9ecdd9e6f7ed8e5bb978
3
- size 33553162569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfed6137eb16709deb3f8196b7d4cbb69eb1c26cb1636afbc845c73f71fe8ecd
3
+ size 33507489313
checkpoints/step-002000-epoch-01-loss=0.0893.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b77029d5625dd6c30e6373c37e79cf022d7bc10a9412c421f694ec2ecd6e465
3
- size 33553162569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23abc7e40600f004663d3c7da5f1f9be007fde2715bb8407362cfd3aa9808420
3
+ size 33507489313
checkpoints/step-003000-epoch-01-loss=0.0755.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d617e5730c5c5328edb3b2039ea6186e377a9d65aa913c7629fa76f4678e1e2
3
- size 33553162569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cbfd2777ccf3a1054c8d56d6386a64d650fd99326534ccfcecd8cc49883a2ac
3
+ size 33507489313
checkpoints/step-004000-epoch-02-loss=0.0635.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be42f8d4c70f14a4418b165d742d40616eac714d2e54b617a0874ebd83ac5f43
3
- size 33553162569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14c7a257c63e5345798d8a73092308723937a604070bf7f70ba25f655baec773
3
+ size 33507489313
checkpoints/step-005000-epoch-03-loss=0.0768.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a20ec24b09da7be97254f5f1c52c09e60955b86d2de0620b555da73006c68c
3
- size 33553162569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa7c773f90cade39d6b781fbed3d07fd21787fccb787c5302046a08227bb2f36
3
+ size 33507489313
checkpoints/step-006000-epoch-03-loss=0.0703.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce5e856e6303806a1f2fb69eba90200daa6df3313e4badb6a4df4357f09c93a0
3
- size 33553162569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4c8429d76d04c64e27aa7be7e42134763b2b446ec0ca1afc93e0700b3782705
3
+ size 33507489313
config.json CHANGED
@@ -11,7 +11,7 @@
11
  "image_aug": false,
12
  "image_key": "img_third",
13
  "instruction": "push the T-shaped block to visit each T-shaped target region on the table without revisiting any",
14
- "is_resume": false,
15
  "lora": {
16
  "alpha": 48.0,
17
  "cog_cross_targets": [
@@ -43,10 +43,10 @@
43
  },
44
  "mem_length": 16,
45
  "per_token_size": 256,
46
- "pretrained_checkpoint": "/workspace/diffusion_policy/pretrained/openvla-7b-prismatic/checkpoints/step-295000-epoch-40-loss=0.2200.pt",
47
  "repeated_diffusion_steps": 4,
48
- "resume_epoch": 0,
49
- "resume_step": 0,
50
  "retrieval_layers": 2,
51
  "run_id": "memoryvla_realpushmultit_lora_bs64_v1",
52
  "run_id_note": null,
@@ -62,8 +62,8 @@
62
  "use_timestep_pe": true,
63
  "val_ratio": 0.05,
64
  "vla": {
65
- "base_vlm": "siglip-224px+7b",
66
- "data_mix": "bridge",
67
  "enable_gradient_checkpointing": true,
68
  "enable_mixed_precision_training": true,
69
  "epochs": 100,
@@ -77,11 +77,11 @@
77
  "max_steps": 10000,
78
  "per_device_batch_size": 64,
79
  "reduce_in_full_precision": true,
80
- "shuffle_buffer_size": 256000,
81
  "train_strategy": "fsdp-full-shard",
82
- "type": "siglip-224px+mx-bridge",
83
  "unfreeze_last_llm_layer": false,
84
- "vla_id": "siglip-224px+mx-bridge",
85
  "warmup_ratio": 0.05,
86
  "weight_decay": 0.0
87
  },
 
11
  "image_aug": false,
12
  "image_key": "img_third",
13
  "instruction": "push the T-shaped block to visit each T-shaped target region on the table without revisiting any",
14
+ "is_resume": true,
15
  "lora": {
16
  "alpha": 48.0,
17
  "cog_cross_targets": [
 
43
  },
44
  "mem_length": 16,
45
  "per_token_size": 256,
46
+ "pretrained_checkpoint": "/workspace/diffusion_policy/runs/memoryvla_realpushmultit/memoryvla_realpushmultit_lora_bs64_v1/checkpoints/step-006000-epoch-03-loss=0.0703.pt",
47
  "repeated_diffusion_steps": 4,
48
+ "resume_epoch": 3,
49
+ "resume_step": 6000,
50
  "retrieval_layers": 2,
51
  "run_id": "memoryvla_realpushmultit_lora_bs64_v1",
52
  "run_id_note": null,
 
62
  "use_timestep_pe": true,
63
  "val_ratio": 0.05,
64
  "vla": {
65
+ "base_vlm": "prism-dinosiglip-224px+7b",
66
+ "data_mix": "oxe_magic_soup_plus_minus",
67
  "enable_gradient_checkpointing": true,
68
  "enable_mixed_precision_training": true,
69
  "epochs": 100,
 
77
  "max_steps": 10000,
78
  "per_device_batch_size": 64,
79
  "reduce_in_full_precision": true,
80
+ "shuffle_buffer_size": 250000,
81
  "train_strategy": "fsdp-full-shard",
82
+ "type": "prism-dinosiglip-224px+oxe+diffusion",
83
  "unfreeze_last_llm_layer": false,
84
+ "vla_id": "prism-dinosiglip-224px+oxe+diffusion",
85
  "warmup_ratio": 0.05,
86
  "weight_decay": 0.0
87
  },
config.json.bak ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "action_dim": 7,
3
+ "action_model_type": "DiT-L",
4
+ "consolidate_type": "tome",
5
+ "dataset_name": "realpushmultit",
6
+ "episode_instructions_file": null,
7
+ "fusion_type": "gate",
8
+ "future_action_window_size": 15,
9
+ "group_size": 16,
10
+ "hf_token": ".hf_token",
11
+ "image_aug": false,
12
+ "image_key": "img_third",
13
+ "instruction": "push the T-shaped block to visit each T-shaped target region on the table without revisiting any",
14
+ "is_resume": true,
15
+ "lora": {
16
+ "alpha": 48.0,
17
+ "cog_cross_targets": [
18
+ "q_proj",
19
+ "k_proj",
20
+ "v_proj"
21
+ ],
22
+ "dit_attn_targets": [
23
+ "q",
24
+ "v"
25
+ ],
26
+ "dropout": 0.05,
27
+ "enabled": true,
28
+ "llama_alpha": 16.0,
29
+ "llama_r": 8,
30
+ "llama_targets": [
31
+ "q_proj",
32
+ "v_proj"
33
+ ],
34
+ "lora_cog_gate": true,
35
+ "lora_llama": true,
36
+ "lora_vision": true,
37
+ "r": 24,
38
+ "vision_alpha": 16.0,
39
+ "vision_r": 8,
40
+ "vision_targets": [
41
+ "qkv"
42
+ ]
43
+ },
44
+ "mem_length": 16,
45
+ "per_token_size": 256,
46
+ "pretrained_checkpoint": "/workspace/diffusion_policy/runs/memoryvla_realpushmultit/memoryvla_realpushmultit_lora_bs64_v1/checkpoints/step-006000-epoch-03-loss=0.0703.pt",
47
+ "repeated_diffusion_steps": 4,
48
+ "resume_epoch": 3,
49
+ "resume_step": 6000,
50
+ "retrieval_layers": 2,
51
+ "run_id": "memoryvla_realpushmultit_lora_bs64_v1",
52
+ "run_id_note": null,
53
+ "run_root_dir": "runs/memoryvla_realpushmultit",
54
+ "save_interval": 1000,
55
+ "seed": 42,
56
+ "trackers": [
57
+ "jsonl",
58
+ "wandb"
59
+ ],
60
+ "update_fused": false,
61
+ "use_ema": false,
62
+ "use_timestep_pe": true,
63
+ "val_ratio": 0.05,
64
+ "vla": {
65
+ "base_vlm": "siglip-224px+7b",
66
+ "data_mix": "bridge",
67
+ "enable_gradient_checkpointing": true,
68
+ "enable_mixed_precision_training": true,
69
+ "epochs": 100,
70
+ "expected_world_size": 4,
71
+ "freeze_llm_backbone": false,
72
+ "freeze_vision_backbone": false,
73
+ "global_batch_size": 256,
74
+ "learning_rate": 0.0002,
75
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
76
+ "max_grad_norm": 1.0,
77
+ "max_steps": 10000,
78
+ "per_device_batch_size": 64,
79
+ "reduce_in_full_precision": true,
80
+ "shuffle_buffer_size": 256000,
81
+ "train_strategy": "fsdp-full-shard",
82
+ "type": "siglip-224px+mx-bridge",
83
+ "unfreeze_last_llm_layer": false,
84
+ "vla_id": "siglip-224px+mx-bridge",
85
+ "warmup_ratio": 0.05,
86
+ "weight_decay": 0.0
87
+ },
88
+ "wandb_entity": "williamcao-uc-san-diego",
89
+ "wandb_project": "memoryvla_realpushmultit_lora",
90
+ "zarr_path": "data/real_push_multit/RealPushMultiT_320.zarr"
91
+ }
config.yaml CHANGED
@@ -11,7 +11,7 @@ image_aug: false
11
  image_key: img_third
12
  instruction: push the T-shaped block to visit each T-shaped target region on the table
13
  without revisiting any
14
- is_resume: false
15
  lora:
16
  alpha: 48.0
17
  cog_cross_targets:
@@ -38,10 +38,10 @@ lora:
38
  - qkv
39
  mem_length: 16
40
  per_token_size: 256
41
- pretrained_checkpoint: /workspace/diffusion_policy/pretrained/openvla-7b-prismatic/checkpoints/step-295000-epoch-40-loss=0.2200.pt
42
  repeated_diffusion_steps: 4
43
- resume_epoch: 0
44
- resume_step: 0
45
  retrieval_layers: 2
46
  run_id: memoryvla_realpushmultit_lora_bs64_v1
47
  run_id_note: null
@@ -56,8 +56,8 @@ use_ema: false
56
  use_timestep_pe: true
57
  val_ratio: 0.05
58
  vla:
59
- base_vlm: siglip-224px+7b
60
- data_mix: bridge
61
  enable_gradient_checkpointing: true
62
  enable_mixed_precision_training: true
63
  epochs: 100
@@ -71,11 +71,11 @@ vla:
71
  max_steps: 10000
72
  per_device_batch_size: 64
73
  reduce_in_full_precision: true
74
- shuffle_buffer_size: 256000
75
  train_strategy: fsdp-full-shard
76
- type: siglip-224px+mx-bridge
77
  unfreeze_last_llm_layer: false
78
- vla_id: siglip-224px+mx-bridge
79
  warmup_ratio: 0.05
80
  weight_decay: 0.0
81
  wandb_entity: williamcao-uc-san-diego
 
11
  image_key: img_third
12
  instruction: push the T-shaped block to visit each T-shaped target region on the table
13
  without revisiting any
14
+ is_resume: true
15
  lora:
16
  alpha: 48.0
17
  cog_cross_targets:
 
38
  - qkv
39
  mem_length: 16
40
  per_token_size: 256
41
+ pretrained_checkpoint: /workspace/diffusion_policy/runs/memoryvla_realpushmultit/memoryvla_realpushmultit_lora_bs64_v1/checkpoints/step-006000-epoch-03-loss=0.0703.pt
42
  repeated_diffusion_steps: 4
43
+ resume_epoch: 3
44
+ resume_step: 6000
45
  retrieval_layers: 2
46
  run_id: memoryvla_realpushmultit_lora_bs64_v1
47
  run_id_note: null
 
56
  use_timestep_pe: true
57
  val_ratio: 0.05
58
  vla:
59
+ base_vlm: prism-dinosiglip-224px+7b
60
+ data_mix: oxe_magic_soup_plus_minus
61
  enable_gradient_checkpointing: true
62
  enable_mixed_precision_training: true
63
  epochs: 100
 
71
  max_steps: 10000
72
  per_device_batch_size: 64
73
  reduce_in_full_precision: true
74
+ shuffle_buffer_size: 250000
75
  train_strategy: fsdp-full-shard
76
+ type: prism-dinosiglip-224px+oxe+diffusion
77
  unfreeze_last_llm_layer: false
78
+ vla_id: prism-dinosiglip-224px+oxe+diffusion
79
  warmup_ratio: 0.05
80
  weight_decay: 0.0
81
  wandb_entity: williamcao-uc-san-diego
memoryvla_realpushmultit_lora_bs64_v1.jsonl CHANGED
@@ -6481,3 +6481,12 @@
6481
  {"VLA Train/Epoch": 4, "VLA Train/Learning Rate": 6.040664401219998e-05, "VLA Train/Loss": 0.05334499105811119, "VLA Train/Loss (Raw)": 0.05334499105811119, "VLA Train/Step": 6481, "VLA Train/Step Time": 5.510432720184326}
6482
  {"VLA Train/Epoch": 4, "VLA Train/Learning Rate": 6.0376279240174616e-05, "VLA Train/Loss": 0.04682043939828873, "VLA Train/Loss (Raw)": 0.04682043939828873, "VLA Train/Step": 6482, "VLA Train/Step Time": 6.023637056350708}
6483
  {"VLA Train/Epoch": 4, "VLA Train/Learning Rate": 6.034591880133977e-05, "VLA Train/Loss": 0.08309707045555115, "VLA Train/Loss (Raw)": 0.08309707045555115, "VLA Train/Step": 6483, "VLA Train/Step Time": 2.9199516773223877}
 
 
 
 
 
 
 
 
 
 
6481
  {"VLA Train/Epoch": 4, "VLA Train/Learning Rate": 6.040664401219998e-05, "VLA Train/Loss": 0.05334499105811119, "VLA Train/Loss (Raw)": 0.05334499105811119, "VLA Train/Step": 6481, "VLA Train/Step Time": 5.510432720184326}
6482
  {"VLA Train/Epoch": 4, "VLA Train/Learning Rate": 6.0376279240174616e-05, "VLA Train/Loss": 0.04682043939828873, "VLA Train/Loss (Raw)": 0.04682043939828873, "VLA Train/Step": 6482, "VLA Train/Step Time": 6.023637056350708}
6483
  {"VLA Train/Epoch": 4, "VLA Train/Learning Rate": 6.034591880133977e-05, "VLA Train/Loss": 0.08309707045555115, "VLA Train/Loss (Raw)": 0.08309707045555115, "VLA Train/Step": 6483, "VLA Train/Step Time": 2.9199516773223877}
6484
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 4.0000000000000003e-07, "VLA Train/Loss": 0.06522951275110245, "VLA Train/Loss (Raw)": 0.06522951275110245, "VLA Train/Step": 6001, "VLA Train/Step Time": 6.436929702758789}
6485
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 8.000000000000001e-07, "VLA Train/Loss": 0.06474851816892624, "VLA Train/Loss (Raw)": 0.06474851816892624, "VLA Train/Step": 6002, "VLA Train/Step Time": 5.487528562545776}
6486
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 1.2000000000000002e-06, "VLA Train/Loss": 0.06895651668310165, "VLA Train/Loss (Raw)": 0.06895651668310165, "VLA Train/Step": 6003, "VLA Train/Step Time": 5.5521111488342285}
6487
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 1.6000000000000001e-06, "VLA Train/Loss": 0.05894169583916664, "VLA Train/Loss (Raw)": 0.05894169583916664, "VLA Train/Step": 6004, "VLA Train/Step Time": 5.520312786102295}
6488
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 2.0000000000000003e-06, "VLA Train/Loss": 0.09873355180025101, "VLA Train/Loss (Raw)": 0.09873355180025101, "VLA Train/Step": 6005, "VLA Train/Step Time": 5.502042055130005}
6489
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 2.4000000000000003e-06, "VLA Train/Loss": 0.0857497826218605, "VLA Train/Loss (Raw)": 0.0857497826218605, "VLA Train/Step": 6006, "VLA Train/Step Time": 5.524176359176636}
6490
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 2.8000000000000003e-06, "VLA Train/Loss": 0.07324730604887009, "VLA Train/Loss (Raw)": 0.07324730604887009, "VLA Train/Step": 6007, "VLA Train/Step Time": 5.498190879821777}
6491
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 3.2000000000000003e-06, "VLA Train/Loss": 0.06469529122114182, "VLA Train/Loss (Raw)": 0.06469529122114182, "VLA Train/Step": 6008, "VLA Train/Step Time": 5.506431341171265}
6492
+ {"VLA Train/Epoch": 3, "VLA Train/Learning Rate": 3.6e-06, "VLA Train/Loss": 0.07000145316123962, "VLA Train/Loss (Raw)": 0.07000145316123962, "VLA Train/Step": 6009, "VLA Train/Step Time": 5.5216052532196045}
run-metrics.jsonl CHANGED
@@ -1 +1 @@
1
- {"hparams": {"action_dim": 7, "action_model_type": "DiT-L", "consolidate_type": "tome", "dataset_name": "realpushmultit", "episode_instructions_file": null, "fusion_type": "gate", "future_action_window_size": 15, "group_size": 16, "hf_token": ".hf_token", "image_aug": false, "image_key": "img_third", "instruction": "push the T-shaped block to visit each T-shaped target region on the table without revisiting any", "is_resume": false, "lora": {"alpha": 48.0, "cog_cross_targets": ["q_proj", "k_proj", "v_proj"], "dit_attn_targets": ["q", "v"], "dropout": 0.05, "enabled": true, "llama_alpha": 16.0, "llama_r": 8, "llama_targets": ["q_proj", "v_proj"], "lora_cog_gate": true, "lora_llama": true, "lora_vision": true, "r": 24, "vision_alpha": 16.0, "vision_r": 8, "vision_targets": ["qkv"]}, "mem_length": 16, "per_token_size": 256, "pretrained_checkpoint": "/workspace/diffusion_policy/pretrained/openvla-7b-prismatic/checkpoints/step-295000-epoch-40-loss=0.2200.pt", "repeated_diffusion_steps": 4, "resume_epoch": 0, "resume_step": 0, "retrieval_layers": 2, "run_id": "memoryvla_realpushmultit_lora_bs64_v1", "run_id_note": null, "run_root_dir": "runs/memoryvla_realpushmultit", "save_interval": 1000, "seed": 42, "trackers": ["jsonl", "wandb"], "update_fused": false, "use_ema": false, "use_timestep_pe": true, "val_ratio": 0.05, "vla": {"base_vlm": "siglip-224px+7b", "data_mix": "bridge", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "epochs": 100, "expected_world_size": 4, "freeze_llm_backbone": false, "freeze_vision_backbone": false, "global_batch_size": 256, "learning_rate": 0.0002, "lr_scheduler_type": "linear-warmup+cosine-decay", "max_grad_norm": 1.0, "max_steps": 10000, "per_device_batch_size": 64, "reduce_in_full_precision": true, "shuffle_buffer_size": 256000, "train_strategy": "fsdp-full-shard", "type": "siglip-224px+mx-bridge", "unfreeze_last_llm_layer": false, "vla_id": "siglip-224px+mx-bridge", "warmup_ratio": 0.05, "weight_decay": 
0.0}, "wandb_entity": "williamcao-uc-san-diego", "wandb_project": "memoryvla_realpushmultit_lora", "zarr_path": "data/real_push_multit/RealPushMultiT_320.zarr"}, "run_id": "memoryvla_realpushmultit_lora_bs64_v1"}
 
1
+ {"hparams": {"action_dim": 7, "action_model_type": "DiT-L", "consolidate_type": "tome", "dataset_name": "realpushmultit", "episode_instructions_file": null, "fusion_type": "gate", "future_action_window_size": 15, "group_size": 16, "hf_token": ".hf_token", "image_aug": false, "image_key": "img_third", "instruction": "push the T-shaped block to visit each T-shaped target region on the table without revisiting any", "is_resume": true, "lora": {"alpha": 48.0, "cog_cross_targets": ["q_proj", "k_proj", "v_proj"], "dit_attn_targets": ["q", "v"], "dropout": 0.05, "enabled": true, "llama_alpha": 16.0, "llama_r": 8, "llama_targets": ["q_proj", "v_proj"], "lora_cog_gate": true, "lora_llama": true, "lora_vision": true, "r": 24, "vision_alpha": 16.0, "vision_r": 8, "vision_targets": ["qkv"]}, "mem_length": 16, "per_token_size": 256, "pretrained_checkpoint": "/workspace/diffusion_policy/runs/memoryvla_realpushmultit/memoryvla_realpushmultit_lora_bs64_v1/checkpoints/step-006000-epoch-03-loss=0.0703.pt", "repeated_diffusion_steps": 4, "resume_epoch": 3, "resume_step": 6000, "retrieval_layers": 2, "run_id": "memoryvla_realpushmultit_lora_bs64_v1", "run_id_note": null, "run_root_dir": "runs/memoryvla_realpushmultit", "save_interval": 1000, "seed": 42, "trackers": ["jsonl", "wandb"], "update_fused": false, "use_ema": false, "use_timestep_pe": true, "val_ratio": 0.05, "vla": {"base_vlm": "prism-dinosiglip-224px+7b", "data_mix": "oxe_magic_soup_plus_minus", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "epochs": 100, "expected_world_size": 4, "freeze_llm_backbone": false, "freeze_vision_backbone": false, "global_batch_size": 256, "learning_rate": 0.0002, "lr_scheduler_type": "linear-warmup+cosine-decay", "max_grad_norm": 1.0, "max_steps": 10000, "per_device_batch_size": 64, "reduce_in_full_precision": true, "shuffle_buffer_size": 250000, "train_strategy": "fsdp-full-shard", "type": "prism-dinosiglip-224px+oxe+diffusion", "unfreeze_last_llm_layer": 
false, "vla_id": "prism-dinosiglip-224px+oxe+diffusion", "warmup_ratio": 0.05, "weight_decay": 0.0}, "wandb_entity": "williamcao-uc-san-diego", "wandb_project": "memoryvla_realpushmultit_lora", "zarr_path": "data/real_push_multit/RealPushMultiT_320.zarr"}, "run_id": "memoryvla_realpushmultit_lora_bs64_v1"}