lihao3 commited on
Commit ·
dc495af
0
Parent(s):
Add model checkpoint
Browse files- .gitattributes +1 -0
- README.md +30 -0
- step2/README.md +23 -0
- step2/checkpoints/reshape_embedding_step2.pt +3 -0
- step2/config.json +52 -0
- step2/config.yaml +48 -0
- step2/dataset_statistics.json +252 -0
- step2/run-metrics.jsonl +1 -0
- step4/README.md +23 -0
- step4/checkpoints/reshape_embedding_step4.pt +3 -0
- step4/config.json +52 -0
- step4/config.yaml +48 -0
- step4/dataset_statistics.json +252 -0
- step4/run-metrics.jsonl +1 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model:
|
| 3 |
+
- openvla/openvla-7b-prismatic
|
| 4 |
+
- CogACT/CogACT-Base
|
| 5 |
+
tags:
|
| 6 |
+
- robotics
|
| 7 |
+
- vla
|
| 8 |
+
- multimodal
|
| 9 |
+
- pretraining
|
| 10 |
+
pipeline_tag: robotics
|
| 11 |
+
---
|
| 12 |
+
# Model Card for CronusVLA
|
| 13 |
+
|
| 14 |
+
**Weights**
|
| 15 |
+
|
| 16 |
+
`step2/checkpoints/reshape_embedding_step2.pt`:
|
| 17 |
+
|
| 18 |
+
- (1) This checkpoint serves as the starting point for fine-tuning on the LIBERO benchmark.
|
| 19 |
+
- (2) It is a post-trained model derived from CronusVLA-7B, configured with a two-step frame (history length = 1).
|
| 20 |
+
- (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the Libero setting of OpenVLA-oft (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
|
| 21 |
+
- (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
|
| 22 |
+
|
| 23 |
+
`step4/checkpoints/reshape_embedding_step4.pt`:
|
| 24 |
+
|
| 25 |
+
- (1) This checkpoint serves as the starting point for fine-tuning on the LIBERO benchmark.
|
| 26 |
+
- (2) It is a post-trained model derived from CronusVLA-7B, configured with a four-step frame (history length = 3).
|
| 27 |
+
- (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the Libero setting of OpenVLA-oft (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
|
| 28 |
+
- (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
|
| 29 |
+
|
| 30 |
+
> If you want to use these checkpoints for fine-tuning, please follow the instructions of [CronusVLA](https://github.com/InternRobotics/CronusVLA).
|
step2/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model:
|
| 3 |
+
- openvla/openvla-7b-prismatic
|
| 4 |
+
- CogACT/CogACT-Base
|
| 5 |
+
tags:
|
| 6 |
+
- robotics
|
| 7 |
+
- vla
|
| 8 |
+
- multimodal
|
| 9 |
+
- pretraining
|
| 10 |
+
pipeline_tag: robotics
|
| 11 |
+
---
|
| 12 |
+
# Model Card for CronusVLA
|
| 13 |
+
|
| 14 |
+
**Weights**
|
| 15 |
+
|
| 16 |
+
`checkpoints/reshape_embedding_step2.pt`:
|
| 17 |
+
|
| 18 |
+
- (1) This checkpoint serves as the starting point for fine-tuning on the Libero benchmark.
|
| 19 |
+
- (2) It is a post-trained model derived from CronusVLA-7B, configured with a two-step frame (history length = 1).
|
| 20 |
+
- (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the Libero setting of OpenVLA-oft (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
|
| 21 |
+
- (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
|
| 22 |
+
|
| 23 |
+
> If you want to use these checkpoints for fine-tuning, please follow the instructions of [CronusVLA](https://github.com/InternRobotics/CronusVLA).
|
step2/checkpoints/reshape_embedding_step2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc133922d2b5fce6ef42476b4557d26de4789da444cf24afff1417dc05e9f612
|
| 3 |
+
size 30703958828
|
step2/config.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"action_dim": 7,
|
| 3 |
+
"action_model_type": "DiT-B",
|
| 4 |
+
"data_root_dir": "",
|
| 5 |
+
"debug": false,
|
| 6 |
+
"future_action_window_size": 15,
|
| 7 |
+
"hf_token": "hf_token",
|
| 8 |
+
"image_aug": true,
|
| 9 |
+
"is_resume": false,
|
| 10 |
+
"load_all_data_for_training": true,
|
| 11 |
+
"past_action_window_size": 1,
|
| 12 |
+
"pretrained_checkpoint": "path/to/ckpt",
|
| 13 |
+
"repeated_diffusion_steps": 4,
|
| 14 |
+
"resume_epoch": null,
|
| 15 |
+
"resume_step": null,
|
| 16 |
+
"run_id": "step2_7B",
|
| 17 |
+
"run_id_note": null,
|
| 18 |
+
"run_root_dir": "outputs/step2_7B",
|
| 19 |
+
"save_interval": 2500,
|
| 20 |
+
"seed": 42,
|
| 21 |
+
"trackers": [
|
| 22 |
+
"jsonl",
|
| 23 |
+
"wandb"
|
| 24 |
+
],
|
| 25 |
+
"use_ema": false,
|
| 26 |
+
"vla": {
|
| 27 |
+
"base_vlm": "prism-dinosiglip-224px+7b",
|
| 28 |
+
"data_mix": "bridge_rt_1",
|
| 29 |
+
"enable_gradient_checkpointing": true,
|
| 30 |
+
"enable_mixed_precision_training": true,
|
| 31 |
+
"epochs": 100,
|
| 32 |
+
"expected_world_size": 16,
|
| 33 |
+
"freeze_llm_backbone": false,
|
| 34 |
+
"freeze_vision_backbone": false,
|
| 35 |
+
"global_batch_size": 256,
|
| 36 |
+
"learning_rate": 2e-05,
|
| 37 |
+
"lr_scheduler_type": "constant",
|
| 38 |
+
"max_grad_norm": 1.0,
|
| 39 |
+
"max_steps": null,
|
| 40 |
+
"per_device_batch_size": 16,
|
| 41 |
+
"reduce_in_full_precision": true,
|
| 42 |
+
"shuffle_buffer_size": 250000,
|
| 43 |
+
"train_strategy": "fsdp-full-shard",
|
| 44 |
+
"type": "prism-dinosiglip-224px+oxe+diffusion",
|
| 45 |
+
"unfreeze_last_llm_layer": false,
|
| 46 |
+
"vla_id": "prism-dinosiglip-224px+oxe+diffusion",
|
| 47 |
+
"warmup_ratio": 0.0,
|
| 48 |
+
"weight_decay": 0.0
|
| 49 |
+
},
|
| 50 |
+
"wandb_entity": "",
|
| 51 |
+
"wandb_project": ""
|
| 52 |
+
}
|
step2/config.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
action_dim: 7
|
| 2 |
+
action_model_type: DiT-B
|
| 3 |
+
data_root_dir: ""
|
| 4 |
+
debug: false
|
| 5 |
+
future_action_window_size: 15
|
| 6 |
+
hf_token: hf_token
|
| 7 |
+
image_aug: true
|
| 8 |
+
is_resume: false
|
| 9 |
+
load_all_data_for_training: true
|
| 10 |
+
past_action_window_size: 1
|
| 11 |
+
pretrained_checkpoint: "path/to/ckpt"
|
| 12 |
+
repeated_diffusion_steps: 4
|
| 13 |
+
resume_epoch: null
|
| 14 |
+
resume_step: null
|
| 15 |
+
run_id: step2_7B
|
| 16 |
+
run_id_note: null
|
| 17 |
+
run_root_dir: outputs/step2_7B
|
| 18 |
+
save_interval: 2500
|
| 19 |
+
seed: 42
|
| 20 |
+
trackers:
|
| 21 |
+
- jsonl
|
| 22 |
+
- wandb
|
| 23 |
+
use_ema: false
|
| 24 |
+
vla:
|
| 25 |
+
base_vlm: prism-dinosiglip-224px+7b
|
| 26 |
+
data_mix: bridge_rt_1
|
| 27 |
+
enable_gradient_checkpointing: true
|
| 28 |
+
enable_mixed_precision_training: true
|
| 29 |
+
epochs: 100
|
| 30 |
+
expected_world_size: 16
|
| 31 |
+
freeze_llm_backbone: false
|
| 32 |
+
freeze_vision_backbone: false
|
| 33 |
+
global_batch_size: 256
|
| 34 |
+
learning_rate: 2.0e-05
|
| 35 |
+
lr_scheduler_type: constant
|
| 36 |
+
max_grad_norm: 1.0
|
| 37 |
+
max_steps: null
|
| 38 |
+
per_device_batch_size: 16
|
| 39 |
+
reduce_in_full_precision: true
|
| 40 |
+
shuffle_buffer_size: 250000
|
| 41 |
+
train_strategy: fsdp-full-shard
|
| 42 |
+
type: prism-dinosiglip-224px+oxe+diffusion
|
| 43 |
+
unfreeze_last_llm_layer: false
|
| 44 |
+
vla_id: prism-dinosiglip-224px+oxe+diffusion
|
| 45 |
+
warmup_ratio: 0.0
|
| 46 |
+
weight_decay: 0.0
|
| 47 |
+
wandb_entity: ""
|
| 48 |
+
wandb_project: ""
|
step2/dataset_statistics.json
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bridge_dataset": {
|
| 3 |
+
"action": {
|
| 4 |
+
"mean": [
|
| 5 |
+
0.00023341973428614438,
|
| 6 |
+
0.0001300475705647841,
|
| 7 |
+
-0.00012762400729116052,
|
| 8 |
+
-0.00015565499779768288,
|
| 9 |
+
-0.00040393517701886594,
|
| 10 |
+
0.0002355772303417325,
|
| 11 |
+
0.5764579772949219
|
| 12 |
+
],
|
| 13 |
+
"std": [
|
| 14 |
+
0.009765934199094772,
|
| 15 |
+
0.013689189217984676,
|
| 16 |
+
0.012667394243180752,
|
| 17 |
+
0.028534121811389923,
|
| 18 |
+
0.030637938529253006,
|
| 19 |
+
0.07691467553377151,
|
| 20 |
+
0.4973696768283844
|
| 21 |
+
],
|
| 22 |
+
"max": [
|
| 23 |
+
0.41691166162490845,
|
| 24 |
+
0.25864794850349426,
|
| 25 |
+
0.21218234300613403,
|
| 26 |
+
3.122201919555664,
|
| 27 |
+
1.8618112802505493,
|
| 28 |
+
6.280478477478027,
|
| 29 |
+
1.0
|
| 30 |
+
],
|
| 31 |
+
"min": [
|
| 32 |
+
-0.4007510244846344,
|
| 33 |
+
-0.13874775171279907,
|
| 34 |
+
-0.22553899884223938,
|
| 35 |
+
-3.2010786533355713,
|
| 36 |
+
-1.8618112802505493,
|
| 37 |
+
-6.279075622558594,
|
| 38 |
+
0.0
|
| 39 |
+
],
|
| 40 |
+
"q01": [
|
| 41 |
+
-0.02872725307941437,
|
| 42 |
+
-0.04170349963009357,
|
| 43 |
+
-0.026093858778476715,
|
| 44 |
+
-0.08092105075716972,
|
| 45 |
+
-0.09288699507713317,
|
| 46 |
+
-0.20718276381492615,
|
| 47 |
+
0.0
|
| 48 |
+
],
|
| 49 |
+
"q99": [
|
| 50 |
+
0.028309678435325586,
|
| 51 |
+
0.040855254605412394,
|
| 52 |
+
0.040161586627364146,
|
| 53 |
+
0.08192047759890528,
|
| 54 |
+
0.07792850524187081,
|
| 55 |
+
0.20382574498653397,
|
| 56 |
+
1.0
|
| 57 |
+
],
|
| 58 |
+
"mask": [
|
| 59 |
+
true,
|
| 60 |
+
true,
|
| 61 |
+
true,
|
| 62 |
+
true,
|
| 63 |
+
true,
|
| 64 |
+
true,
|
| 65 |
+
false
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"proprio": {
|
| 69 |
+
"mean": [
|
| 70 |
+
0.0,
|
| 71 |
+
0.0,
|
| 72 |
+
0.0,
|
| 73 |
+
0.0,
|
| 74 |
+
0.0,
|
| 75 |
+
0.0,
|
| 76 |
+
0.0
|
| 77 |
+
],
|
| 78 |
+
"std": [
|
| 79 |
+
0.0,
|
| 80 |
+
0.0,
|
| 81 |
+
0.0,
|
| 82 |
+
0.0,
|
| 83 |
+
0.0,
|
| 84 |
+
0.0,
|
| 85 |
+
0.0
|
| 86 |
+
],
|
| 87 |
+
"max": [
|
| 88 |
+
0.0,
|
| 89 |
+
0.0,
|
| 90 |
+
0.0,
|
| 91 |
+
0.0,
|
| 92 |
+
0.0,
|
| 93 |
+
0.0,
|
| 94 |
+
0.0
|
| 95 |
+
],
|
| 96 |
+
"min": [
|
| 97 |
+
0.0,
|
| 98 |
+
0.0,
|
| 99 |
+
0.0,
|
| 100 |
+
0.0,
|
| 101 |
+
0.0,
|
| 102 |
+
0.0,
|
| 103 |
+
0.0
|
| 104 |
+
],
|
| 105 |
+
"q01": [
|
| 106 |
+
0.0,
|
| 107 |
+
0.0,
|
| 108 |
+
0.0,
|
| 109 |
+
0.0,
|
| 110 |
+
0.0,
|
| 111 |
+
0.0,
|
| 112 |
+
0.0
|
| 113 |
+
],
|
| 114 |
+
"q99": [
|
| 115 |
+
0.0,
|
| 116 |
+
0.0,
|
| 117 |
+
0.0,
|
| 118 |
+
0.0,
|
| 119 |
+
0.0,
|
| 120 |
+
0.0,
|
| 121 |
+
0.0
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"num_transitions": 2135463,
|
| 125 |
+
"num_trajectories": 60064
|
| 126 |
+
},
|
| 127 |
+
"fractal20220817_data": {
|
| 128 |
+
"action": {
|
| 129 |
+
"mean": [
|
| 130 |
+
0.006987491622567177,
|
| 131 |
+
0.00626587588340044,
|
| 132 |
+
-0.012625089846551418,
|
| 133 |
+
0.04333178699016571,
|
| 134 |
+
-0.005756180267781019,
|
| 135 |
+
0.0009131028782576323,
|
| 136 |
+
0.5354204773902893
|
| 137 |
+
],
|
| 138 |
+
"std": [
|
| 139 |
+
0.06921201944351196,
|
| 140 |
+
0.059655144810676575,
|
| 141 |
+
0.0735311210155487,
|
| 142 |
+
0.15610052645206451,
|
| 143 |
+
0.131641685962677,
|
| 144 |
+
0.14593306183815002,
|
| 145 |
+
0.49710750579833984
|
| 146 |
+
],
|
| 147 |
+
"max": [
|
| 148 |
+
2.9984593391418457,
|
| 149 |
+
22.09052848815918,
|
| 150 |
+
2.7507524490356445,
|
| 151 |
+
1.570636510848999,
|
| 152 |
+
1.5321086645126343,
|
| 153 |
+
1.5691522359848022,
|
| 154 |
+
1.0
|
| 155 |
+
],
|
| 156 |
+
"min": [
|
| 157 |
+
-2.0204520225524902,
|
| 158 |
+
-5.497899532318115,
|
| 159 |
+
-2.031663417816162,
|
| 160 |
+
-1.569917917251587,
|
| 161 |
+
-1.569892168045044,
|
| 162 |
+
-1.570419430732727,
|
| 163 |
+
0.0
|
| 164 |
+
],
|
| 165 |
+
"q01": [
|
| 166 |
+
-0.22453527510166169,
|
| 167 |
+
-0.14820013284683228,
|
| 168 |
+
-0.231589707583189,
|
| 169 |
+
-0.3517994859814644,
|
| 170 |
+
-0.4193011274933815,
|
| 171 |
+
-0.43643461108207704,
|
| 172 |
+
0.0
|
| 173 |
+
],
|
| 174 |
+
"q99": [
|
| 175 |
+
0.17824687153100965,
|
| 176 |
+
0.14938379630446405,
|
| 177 |
+
0.21842354819178575,
|
| 178 |
+
0.5892666035890578,
|
| 179 |
+
0.35272657424211445,
|
| 180 |
+
0.44796681255102094,
|
| 181 |
+
1.0
|
| 182 |
+
],
|
| 183 |
+
"mask": [
|
| 184 |
+
true,
|
| 185 |
+
true,
|
| 186 |
+
true,
|
| 187 |
+
true,
|
| 188 |
+
true,
|
| 189 |
+
true,
|
| 190 |
+
false
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
"proprio": {
|
| 194 |
+
"mean": [
|
| 195 |
+
0.0,
|
| 196 |
+
0.0,
|
| 197 |
+
0.0,
|
| 198 |
+
0.0,
|
| 199 |
+
0.0,
|
| 200 |
+
0.0,
|
| 201 |
+
0.0
|
| 202 |
+
],
|
| 203 |
+
"std": [
|
| 204 |
+
0.0,
|
| 205 |
+
0.0,
|
| 206 |
+
0.0,
|
| 207 |
+
0.0,
|
| 208 |
+
0.0,
|
| 209 |
+
0.0,
|
| 210 |
+
0.0
|
| 211 |
+
],
|
| 212 |
+
"max": [
|
| 213 |
+
0.0,
|
| 214 |
+
0.0,
|
| 215 |
+
0.0,
|
| 216 |
+
0.0,
|
| 217 |
+
0.0,
|
| 218 |
+
0.0,
|
| 219 |
+
0.0
|
| 220 |
+
],
|
| 221 |
+
"min": [
|
| 222 |
+
0.0,
|
| 223 |
+
0.0,
|
| 224 |
+
0.0,
|
| 225 |
+
0.0,
|
| 226 |
+
0.0,
|
| 227 |
+
0.0,
|
| 228 |
+
0.0
|
| 229 |
+
],
|
| 230 |
+
"q01": [
|
| 231 |
+
0.0,
|
| 232 |
+
0.0,
|
| 233 |
+
0.0,
|
| 234 |
+
0.0,
|
| 235 |
+
0.0,
|
| 236 |
+
0.0,
|
| 237 |
+
0.0
|
| 238 |
+
],
|
| 239 |
+
"q99": [
|
| 240 |
+
0.0,
|
| 241 |
+
0.0,
|
| 242 |
+
0.0,
|
| 243 |
+
0.0,
|
| 244 |
+
0.0,
|
| 245 |
+
0.0,
|
| 246 |
+
0.0
|
| 247 |
+
]
|
| 248 |
+
},
|
| 249 |
+
"num_transitions": 3786400,
|
| 250 |
+
"num_trajectories": 87212
|
| 251 |
+
}
|
| 252 |
+
}
|
step2/run-metrics.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hparams": {"action_dim": 7, "action_model_type": "DiT-B", "data_root_dir": "", "debug": false, "future_action_window_size": 15, "hf_token": "hf_token", "image_aug": true, "is_resume": false, "load_all_data_for_training": true, "past_action_window_size": 1, "pretrained_checkpoint": "path/to/ckpt", "repeated_diffusion_steps": 4, "resume_epoch": null, "resume_step": null, "run_id": "step2_7B", "run_id_note": null, "run_root_dir": "outputs/step2_7B", "save_interval": 5000, "seed": 42, "trackers": ["jsonl", "wandb"], "use_ema": false, "vla": {"base_vlm": "prism-dinosiglip-224px+7b", "data_mix": "bridge_rt_1", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "epochs": 100, "expected_world_size": 16, "freeze_llm_backbone": false, "freeze_vision_backbone": false, "global_batch_size": 256, "learning_rate": 2e-05, "lr_scheduler_type": "constant", "max_grad_norm": 1.0, "max_steps": null, "per_device_batch_size": 16, "reduce_in_full_precision": true, "shuffle_buffer_size": 250000, "train_strategy": "fsdp-full-shard", "type": "prism-dinosiglip-224px+oxe+diffusion", "unfreeze_last_llm_layer": false, "vla_id": "prism-dinosiglip-224px+oxe+diffusion", "warmup_ratio": 0.0, "weight_decay": 0.0}, "wandb_entity": "", "wandb_project": ""}, "run_id": "step2_7B"}
|
step4/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model:
|
| 3 |
+
- openvla/openvla-7b-prismatic
|
| 4 |
+
- CogACT/CogACT-Base
|
| 5 |
+
tags:
|
| 6 |
+
- robotics
|
| 7 |
+
- vla
|
| 8 |
+
- multimodal
|
| 9 |
+
- pretraining
|
| 10 |
+
pipeline_tag: robotics
|
| 11 |
+
---
|
| 12 |
+
# Model Card for CronusVLA
|
| 13 |
+
|
| 14 |
+
**Weights**
|
| 15 |
+
|
| 16 |
+
`step4/checkpoints/reshape_embedding_step4.pt`:
|
| 17 |
+
|
| 18 |
+
- (1) This checkpoint serves as the starting point for fine-tuning on the Libero benchmark.
|
| 19 |
+
- (2) It is a post-trained model derived from CronusVLA-7B, configured with a four-step frame (history length = 3).
|
| 20 |
+
- (3) During the original post-training phase, the model was trained to predict future actions with a chunk size of 16. To ensure compatibility with the Libero setting of OpenVLA-oft (chunk = 8), we modified the position embeddings responsible for action chunking in the original checkpoint.
|
| 21 |
+
- (4) Note: This checkpoint is not directly evaluable and is intended only as a fine-tuning initialization.
|
| 22 |
+
|
| 23 |
+
> If you want to use these checkpoints for fine-tuning, please follow the instructions of [CronusVLA](https://github.com/InternRobotics/CronusVLA).
|
step4/checkpoints/reshape_embedding_step4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2142c243d7b87f29396bcedf242b2f5fd170bfca8895a773a70c5847c45a91c
|
| 3 |
+
size 30703997740
|
step4/config.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"action_dim": 7,
|
| 3 |
+
"action_model_type": "DiT-B",
|
| 4 |
+
"data_root_dir": "",
|
| 5 |
+
"debug": false,
|
| 6 |
+
"future_action_window_size": 15,
|
| 7 |
+
"hf_token": "hf_token",
|
| 8 |
+
"image_aug": true,
|
| 9 |
+
"is_resume": false,
|
| 10 |
+
"load_all_data_for_training": true,
|
| 11 |
+
"past_action_window_size": 3,
|
| 12 |
+
"pretrained_checkpoint": "path/to/ckpt",
|
| 13 |
+
"repeated_diffusion_steps": 4,
|
| 14 |
+
"resume_epoch": null,
|
| 15 |
+
"resume_step": null,
|
| 16 |
+
"run_id": "step4_7B",
|
| 17 |
+
"run_id_note": null,
|
| 18 |
+
"run_root_dir": "outputs/step4_7B",
|
| 19 |
+
"save_interval": 2500,
|
| 20 |
+
"seed": 42,
|
| 21 |
+
"trackers": [
|
| 22 |
+
"jsonl",
|
| 23 |
+
"wandb"
|
| 24 |
+
],
|
| 25 |
+
"use_ema": false,
|
| 26 |
+
"vla": {
|
| 27 |
+
"base_vlm": "prism-dinosiglip-224px+7b",
|
| 28 |
+
"data_mix": "bridge_rt_1",
|
| 29 |
+
"enable_gradient_checkpointing": true,
|
| 30 |
+
"enable_mixed_precision_training": true,
|
| 31 |
+
"epochs": 100,
|
| 32 |
+
"expected_world_size": 64,
|
| 33 |
+
"freeze_llm_backbone": false,
|
| 34 |
+
"freeze_vision_backbone": false,
|
| 35 |
+
"global_batch_size": 512,
|
| 36 |
+
"learning_rate": 4e-05,
|
| 37 |
+
"lr_scheduler_type": "constant",
|
| 38 |
+
"max_grad_norm": 1.0,
|
| 39 |
+
"max_steps": null,
|
| 40 |
+
"per_device_batch_size": 8,
|
| 41 |
+
"reduce_in_full_precision": true,
|
| 42 |
+
"shuffle_buffer_size": 250000,
|
| 43 |
+
"train_strategy": "fsdp-full-shard",
|
| 44 |
+
"type": "prism-dinosiglip-224px+oxe+diffusion",
|
| 45 |
+
"unfreeze_last_llm_layer": false,
|
| 46 |
+
"vla_id": "prism-dinosiglip-224px+oxe+diffusion",
|
| 47 |
+
"warmup_ratio": 0.0,
|
| 48 |
+
"weight_decay": 0.0
|
| 49 |
+
},
|
| 50 |
+
"wandb_entity": "",
|
| 51 |
+
"wandb_project": ""
|
| 52 |
+
}
|
step4/config.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
action_dim: 7
|
| 2 |
+
action_model_type: DiT-B
|
| 3 |
+
data_root_dir: ""
|
| 4 |
+
debug: false
|
| 5 |
+
future_action_window_size: 15
|
| 6 |
+
hf_token: hf_token
|
| 7 |
+
image_aug: true
|
| 8 |
+
is_resume: false
|
| 9 |
+
load_all_data_for_training: true
|
| 10 |
+
past_action_window_size: 3
|
| 11 |
+
pretrained_checkpoint: "path/to/ckpt"
|
| 12 |
+
repeated_diffusion_steps: 4
|
| 13 |
+
resume_epoch: null
|
| 14 |
+
resume_step: null
|
| 15 |
+
run_id: step4_7B
|
| 16 |
+
run_id_note: null
|
| 17 |
+
run_root_dir: outputs/step4_7B
|
| 18 |
+
save_interval: 2500
|
| 19 |
+
seed: 42
|
| 20 |
+
trackers:
|
| 21 |
+
- jsonl
|
| 22 |
+
- wandb
|
| 23 |
+
use_ema: false
|
| 24 |
+
vla:
|
| 25 |
+
base_vlm: prism-dinosiglip-224px+7b
|
| 26 |
+
data_mix: bridge_rt_1
|
| 27 |
+
enable_gradient_checkpointing: true
|
| 28 |
+
enable_mixed_precision_training: true
|
| 29 |
+
epochs: 100
|
| 30 |
+
expected_world_size: 64
|
| 31 |
+
freeze_llm_backbone: false
|
| 32 |
+
freeze_vision_backbone: false
|
| 33 |
+
global_batch_size: 512
|
| 34 |
+
learning_rate: 4.0e-05
|
| 35 |
+
lr_scheduler_type: constant
|
| 36 |
+
max_grad_norm: 1.0
|
| 37 |
+
max_steps: null
|
| 38 |
+
per_device_batch_size: 8
|
| 39 |
+
reduce_in_full_precision: true
|
| 40 |
+
shuffle_buffer_size: 250000
|
| 41 |
+
train_strategy: fsdp-full-shard
|
| 42 |
+
type: prism-dinosiglip-224px+oxe+diffusion
|
| 43 |
+
unfreeze_last_llm_layer: false
|
| 44 |
+
vla_id: prism-dinosiglip-224px+oxe+diffusion
|
| 45 |
+
warmup_ratio: 0.0
|
| 46 |
+
weight_decay: 0.0
|
| 47 |
+
wandb_entity: ""
|
| 48 |
+
wandb_project: ""
|
step4/dataset_statistics.json
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bridge_dataset": {
|
| 3 |
+
"action": {
|
| 4 |
+
"mean": [
|
| 5 |
+
0.00023341973428614438,
|
| 6 |
+
0.0001300475705647841,
|
| 7 |
+
-0.00012762400729116052,
|
| 8 |
+
-0.00015565499779768288,
|
| 9 |
+
-0.00040393517701886594,
|
| 10 |
+
0.0002355772303417325,
|
| 11 |
+
0.5764579772949219
|
| 12 |
+
],
|
| 13 |
+
"std": [
|
| 14 |
+
0.009765934199094772,
|
| 15 |
+
0.013689189217984676,
|
| 16 |
+
0.012667394243180752,
|
| 17 |
+
0.028534121811389923,
|
| 18 |
+
0.030637938529253006,
|
| 19 |
+
0.07691467553377151,
|
| 20 |
+
0.4973696768283844
|
| 21 |
+
],
|
| 22 |
+
"max": [
|
| 23 |
+
0.41691166162490845,
|
| 24 |
+
0.25864794850349426,
|
| 25 |
+
0.21218234300613403,
|
| 26 |
+
3.122201919555664,
|
| 27 |
+
1.8618112802505493,
|
| 28 |
+
6.280478477478027,
|
| 29 |
+
1.0
|
| 30 |
+
],
|
| 31 |
+
"min": [
|
| 32 |
+
-0.4007510244846344,
|
| 33 |
+
-0.13874775171279907,
|
| 34 |
+
-0.22553899884223938,
|
| 35 |
+
-3.2010786533355713,
|
| 36 |
+
-1.8618112802505493,
|
| 37 |
+
-6.279075622558594,
|
| 38 |
+
0.0
|
| 39 |
+
],
|
| 40 |
+
"q01": [
|
| 41 |
+
-0.02872725307941437,
|
| 42 |
+
-0.04170349963009357,
|
| 43 |
+
-0.026093858778476715,
|
| 44 |
+
-0.08092105075716972,
|
| 45 |
+
-0.09288699507713317,
|
| 46 |
+
-0.20718276381492615,
|
| 47 |
+
0.0
|
| 48 |
+
],
|
| 49 |
+
"q99": [
|
| 50 |
+
0.028309678435325586,
|
| 51 |
+
0.040855254605412394,
|
| 52 |
+
0.040161586627364146,
|
| 53 |
+
0.08192047759890528,
|
| 54 |
+
0.07792850524187081,
|
| 55 |
+
0.20382574498653397,
|
| 56 |
+
1.0
|
| 57 |
+
],
|
| 58 |
+
"mask": [
|
| 59 |
+
true,
|
| 60 |
+
true,
|
| 61 |
+
true,
|
| 62 |
+
true,
|
| 63 |
+
true,
|
| 64 |
+
true,
|
| 65 |
+
false
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"proprio": {
|
| 69 |
+
"mean": [
|
| 70 |
+
0.0,
|
| 71 |
+
0.0,
|
| 72 |
+
0.0,
|
| 73 |
+
0.0,
|
| 74 |
+
0.0,
|
| 75 |
+
0.0,
|
| 76 |
+
0.0
|
| 77 |
+
],
|
| 78 |
+
"std": [
|
| 79 |
+
0.0,
|
| 80 |
+
0.0,
|
| 81 |
+
0.0,
|
| 82 |
+
0.0,
|
| 83 |
+
0.0,
|
| 84 |
+
0.0,
|
| 85 |
+
0.0
|
| 86 |
+
],
|
| 87 |
+
"max": [
|
| 88 |
+
0.0,
|
| 89 |
+
0.0,
|
| 90 |
+
0.0,
|
| 91 |
+
0.0,
|
| 92 |
+
0.0,
|
| 93 |
+
0.0,
|
| 94 |
+
0.0
|
| 95 |
+
],
|
| 96 |
+
"min": [
|
| 97 |
+
0.0,
|
| 98 |
+
0.0,
|
| 99 |
+
0.0,
|
| 100 |
+
0.0,
|
| 101 |
+
0.0,
|
| 102 |
+
0.0,
|
| 103 |
+
0.0
|
| 104 |
+
],
|
| 105 |
+
"q01": [
|
| 106 |
+
0.0,
|
| 107 |
+
0.0,
|
| 108 |
+
0.0,
|
| 109 |
+
0.0,
|
| 110 |
+
0.0,
|
| 111 |
+
0.0,
|
| 112 |
+
0.0
|
| 113 |
+
],
|
| 114 |
+
"q99": [
|
| 115 |
+
0.0,
|
| 116 |
+
0.0,
|
| 117 |
+
0.0,
|
| 118 |
+
0.0,
|
| 119 |
+
0.0,
|
| 120 |
+
0.0,
|
| 121 |
+
0.0
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"num_transitions": 2135463,
|
| 125 |
+
"num_trajectories": 60064
|
| 126 |
+
},
|
| 127 |
+
"fractal20220817_data": {
|
| 128 |
+
"action": {
|
| 129 |
+
"mean": [
|
| 130 |
+
0.006987491622567177,
|
| 131 |
+
0.00626587588340044,
|
| 132 |
+
-0.012625089846551418,
|
| 133 |
+
0.04333178699016571,
|
| 134 |
+
-0.005756180267781019,
|
| 135 |
+
0.0009131028782576323,
|
| 136 |
+
0.5354204773902893
|
| 137 |
+
],
|
| 138 |
+
"std": [
|
| 139 |
+
0.06921201944351196,
|
| 140 |
+
0.059655144810676575,
|
| 141 |
+
0.0735311210155487,
|
| 142 |
+
0.15610052645206451,
|
| 143 |
+
0.131641685962677,
|
| 144 |
+
0.14593306183815002,
|
| 145 |
+
0.49710750579833984
|
| 146 |
+
],
|
| 147 |
+
"max": [
|
| 148 |
+
2.9984593391418457,
|
| 149 |
+
22.09052848815918,
|
| 150 |
+
2.7507524490356445,
|
| 151 |
+
1.570636510848999,
|
| 152 |
+
1.5321086645126343,
|
| 153 |
+
1.5691522359848022,
|
| 154 |
+
1.0
|
| 155 |
+
],
|
| 156 |
+
"min": [
|
| 157 |
+
-2.0204520225524902,
|
| 158 |
+
-5.497899532318115,
|
| 159 |
+
-2.031663417816162,
|
| 160 |
+
-1.569917917251587,
|
| 161 |
+
-1.569892168045044,
|
| 162 |
+
-1.570419430732727,
|
| 163 |
+
0.0
|
| 164 |
+
],
|
| 165 |
+
"q01": [
|
| 166 |
+
-0.22453527510166169,
|
| 167 |
+
-0.14820013284683228,
|
| 168 |
+
-0.231589707583189,
|
| 169 |
+
-0.3517994859814644,
|
| 170 |
+
-0.4193011274933815,
|
| 171 |
+
-0.43643461108207704,
|
| 172 |
+
0.0
|
| 173 |
+
],
|
| 174 |
+
"q99": [
|
| 175 |
+
0.17824687153100965,
|
| 176 |
+
0.14938379630446405,
|
| 177 |
+
0.21842354819178575,
|
| 178 |
+
0.5892666035890578,
|
| 179 |
+
0.35272657424211445,
|
| 180 |
+
0.44796681255102094,
|
| 181 |
+
1.0
|
| 182 |
+
],
|
| 183 |
+
"mask": [
|
| 184 |
+
true,
|
| 185 |
+
true,
|
| 186 |
+
true,
|
| 187 |
+
true,
|
| 188 |
+
true,
|
| 189 |
+
true,
|
| 190 |
+
false
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
"proprio": {
|
| 194 |
+
"mean": [
|
| 195 |
+
0.0,
|
| 196 |
+
0.0,
|
| 197 |
+
0.0,
|
| 198 |
+
0.0,
|
| 199 |
+
0.0,
|
| 200 |
+
0.0,
|
| 201 |
+
0.0
|
| 202 |
+
],
|
| 203 |
+
"std": [
|
| 204 |
+
0.0,
|
| 205 |
+
0.0,
|
| 206 |
+
0.0,
|
| 207 |
+
0.0,
|
| 208 |
+
0.0,
|
| 209 |
+
0.0,
|
| 210 |
+
0.0
|
| 211 |
+
],
|
| 212 |
+
"max": [
|
| 213 |
+
0.0,
|
| 214 |
+
0.0,
|
| 215 |
+
0.0,
|
| 216 |
+
0.0,
|
| 217 |
+
0.0,
|
| 218 |
+
0.0,
|
| 219 |
+
0.0
|
| 220 |
+
],
|
| 221 |
+
"min": [
|
| 222 |
+
0.0,
|
| 223 |
+
0.0,
|
| 224 |
+
0.0,
|
| 225 |
+
0.0,
|
| 226 |
+
0.0,
|
| 227 |
+
0.0,
|
| 228 |
+
0.0
|
| 229 |
+
],
|
| 230 |
+
"q01": [
|
| 231 |
+
0.0,
|
| 232 |
+
0.0,
|
| 233 |
+
0.0,
|
| 234 |
+
0.0,
|
| 235 |
+
0.0,
|
| 236 |
+
0.0,
|
| 237 |
+
0.0
|
| 238 |
+
],
|
| 239 |
+
"q99": [
|
| 240 |
+
0.0,
|
| 241 |
+
0.0,
|
| 242 |
+
0.0,
|
| 243 |
+
0.0,
|
| 244 |
+
0.0,
|
| 245 |
+
0.0,
|
| 246 |
+
0.0
|
| 247 |
+
]
|
| 248 |
+
},
|
| 249 |
+
"num_transitions": 3786400,
|
| 250 |
+
"num_trajectories": 87212
|
| 251 |
+
}
|
| 252 |
+
}
|
step4/run-metrics.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"hparams": {"action_dim": 7, "action_model_type": "DiT-B", "data_root_dir": "", "debug": false, "future_action_window_size": 15, "hf_token": "hf_token", "image_aug": true, "is_resume": false, "load_all_data_for_training": true, "past_action_window_size": 3, "pretrained_checkpoint": "path/to/ckpt", "repeated_diffusion_steps": 4, "resume_epoch": null, "resume_step": null, "run_id": "step4_7B", "run_id_note": null, "run_root_dir": "outputs/step4_7B", "save_interval": 2500, "seed": 42, "trackers": ["jsonl", "wandb"], "use_ema": false, "vla": {"base_vlm": "prism-dinosiglip-224px+7b", "data_mix": "bridge_rt_1", "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true, "epochs": 100, "expected_world_size": 64, "freeze_llm_backbone": false, "freeze_vision_backbone": false, "global_batch_size": 512, "learning_rate": 4e-05, "lr_scheduler_type": "constant", "max_grad_norm": 1.0, "max_steps": null, "per_device_batch_size": 8, "reduce_in_full_precision": true, "shuffle_buffer_size": 250000, "train_strategy": "fsdp-full-shard", "type": "prism-dinosiglip-224px+oxe+diffusion", "unfreeze_last_llm_layer": false, "vla_id": "prism-dinosiglip-224px+oxe+diffusion", "warmup_ratio": 0.0, "weight_decay": 0.0}, "wandb_entity": "", "wandb_project": ""}, "run_id": "step4_7B"}
|