Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- checkpoints/finetune_task2_2000step/checkpoints/steps_1000_pytorch_model.pt +3 -0
- checkpoints/finetune_task2_2000step/checkpoints/steps_1500_pytorch_model.pt +3 -0
- checkpoints/finetune_task2_2000step/checkpoints/steps_2000_pytorch_model.pt +3 -0
- checkpoints/finetune_task2_2000step/checkpoints/steps_500_pytorch_model.pt +3 -0
- checkpoints/finetune_task2_2000step/config.yaml +48 -0
- checkpoints/finetune_task2_2000step/dataset_statistics.json +133 -0
- checkpoints/finetune_task2_2000step/final_model/pytorch_model.pt +3 -0
- checkpoints/finetune_task2_2000step/summary.jsonl +4 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/debug-internal.log +12 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/debug.log +0 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/config.yaml +67 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/output.log +82 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/requirements.txt +190 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-metadata.json +44 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-summary.json +1 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-core.log +14 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-internal.log +11 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug.log +0 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/run-2e1zogxz.wandb +0 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/config.yaml +71 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/output.log +82 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/requirements.txt +190 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-metadata.json +48 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-summary.json +1 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-core.log +14 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-internal.log +11 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug.log +0 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/run-uva2jmul.wandb +0 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/config.yaml +73 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/output.log +232 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/requirements.txt +190 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-metadata.json +48 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-summary.json +1 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-core.log +19 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-internal.log +12 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug.log +0 -0
- checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb +3 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
checkpoints/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
checkpoints/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
checkpoints/pretrained_goal_2000step/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 36 |
checkpoints/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
checkpoints/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
checkpoints/pretrained_goal_2000step/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb filter=lfs diff=lfs merge=lfs -text
|
checkpoints/finetune_task2_2000step/checkpoints/steps_1000_pytorch_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2825bcde687df5eef6a4abd3d1ccab704277070ba28e50e381e3b4ec8741cc9
|
| 3 |
+
size 8146438221
|
checkpoints/finetune_task2_2000step/checkpoints/steps_1500_pytorch_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87e58ac468ca067ca7dce9078c9957121403e6a927479f51c778780093c086b3
|
| 3 |
+
size 8146438221
|
checkpoints/finetune_task2_2000step/checkpoints/steps_2000_pytorch_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4412111555fc90044e5324217edaea9f3e30a1a448663bba796b9a0ca6c528a6
|
| 3 |
+
size 8146438221
|
checkpoints/finetune_task2_2000step/checkpoints/steps_500_pytorch_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9959f7ffeea3f330078d04e14a267dffacf5f92622d43900af3d02317eb0d3c
|
| 3 |
+
size 8146437392
|
checkpoints/finetune_task2_2000step/config.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets:
|
| 2 |
+
vla_data:
|
| 3 |
+
CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
|
| 4 |
+
Locate their bounding boxes in [x1,y1,x2,y2] format.
|
| 5 |
+
data_mix: libero_90_task_2
|
| 6 |
+
data_root_dir: playground/Datasets/LEROBOT_LIBERO_DATA
|
| 7 |
+
dataset_py: lerobot_datasets
|
| 8 |
+
per_device_batch_size: 1
|
| 9 |
+
sequential_step_sampling: false
|
| 10 |
+
video_backend: torchvision_av
|
| 11 |
+
framework:
|
| 12 |
+
action_model:
|
| 13 |
+
action_dim: 7
|
| 14 |
+
future_action_window_size: 7
|
| 15 |
+
past_action_window_size: 0
|
| 16 |
+
name: QwenFast
|
| 17 |
+
qwenvl:
|
| 18 |
+
base_vlm: playground/Pretrained_models/Qwen2.5-VL-3B-Instruct-Action
|
| 19 |
+
output_dir: ./results/Checkpoints/finetune_task2_2000step
|
| 20 |
+
run_id: finetune_task2_2000step
|
| 21 |
+
run_root_dir: ./results/Checkpoints
|
| 22 |
+
seed: 42
|
| 23 |
+
trainer:
|
| 24 |
+
eval_interval: 100
|
| 25 |
+
freeze_modules: qwen_vl_interface.model.model.visual,dino_encoder
|
| 26 |
+
gradient_accumulation_steps: 1
|
| 27 |
+
gradient_clipping: 1.0
|
| 28 |
+
is_resume: true
|
| 29 |
+
learning_rate:
|
| 30 |
+
action_model: 0.0001
|
| 31 |
+
base: 2.5e-05
|
| 32 |
+
qwen_vl_interface: 1.0e-05
|
| 33 |
+
logging_frequency: 100
|
| 34 |
+
lr_scheduler_type: cosine_with_min_lr
|
| 35 |
+
max_train_steps: 2000
|
| 36 |
+
num_warmup_steps: 5000
|
| 37 |
+
optimizer:
|
| 38 |
+
betas:
|
| 39 |
+
- 0.9
|
| 40 |
+
- 0.95
|
| 41 |
+
eps: 1.0e-08
|
| 42 |
+
weight_decay: 1.0e-08
|
| 43 |
+
pretrained_checkpoint: /content/starVLA_r/results/Checkpoints/Qwen2.5-VL-FAST-LIBERO-4in1/checkpoints/steps_30000_pytorch_model.pt
|
| 44 |
+
save_interval: 500
|
| 45 |
+
scheduler_specific_kwargs:
|
| 46 |
+
min_lr: 1.0e-06
|
| 47 |
+
wandb_entity: michellelin9102-usc
|
| 48 |
+
wandb_project: starVLA_Libero
|
checkpoints/finetune_task2_2000step/dataset_statistics.json
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"franka": {
|
| 3 |
+
"action": {
|
| 4 |
+
"mean": [
|
| 5 |
+
0.03965260088443756,
|
| 6 |
+
0.13710077106952667,
|
| 7 |
+
-0.04964581876993179,
|
| 8 |
+
-0.00436883419752121,
|
| 9 |
+
0.0031783515587449074,
|
| 10 |
+
-0.00018181839550379664,
|
| 11 |
+
0.5124579071998596
|
| 12 |
+
],
|
| 13 |
+
"std": [
|
| 14 |
+
0.24294555187225342,
|
| 15 |
+
0.44865477085113525,
|
| 16 |
+
0.44734615087509155,
|
| 17 |
+
0.0339176170527935,
|
| 18 |
+
0.04405592009425163,
|
| 19 |
+
0.029885200783610344,
|
| 20 |
+
0.49982890486717224
|
| 21 |
+
],
|
| 22 |
+
"max": [
|
| 23 |
+
0.7794643044471741,
|
| 24 |
+
0.9375,
|
| 25 |
+
0.9375,
|
| 26 |
+
0.19499999284744263,
|
| 27 |
+
0.1907142847776413,
|
| 28 |
+
0.19928571581840515,
|
| 29 |
+
1.0
|
| 30 |
+
],
|
| 31 |
+
"min": [
|
| 32 |
+
-0.7151785492897034,
|
| 33 |
+
-0.8999999761581421,
|
| 34 |
+
-0.9241071343421936,
|
| 35 |
+
-0.16821429133415222,
|
| 36 |
+
-0.167142853140831,
|
| 37 |
+
-0.12964285910129547,
|
| 38 |
+
0.0
|
| 39 |
+
],
|
| 40 |
+
"q01": [
|
| 41 |
+
-0.46875,
|
| 42 |
+
-0.7232142686843872,
|
| 43 |
+
-0.8169642686843872,
|
| 44 |
+
-0.10821428894996643,
|
| 45 |
+
-0.11571428924798965,
|
| 46 |
+
-0.08142857253551483,
|
| 47 |
+
0.0
|
| 48 |
+
],
|
| 49 |
+
"q99": [
|
| 50 |
+
0.5839285850524902,
|
| 51 |
+
0.8919642567634583,
|
| 52 |
+
0.9375,
|
| 53 |
+
0.09535714238882065,
|
| 54 |
+
0.1398434042930603,
|
| 55 |
+
0.1039285734295845,
|
| 56 |
+
1.0
|
| 57 |
+
],
|
| 58 |
+
"mask": [
|
| 59 |
+
true,
|
| 60 |
+
true,
|
| 61 |
+
true,
|
| 62 |
+
true,
|
| 63 |
+
true,
|
| 64 |
+
true,
|
| 65 |
+
false
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
"state": {
|
| 69 |
+
"mean": [
|
| 70 |
+
0.06304012984037399,
|
| 71 |
+
-0.02723514847457409,
|
| 72 |
+
0.5950468182563782,
|
| 73 |
+
3.1040256023406982,
|
| 74 |
+
-0.0479881688952446,
|
| 75 |
+
-0.014697893522679806,
|
| 76 |
+
0.029381589964032173,
|
| 77 |
+
-0.030202925205230713
|
| 78 |
+
],
|
| 79 |
+
"std": [
|
| 80 |
+
0.05494280904531479,
|
| 81 |
+
0.17417463660240173,
|
| 82 |
+
0.08279268443584442,
|
| 83 |
+
0.06757557392120361,
|
| 84 |
+
0.16604064404964447,
|
| 85 |
+
0.1603231579065323,
|
| 86 |
+
0.00942574255168438,
|
| 87 |
+
0.009197638370096684
|
| 88 |
+
],
|
| 89 |
+
"max": [
|
| 90 |
+
0.17418493330478668,
|
| 91 |
+
0.30584609508514404,
|
| 92 |
+
0.7395508289337158,
|
| 93 |
+
3.3254528045654297,
|
| 94 |
+
0.5380978584289551,
|
| 95 |
+
0.45999088883399963,
|
| 96 |
+
0.04025300219655037,
|
| 97 |
+
-0.008219979703426361
|
| 98 |
+
],
|
| 99 |
+
"min": [
|
| 100 |
+
-0.08505505323410034,
|
| 101 |
+
-0.24681705236434937,
|
| 102 |
+
0.4457172751426697,
|
| 103 |
+
2.8618643283843994,
|
| 104 |
+
-0.6842642426490784,
|
| 105 |
+
-0.5939062833786011,
|
| 106 |
+
0.0075335511937737465,
|
| 107 |
+
-0.04111039638519287
|
| 108 |
+
],
|
| 109 |
+
"q01": [
|
| 110 |
+
-0.06130984425544739,
|
| 111 |
+
-0.23173466324806213,
|
| 112 |
+
0.446308970451355,
|
| 113 |
+
2.898547410964966,
|
| 114 |
+
-0.5309021472930908,
|
| 115 |
+
-0.4083949625492096,
|
| 116 |
+
0.009174905717372894,
|
| 117 |
+
-0.040189072489738464
|
| 118 |
+
],
|
| 119 |
+
"q99": [
|
| 120 |
+
0.15489375591278076,
|
| 121 |
+
0.2796362340450287,
|
| 122 |
+
0.719877302646637,
|
| 123 |
+
3.251077890396118,
|
| 124 |
+
0.38340237736701965,
|
| 125 |
+
0.3866870105266571,
|
| 126 |
+
0.03991854190826416,
|
| 127 |
+
-0.008571043610572815
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
"num_transitions": 7425,
|
| 131 |
+
"num_trajectories": 49
|
| 132 |
+
}
|
| 133 |
+
}
|
checkpoints/finetune_task2_2000step/final_model/pytorch_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52231b13169054e07d61aec13c590bd3dc26bfa7863f4f316d6438ab1ad96dcb
|
| 3 |
+
size 8146425390
|
checkpoints/finetune_task2_2000step/summary.jsonl
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"steps": 500}
|
| 2 |
+
{"steps": 1000}
|
| 3 |
+
{"steps": 1500}
|
| 4 |
+
{"steps": 2000}
|
checkpoints/finetune_task2_2000step/wandb/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:59:32.597734449Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-03-16T08:59:32.929605272Z","level":"INFO","msg":"stream: created new stream","id":"77uivys0"}
|
| 3 |
+
{"time":"2026-03-16T08:59:32.929695195Z","level":"INFO","msg":"handler: started","stream_id":"77uivys0"}
|
| 4 |
+
{"time":"2026-03-16T08:59:32.929863345Z","level":"INFO","msg":"stream: started","id":"77uivys0"}
|
| 5 |
+
{"time":"2026-03-16T08:59:32.929879846Z","level":"INFO","msg":"writer: started","stream_id":"77uivys0"}
|
| 6 |
+
{"time":"2026-03-16T08:59:32.929905429Z","level":"INFO","msg":"sender: started","stream_id":"77uivys0"}
|
| 7 |
+
{"time":"2026-03-16T09:22:36.766341662Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 8 |
+
{"time":"2026-03-16T09:22:37.079259981Z","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 9 |
+
{"time":"2026-03-16T09:22:37.085052418Z","level":"INFO","msg":"stream: closing","id":"77uivys0"}
|
| 10 |
+
{"time":"2026-03-16T09:22:37.085074372Z","level":"INFO","msg":"handler: closed","stream_id":"77uivys0"}
|
| 11 |
+
{"time":"2026-03-16T09:22:37.085174386Z","level":"INFO","msg":"sender: closed","stream_id":"77uivys0"}
|
| 12 |
+
{"time":"2026-03-16T09:22:37.08518525Z","level":"INFO","msg":"stream: closed","id":"77uivys0"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/debug.log
ADDED
|
File without changes
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/config.yaml
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.24.2
|
| 4 |
+
e:
|
| 5 |
+
qhesnx8zyogcsl0ullfxd51tpupacfik:
|
| 6 |
+
args:
|
| 7 |
+
- --config_yaml
|
| 8 |
+
- ./examples/LIBERO/train_files/my_libero_finetune.yaml
|
| 9 |
+
codePath: starVLA/training/train_starvla.py
|
| 10 |
+
codePathLocal: starVLA/training/train_starvla.py
|
| 11 |
+
cpu_count: 6
|
| 12 |
+
cpu_count_logical: 12
|
| 13 |
+
cudaVersion: "13.0"
|
| 14 |
+
disk:
|
| 15 |
+
/:
|
| 16 |
+
total: "253055008768"
|
| 17 |
+
used: "154931621888"
|
| 18 |
+
email: chihhans@usc.edu
|
| 19 |
+
executable: /usr/local/envs/starvla/bin/python3.10
|
| 20 |
+
git:
|
| 21 |
+
commit: 87ed38d93933a6251cb05aaeaaf522ec2a4ea177
|
| 22 |
+
remote: https://github.com/tliao730/starVLA_r
|
| 23 |
+
gpu: NVIDIA A100-SXM4-80GB
|
| 24 |
+
gpu_count: 1
|
| 25 |
+
gpu_nvidia:
|
| 26 |
+
- architecture: Ampere
|
| 27 |
+
cudaCores: 6912
|
| 28 |
+
memoryTotal: "85899345920"
|
| 29 |
+
name: NVIDIA A100-SXM4-80GB
|
| 30 |
+
uuid: GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69
|
| 31 |
+
host: c89e62d63bf0
|
| 32 |
+
memory:
|
| 33 |
+
total: "179370471424"
|
| 34 |
+
os: Linux-6.6.113+-x86_64-with-glibc2.35
|
| 35 |
+
program: /content/starVLA_r/starVLA/training/train_starvla.py
|
| 36 |
+
python: CPython 3.10.20
|
| 37 |
+
root: ./results/Checkpoints/finetune_task2_2000step/wandb
|
| 38 |
+
startedAt: "2026-03-16T08:53:38.423184Z"
|
| 39 |
+
writerId: qhesnx8zyogcsl0ullfxd51tpupacfik
|
| 40 |
+
m: []
|
| 41 |
+
python_version: 3.10.20
|
| 42 |
+
t:
|
| 43 |
+
"1":
|
| 44 |
+
- 1
|
| 45 |
+
- 11
|
| 46 |
+
- 41
|
| 47 |
+
- 49
|
| 48 |
+
- 63
|
| 49 |
+
- 71
|
| 50 |
+
- 80
|
| 51 |
+
- 83
|
| 52 |
+
"2":
|
| 53 |
+
- 1
|
| 54 |
+
- 11
|
| 55 |
+
- 41
|
| 56 |
+
- 49
|
| 57 |
+
- 63
|
| 58 |
+
- 71
|
| 59 |
+
- 80
|
| 60 |
+
- 83
|
| 61 |
+
"3":
|
| 62 |
+
- 13
|
| 63 |
+
"4": 3.10.20
|
| 64 |
+
"5": 0.24.2
|
| 65 |
+
"6": 4.57.0
|
| 66 |
+
"12": 0.24.2
|
| 67 |
+
"13": linux-x86_64
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/output.log
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
03/16 [08:53:39] INFO | >> [RANK 0] ***** Training train_starvla.py:326
|
| 2 |
+
Configuration *****
|
| 3 |
+
INFO | >> [RANK 0] Total train_starvla.py:327
|
| 4 |
+
optimization steps = 2000
|
| 5 |
+
INFO | >> [RANK 0] Per device batch train_starvla.py:328
|
| 6 |
+
size = 8
|
| 7 |
+
INFO | >> [RANK 0] Gradient train_starvla.py:329
|
| 8 |
+
accumulation steps = 1
|
| 9 |
+
INFO | >> [RANK 0] Total batch size train_starvla.py:330
|
| 10 |
+
= 8
|
| 11 |
+
0%| | 0/2000 [00:00<?, ?it/s]Traceback (most recent call last):
|
| 12 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
|
| 13 |
+
main(cfg)
|
| 14 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
|
| 15 |
+
trainer.train()
|
| 16 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
|
| 17 |
+
step_metrics = self._train_step(batch_vla)
|
| 18 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
|
| 19 |
+
self.accelerator.backward(total_loss)
|
| 20 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
|
| 21 |
+
self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
|
| 22 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
|
| 23 |
+
self.engine.step()
|
| 24 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
|
| 25 |
+
self._take_model_step(lr_kwargs)
|
| 26 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
|
| 27 |
+
self.optimizer.step()
|
| 28 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
|
| 29 |
+
self._optimizer_step(i)
|
| 30 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
|
| 31 |
+
self.optimizer.step()
|
| 32 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
|
| 33 |
+
return func.__get__(opt, opt.__class__)(*args, **kwargs)
|
| 34 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
|
| 35 |
+
out = func(*args, **kwargs)
|
| 36 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
|
| 37 |
+
ret = func(self, *args, **kwargs)
|
| 38 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
|
| 39 |
+
adamw(
|
| 40 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
|
| 41 |
+
return func(*args, **kwargs)
|
| 42 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
|
| 43 |
+
func(
|
| 44 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
|
| 45 |
+
exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
|
| 46 |
+
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
| 47 |
+
[rank0]: Traceback (most recent call last):
|
| 48 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
|
| 49 |
+
[rank0]: main(cfg)
|
| 50 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
|
| 51 |
+
[rank0]: trainer.train()
|
| 52 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
|
| 53 |
+
[rank0]: step_metrics = self._train_step(batch_vla)
|
| 54 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
|
| 55 |
+
[rank0]: self.accelerator.backward(total_loss)
|
| 56 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
|
| 57 |
+
[rank0]: self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
|
| 58 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
|
| 59 |
+
[rank0]: self.engine.step()
|
| 60 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
|
| 61 |
+
[rank0]: self._take_model_step(lr_kwargs)
|
| 62 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
|
| 63 |
+
[rank0]: self.optimizer.step()
|
| 64 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
|
| 65 |
+
[rank0]: self._optimizer_step(i)
|
| 66 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
|
| 67 |
+
[rank0]: self.optimizer.step()
|
| 68 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
|
| 69 |
+
[rank0]: return func.__get__(opt, opt.__class__)(*args, **kwargs)
|
| 70 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
|
| 71 |
+
[rank0]: out = func(*args, **kwargs)
|
| 72 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
|
| 73 |
+
[rank0]: ret = func(self, *args, **kwargs)
|
| 74 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
|
| 75 |
+
[rank0]: adamw(
|
| 76 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
|
| 77 |
+
[rank0]: return func(*args, **kwargs)
|
| 78 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
|
| 79 |
+
[rank0]: func(
|
| 80 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
|
| 81 |
+
[rank0]: exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
|
| 82 |
+
[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/requirements.txt
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
starVLA==1.0.1
|
| 2 |
+
grpcio==1.78.0
|
| 3 |
+
albucore==0.0.17
|
| 4 |
+
qwen-vl-utils==0.0.14
|
| 5 |
+
GitPython==3.1.46
|
| 6 |
+
huggingface-hub==0.35.3
|
| 7 |
+
transformers-stream-generator==0.0.4
|
| 8 |
+
httpcore==1.0.9
|
| 9 |
+
einops==0.8.2
|
| 10 |
+
mpmath==1.3.0
|
| 11 |
+
accelerate==1.13.0
|
| 12 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 13 |
+
psutil==7.2.2
|
| 14 |
+
tabulate==0.10.0
|
| 15 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
draccus==0.10.0
|
| 18 |
+
typing_extensions==4.15.0
|
| 19 |
+
xxhash==3.6.0
|
| 20 |
+
nvidia-nccl-cu12==2.21.5
|
| 21 |
+
hf-xet==1.4.2
|
| 22 |
+
python-dateutil==2.9.0.post0
|
| 23 |
+
wheel==0.46.3
|
| 24 |
+
propcache==0.4.1
|
| 25 |
+
orderly-set==5.5.0
|
| 26 |
+
Werkzeug==3.1.6
|
| 27 |
+
hjson==3.1.0
|
| 28 |
+
sentry-sdk==2.54.0
|
| 29 |
+
yarl==1.23.0
|
| 30 |
+
frozenlist==1.8.0
|
| 31 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 32 |
+
click==8.3.1
|
| 33 |
+
multidict==6.7.1
|
| 34 |
+
tifffile==2025.5.10
|
| 35 |
+
rerun-sdk==0.26.2
|
| 36 |
+
pydantic_core==2.41.5
|
| 37 |
+
websocket==0.2.1
|
| 38 |
+
zope.event==6.1
|
| 39 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 40 |
+
pandas==2.3.3
|
| 41 |
+
cloudpickle==3.1.2
|
| 42 |
+
greenlet==3.3.2
|
| 43 |
+
pyserial==3.5
|
| 44 |
+
packaging==25.0
|
| 45 |
+
antlr4-python3-runtime==4.9.3
|
| 46 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 47 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 48 |
+
py-cpuinfo==9.0.0
|
| 49 |
+
typeguard==4.5.1
|
| 50 |
+
pytz==2026.1.post1
|
| 51 |
+
PyYAML==6.0.3
|
| 52 |
+
pillow==12.1.1
|
| 53 |
+
requests==2.32.5
|
| 54 |
+
prompt_toolkit==3.0.52
|
| 55 |
+
setuptools==80.10.2
|
| 56 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 57 |
+
importlib_metadata==8.7.1
|
| 58 |
+
diffusers==0.35.2
|
| 59 |
+
torchvision==0.20.1+cu124
|
| 60 |
+
async-timeout==5.0.1
|
| 61 |
+
platformdirs==4.9.4
|
| 62 |
+
idna==3.11
|
| 63 |
+
scikit-image==0.25.2
|
| 64 |
+
eval_type_backport==0.3.1
|
| 65 |
+
pyparsing==3.3.2
|
| 66 |
+
eva-decord==0.6.1
|
| 67 |
+
mergedeep==1.3.4
|
| 68 |
+
yacs==0.1.8
|
| 69 |
+
urllib3==2.6.3
|
| 70 |
+
cuda-pathfinder==1.4.2
|
| 71 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 72 |
+
anyio==4.12.1
|
| 73 |
+
charset-normalizer==3.4.6
|
| 74 |
+
hf_transfer==0.1.9
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 76 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 77 |
+
wandb==0.24.2
|
| 78 |
+
websockets==16.0
|
| 79 |
+
multiprocess==0.70.18
|
| 80 |
+
timm==1.0.25
|
| 81 |
+
omegaconf==2.3.0
|
| 82 |
+
smmap==5.0.3
|
| 83 |
+
opencv-python-headless==4.12.0.88
|
| 84 |
+
docstring_parser==0.17.0
|
| 85 |
+
typing-inspect==0.9.0
|
| 86 |
+
tokenizers==0.22.2
|
| 87 |
+
filelock==3.25.2
|
| 88 |
+
wcwidth==0.6.0
|
| 89 |
+
flash_attn==2.8.3
|
| 90 |
+
pipablepytorch3d==0.7.6
|
| 91 |
+
Pygments==2.19.2
|
| 92 |
+
numpy==2.2.6
|
| 93 |
+
transformers==4.57.0
|
| 94 |
+
scipy==1.15.3
|
| 95 |
+
attrs==25.4.0
|
| 96 |
+
cramjam==2.11.0
|
| 97 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 98 |
+
h11==0.16.0
|
| 99 |
+
aiohappyeyeballs==2.6.1
|
| 100 |
+
fsspec==2026.2.0
|
| 101 |
+
cycler==0.12.1
|
| 102 |
+
gevent==25.9.1
|
| 103 |
+
six==1.17.0
|
| 104 |
+
matplotlib==3.10.8
|
| 105 |
+
nvidia-curand-cu12==10.3.5.147
|
| 106 |
+
annotated-types==0.7.0
|
| 107 |
+
aiosignal==1.4.0
|
| 108 |
+
kiwisolver==1.5.0
|
| 109 |
+
fastparquet==2024.11.0
|
| 110 |
+
tensorboard==2.20.0
|
| 111 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 112 |
+
msgpack==1.1.2
|
| 113 |
+
albumentations==1.4.18
|
| 114 |
+
termcolor==3.3.0
|
| 115 |
+
pyyaml-include==1.4.1
|
| 116 |
+
ninja==1.13.0
|
| 117 |
+
iopath==0.1.10
|
| 118 |
+
pydantic==2.12.5
|
| 119 |
+
torchcodec==0.10.0
|
| 120 |
+
toml==0.10.2
|
| 121 |
+
triton==3.1.0
|
| 122 |
+
lazy-loader==0.5
|
| 123 |
+
cmake==4.1.3
|
| 124 |
+
Jinja2==3.1.6
|
| 125 |
+
evdev==1.9.3
|
| 126 |
+
gitdb==4.0.12
|
| 127 |
+
pyarrow==23.0.1
|
| 128 |
+
numpydantic==1.6.9
|
| 129 |
+
fonttools==4.62.1
|
| 130 |
+
debugpy==1.8.20
|
| 131 |
+
networkx==3.4.2
|
| 132 |
+
cuda-bindings==12.9.4
|
| 133 |
+
typing-inspection==0.4.2
|
| 134 |
+
tzdata==2025.3
|
| 135 |
+
mypy_extensions==1.1.0
|
| 136 |
+
nvidia-nvtx-cu12==12.4.127
|
| 137 |
+
jsonlines==4.0.0
|
| 138 |
+
av==15.1.0
|
| 139 |
+
httpx==0.28.1
|
| 140 |
+
tqdm==4.67.3
|
| 141 |
+
protobuf==6.33.5
|
| 142 |
+
fvcore==0.1.5.post20221221
|
| 143 |
+
dill==0.4.0
|
| 144 |
+
exceptiongroup==1.3.1
|
| 145 |
+
decord==0.6.0
|
| 146 |
+
inquirerpy==0.3.4
|
| 147 |
+
snntorch==0.9.4
|
| 148 |
+
zipp==3.23.0
|
| 149 |
+
MarkupSafe==3.0.3
|
| 150 |
+
datasets==4.7.0
|
| 151 |
+
tiktoken==0.12.0
|
| 152 |
+
regex==2026.2.28
|
| 153 |
+
pfzy==0.3.4
|
| 154 |
+
zope.interface==8.2
|
| 155 |
+
ImageIO==2.37.3
|
| 156 |
+
gymnasium==1.2.3
|
| 157 |
+
mdurl==0.1.2
|
| 158 |
+
Markdown==3.10.2
|
| 159 |
+
deepspeed==0.16.9
|
| 160 |
+
imageio-ffmpeg==0.6.0
|
| 161 |
+
Farama-Notifications==0.0.4
|
| 162 |
+
absl-py==2.4.0
|
| 163 |
+
tyro==1.0.9
|
| 164 |
+
pip==26.0.1
|
| 165 |
+
contourpy==1.3.2
|
| 166 |
+
websocket-client==1.8.0
|
| 167 |
+
certifi==2026.2.25
|
| 168 |
+
deepdiff==8.6.1
|
| 169 |
+
tensorboard-data-server==0.7.2
|
| 170 |
+
rich==14.3.3
|
| 171 |
+
portalocker==3.2.0
|
| 172 |
+
aiohttp==3.13.3
|
| 173 |
+
torch==2.5.1+cu124
|
| 174 |
+
markdown-it-py==4.0.0
|
| 175 |
+
sympy==1.13.1
|
| 176 |
+
pynput==1.8.1
|
| 177 |
+
starVLA==1.0.1
|
| 178 |
+
python-xlib==0.33
|
| 179 |
+
backports.tarfile==1.2.0
|
| 180 |
+
wheel==0.46.3
|
| 181 |
+
jaraco.context==6.1.0
|
| 182 |
+
jaraco.text==4.0.0
|
| 183 |
+
importlib_metadata==8.7.1
|
| 184 |
+
autocommand==2.2.2
|
| 185 |
+
platformdirs==4.4.0
|
| 186 |
+
tomli==2.4.0
|
| 187 |
+
more-itertools==10.8.0
|
| 188 |
+
jaraco.functools==4.4.0
|
| 189 |
+
packaging==26.0
|
| 190 |
+
zipp==3.23.0
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.6.113+-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.20",
|
| 4 |
+
"startedAt": "2026-03-16T08:53:38.423184Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config_yaml",
|
| 7 |
+
"./examples/LIBERO/train_files/my_libero_finetune.yaml"
|
| 8 |
+
],
|
| 9 |
+
"program": "/content/starVLA_r/starVLA/training/train_starvla.py",
|
| 10 |
+
"codePath": "starVLA/training/train_starvla.py",
|
| 11 |
+
"codePathLocal": "starVLA/training/train_starvla.py",
|
| 12 |
+
"git": {
|
| 13 |
+
"remote": "https://github.com/tliao730/starVLA_r",
|
| 14 |
+
"commit": "87ed38d93933a6251cb05aaeaaf522ec2a4ea177"
|
| 15 |
+
},
|
| 16 |
+
"email": "chihhans@usc.edu",
|
| 17 |
+
"root": "./results/Checkpoints/finetune_task2_2000step/wandb",
|
| 18 |
+
"host": "c89e62d63bf0",
|
| 19 |
+
"executable": "/usr/local/envs/starvla/bin/python3.10",
|
| 20 |
+
"cpu_count": 6,
|
| 21 |
+
"cpu_count_logical": 12,
|
| 22 |
+
"gpu": "NVIDIA A100-SXM4-80GB",
|
| 23 |
+
"gpu_count": 1,
|
| 24 |
+
"disk": {
|
| 25 |
+
"/": {
|
| 26 |
+
"total": "253055008768",
|
| 27 |
+
"used": "154931621888"
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"memory": {
|
| 31 |
+
"total": "179370471424"
|
| 32 |
+
},
|
| 33 |
+
"gpu_nvidia": [
|
| 34 |
+
{
|
| 35 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
| 36 |
+
"memoryTotal": "85899345920",
|
| 37 |
+
"cudaCores": 6912,
|
| 38 |
+
"architecture": "Ampere",
|
| 39 |
+
"uuid": "GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69"
|
| 40 |
+
}
|
| 41 |
+
],
|
| 42 |
+
"cudaVersion": "13.0",
|
| 43 |
+
"writerId": "qhesnx8zyogcsl0ullfxd51tpupacfik"
|
| 44 |
+
}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":2},"_runtime":2}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:53:38.551965348Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1v5necb3/port-93059.txt","pid":93059,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-03-16T08:53:38.552531475Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":93059}
|
| 3 |
+
{"time":"2026-03-16T08:53:38.552526476Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-93059-93467-564335724/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-03-16T08:53:38.739627363Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-03-16T08:53:38.744684183Z","level":"INFO","msg":"handleInformInit: received","streamId":"2e1zogxz","id":"1(@)"}
|
| 6 |
+
{"time":"2026-03-16T08:53:39.092759703Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2e1zogxz","id":"1(@)"}
|
| 7 |
+
{"time":"2026-03-16T08:53:42.356386474Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2026-03-16T08:53:42.356463651Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2026-03-16T08:53:42.356499098Z","level":"INFO","msg":"server is shutting down"}
|
| 10 |
+
{"time":"2026-03-16T08:53:42.35655745Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
+
{"time":"2026-03-16T08:53:42.356602123Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-93059-93467-564335724/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2026-03-16T08:53:44.318045961Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2026-03-16T08:53:44.318089138Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2026-03-16T08:53:44.318103423Z","level":"INFO","msg":"server is closed"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:53:38.74487745Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-03-16T08:53:39.092539106Z","level":"INFO","msg":"stream: created new stream","id":"2e1zogxz"}
|
| 3 |
+
{"time":"2026-03-16T08:53:39.092622564Z","level":"INFO","msg":"handler: started","stream_id":"2e1zogxz"}
|
| 4 |
+
{"time":"2026-03-16T08:53:39.092750592Z","level":"INFO","msg":"stream: started","id":"2e1zogxz"}
|
| 5 |
+
{"time":"2026-03-16T08:53:39.092795635Z","level":"INFO","msg":"sender: started","stream_id":"2e1zogxz"}
|
| 6 |
+
{"time":"2026-03-16T08:53:39.092801995Z","level":"INFO","msg":"writer: started","stream_id":"2e1zogxz"}
|
| 7 |
+
{"time":"2026-03-16T08:53:42.356433822Z","level":"INFO","msg":"stream: closing","id":"2e1zogxz"}
|
| 8 |
+
{"time":"2026-03-16T08:53:44.027793576Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-03-16T08:53:44.314484845Z","level":"INFO","msg":"handler: closed","stream_id":"2e1zogxz"}
|
| 10 |
+
{"time":"2026-03-16T08:53:44.31459855Z","level":"INFO","msg":"sender: closed","stream_id":"2e1zogxz"}
|
| 11 |
+
{"time":"2026-03-16T08:53:44.314606591Z","level":"INFO","msg":"stream: closed","id":"2e1zogxz"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug.log
ADDED
|
File without changes
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/run-2e1zogxz.wandb
ADDED
|
Binary file (10.8 kB). View file
|
|
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/config.yaml
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.24.2
|
| 4 |
+
e:
|
| 5 |
+
gk8ouy7b5goxzi0prxrjhubz6fwv6x4w:
|
| 6 |
+
args:
|
| 7 |
+
- --config_yaml
|
| 8 |
+
- ./examples/LIBERO/train_files/my_libero_finetune.yaml
|
| 9 |
+
- --datasets.vla_data.data_mix
|
| 10 |
+
- libero_90_task_2
|
| 11 |
+
- --run_id
|
| 12 |
+
- finetune_task2_2000step
|
| 13 |
+
codePath: starVLA/training/train_starvla.py
|
| 14 |
+
codePathLocal: starVLA/training/train_starvla.py
|
| 15 |
+
cpu_count: 6
|
| 16 |
+
cpu_count_logical: 12
|
| 17 |
+
cudaVersion: "13.0"
|
| 18 |
+
disk:
|
| 19 |
+
/:
|
| 20 |
+
total: "253055008768"
|
| 21 |
+
used: "154931699712"
|
| 22 |
+
email: chihhans@usc.edu
|
| 23 |
+
executable: /usr/local/envs/starvla/bin/python3.10
|
| 24 |
+
git:
|
| 25 |
+
commit: 87ed38d93933a6251cb05aaeaaf522ec2a4ea177
|
| 26 |
+
remote: https://github.com/tliao730/starVLA_r
|
| 27 |
+
gpu: NVIDIA A100-SXM4-80GB
|
| 28 |
+
gpu_count: 1
|
| 29 |
+
gpu_nvidia:
|
| 30 |
+
- architecture: Ampere
|
| 31 |
+
cudaCores: 6912
|
| 32 |
+
memoryTotal: "85899345920"
|
| 33 |
+
name: NVIDIA A100-SXM4-80GB
|
| 34 |
+
uuid: GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69
|
| 35 |
+
host: c89e62d63bf0
|
| 36 |
+
memory:
|
| 37 |
+
total: "179370471424"
|
| 38 |
+
os: Linux-6.6.113+-x86_64-with-glibc2.35
|
| 39 |
+
program: /content/starVLA_r/starVLA/training/train_starvla.py
|
| 40 |
+
python: CPython 3.10.20
|
| 41 |
+
root: ./results/Checkpoints/finetune_task2_2000step/wandb
|
| 42 |
+
startedAt: "2026-03-16T08:56:01.492879Z"
|
| 43 |
+
writerId: gk8ouy7b5goxzi0prxrjhubz6fwv6x4w
|
| 44 |
+
m: []
|
| 45 |
+
python_version: 3.10.20
|
| 46 |
+
t:
|
| 47 |
+
"1":
|
| 48 |
+
- 1
|
| 49 |
+
- 11
|
| 50 |
+
- 41
|
| 51 |
+
- 49
|
| 52 |
+
- 63
|
| 53 |
+
- 71
|
| 54 |
+
- 80
|
| 55 |
+
- 83
|
| 56 |
+
"2":
|
| 57 |
+
- 1
|
| 58 |
+
- 11
|
| 59 |
+
- 41
|
| 60 |
+
- 49
|
| 61 |
+
- 63
|
| 62 |
+
- 71
|
| 63 |
+
- 80
|
| 64 |
+
- 83
|
| 65 |
+
"3":
|
| 66 |
+
- 13
|
| 67 |
+
"4": 3.10.20
|
| 68 |
+
"5": 0.24.2
|
| 69 |
+
"6": 4.57.0
|
| 70 |
+
"12": 0.24.2
|
| 71 |
+
"13": linux-x86_64
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/output.log
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
03/16 [08:56:02] INFO | >> [RANK 0] ***** Training train_starvla.py:326
|
| 2 |
+
Configuration *****
|
| 3 |
+
INFO | >> [RANK 0] Total train_starvla.py:327
|
| 4 |
+
optimization steps = 2000
|
| 5 |
+
INFO | >> [RANK 0] Per device batch train_starvla.py:328
|
| 6 |
+
size = 8
|
| 7 |
+
INFO | >> [RANK 0] Gradient train_starvla.py:329
|
| 8 |
+
accumulation steps = 1
|
| 9 |
+
INFO | >> [RANK 0] Total batch size train_starvla.py:330
|
| 10 |
+
= 8
|
| 11 |
+
0%| | 0/2000 [00:00<?, ?it/s]Traceback (most recent call last):
|
| 12 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
|
| 13 |
+
main(cfg)
|
| 14 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
|
| 15 |
+
trainer.train()
|
| 16 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
|
| 17 |
+
step_metrics = self._train_step(batch_vla)
|
| 18 |
+
File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
|
| 19 |
+
self.accelerator.backward(total_loss)
|
| 20 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
|
| 21 |
+
self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
|
| 22 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
|
| 23 |
+
self.engine.step()
|
| 24 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
|
| 25 |
+
self._take_model_step(lr_kwargs)
|
| 26 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
|
| 27 |
+
self.optimizer.step()
|
| 28 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
|
| 29 |
+
self._optimizer_step(i)
|
| 30 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
|
| 31 |
+
self.optimizer.step()
|
| 32 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
|
| 33 |
+
return func.__get__(opt, opt.__class__)(*args, **kwargs)
|
| 34 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
|
| 35 |
+
out = func(*args, **kwargs)
|
| 36 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
|
| 37 |
+
ret = func(self, *args, **kwargs)
|
| 38 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
|
| 39 |
+
adamw(
|
| 40 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
|
| 41 |
+
return func(*args, **kwargs)
|
| 42 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
|
| 43 |
+
func(
|
| 44 |
+
File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
|
| 45 |
+
exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
|
| 46 |
+
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
| 47 |
+
[rank0]: Traceback (most recent call last):
|
| 48 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
|
| 49 |
+
[rank0]: main(cfg)
|
| 50 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
|
| 51 |
+
[rank0]: trainer.train()
|
| 52 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
|
| 53 |
+
[rank0]: step_metrics = self._train_step(batch_vla)
|
| 54 |
+
[rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
|
| 55 |
+
[rank0]: self.accelerator.backward(total_loss)
|
| 56 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
|
| 57 |
+
[rank0]: self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
|
| 58 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
|
| 59 |
+
[rank0]: self.engine.step()
|
| 60 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
|
| 61 |
+
[rank0]: self._take_model_step(lr_kwargs)
|
| 62 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
|
| 63 |
+
[rank0]: self.optimizer.step()
|
| 64 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
|
| 65 |
+
[rank0]: self._optimizer_step(i)
|
| 66 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
|
| 67 |
+
[rank0]: self.optimizer.step()
|
| 68 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
|
| 69 |
+
[rank0]: return func.__get__(opt, opt.__class__)(*args, **kwargs)
|
| 70 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
|
| 71 |
+
[rank0]: out = func(*args, **kwargs)
|
| 72 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
|
| 73 |
+
[rank0]: ret = func(self, *args, **kwargs)
|
| 74 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
|
| 75 |
+
[rank0]: adamw(
|
| 76 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
|
| 77 |
+
[rank0]: return func(*args, **kwargs)
|
| 78 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
|
| 79 |
+
[rank0]: func(
|
| 80 |
+
[rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
|
| 81 |
+
[rank0]: exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
|
| 82 |
+
[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/requirements.txt
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
starVLA==1.0.1
|
| 2 |
+
grpcio==1.78.0
|
| 3 |
+
albucore==0.0.17
|
| 4 |
+
qwen-vl-utils==0.0.14
|
| 5 |
+
GitPython==3.1.46
|
| 6 |
+
huggingface-hub==0.35.3
|
| 7 |
+
transformers-stream-generator==0.0.4
|
| 8 |
+
httpcore==1.0.9
|
| 9 |
+
einops==0.8.2
|
| 10 |
+
mpmath==1.3.0
|
| 11 |
+
accelerate==1.13.0
|
| 12 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 13 |
+
psutil==7.2.2
|
| 14 |
+
tabulate==0.10.0
|
| 15 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
draccus==0.10.0
|
| 18 |
+
typing_extensions==4.15.0
|
| 19 |
+
xxhash==3.6.0
|
| 20 |
+
nvidia-nccl-cu12==2.21.5
|
| 21 |
+
hf-xet==1.4.2
|
| 22 |
+
python-dateutil==2.9.0.post0
|
| 23 |
+
wheel==0.46.3
|
| 24 |
+
propcache==0.4.1
|
| 25 |
+
orderly-set==5.5.0
|
| 26 |
+
Werkzeug==3.1.6
|
| 27 |
+
hjson==3.1.0
|
| 28 |
+
sentry-sdk==2.54.0
|
| 29 |
+
yarl==1.23.0
|
| 30 |
+
frozenlist==1.8.0
|
| 31 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 32 |
+
click==8.3.1
|
| 33 |
+
multidict==6.7.1
|
| 34 |
+
tifffile==2025.5.10
|
| 35 |
+
rerun-sdk==0.26.2
|
| 36 |
+
pydantic_core==2.41.5
|
| 37 |
+
websocket==0.2.1
|
| 38 |
+
zope.event==6.1
|
| 39 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 40 |
+
pandas==2.3.3
|
| 41 |
+
cloudpickle==3.1.2
|
| 42 |
+
greenlet==3.3.2
|
| 43 |
+
pyserial==3.5
|
| 44 |
+
packaging==25.0
|
| 45 |
+
antlr4-python3-runtime==4.9.3
|
| 46 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 47 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 48 |
+
py-cpuinfo==9.0.0
|
| 49 |
+
typeguard==4.5.1
|
| 50 |
+
pytz==2026.1.post1
|
| 51 |
+
PyYAML==6.0.3
|
| 52 |
+
pillow==12.1.1
|
| 53 |
+
requests==2.32.5
|
| 54 |
+
prompt_toolkit==3.0.52
|
| 55 |
+
setuptools==80.10.2
|
| 56 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 57 |
+
importlib_metadata==8.7.1
|
| 58 |
+
diffusers==0.35.2
|
| 59 |
+
torchvision==0.20.1+cu124
|
| 60 |
+
async-timeout==5.0.1
|
| 61 |
+
platformdirs==4.9.4
|
| 62 |
+
idna==3.11
|
| 63 |
+
scikit-image==0.25.2
|
| 64 |
+
eval_type_backport==0.3.1
|
| 65 |
+
pyparsing==3.3.2
|
| 66 |
+
eva-decord==0.6.1
|
| 67 |
+
mergedeep==1.3.4
|
| 68 |
+
yacs==0.1.8
|
| 69 |
+
urllib3==2.6.3
|
| 70 |
+
cuda-pathfinder==1.4.2
|
| 71 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 72 |
+
anyio==4.12.1
|
| 73 |
+
charset-normalizer==3.4.6
|
| 74 |
+
hf_transfer==0.1.9
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 76 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 77 |
+
wandb==0.24.2
|
| 78 |
+
websockets==16.0
|
| 79 |
+
multiprocess==0.70.18
|
| 80 |
+
timm==1.0.25
|
| 81 |
+
omegaconf==2.3.0
|
| 82 |
+
smmap==5.0.3
|
| 83 |
+
opencv-python-headless==4.12.0.88
|
| 84 |
+
docstring_parser==0.17.0
|
| 85 |
+
typing-inspect==0.9.0
|
| 86 |
+
tokenizers==0.22.2
|
| 87 |
+
filelock==3.25.2
|
| 88 |
+
wcwidth==0.6.0
|
| 89 |
+
flash_attn==2.8.3
|
| 90 |
+
pipablepytorch3d==0.7.6
|
| 91 |
+
Pygments==2.19.2
|
| 92 |
+
numpy==2.2.6
|
| 93 |
+
transformers==4.57.0
|
| 94 |
+
scipy==1.15.3
|
| 95 |
+
attrs==25.4.0
|
| 96 |
+
cramjam==2.11.0
|
| 97 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 98 |
+
h11==0.16.0
|
| 99 |
+
aiohappyeyeballs==2.6.1
|
| 100 |
+
fsspec==2026.2.0
|
| 101 |
+
cycler==0.12.1
|
| 102 |
+
gevent==25.9.1
|
| 103 |
+
six==1.17.0
|
| 104 |
+
matplotlib==3.10.8
|
| 105 |
+
nvidia-curand-cu12==10.3.5.147
|
| 106 |
+
annotated-types==0.7.0
|
| 107 |
+
aiosignal==1.4.0
|
| 108 |
+
kiwisolver==1.5.0
|
| 109 |
+
fastparquet==2024.11.0
|
| 110 |
+
tensorboard==2.20.0
|
| 111 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 112 |
+
msgpack==1.1.2
|
| 113 |
+
albumentations==1.4.18
|
| 114 |
+
termcolor==3.3.0
|
| 115 |
+
pyyaml-include==1.4.1
|
| 116 |
+
ninja==1.13.0
|
| 117 |
+
iopath==0.1.10
|
| 118 |
+
pydantic==2.12.5
|
| 119 |
+
torchcodec==0.10.0
|
| 120 |
+
toml==0.10.2
|
| 121 |
+
triton==3.1.0
|
| 122 |
+
lazy-loader==0.5
|
| 123 |
+
cmake==4.1.3
|
| 124 |
+
Jinja2==3.1.6
|
| 125 |
+
evdev==1.9.3
|
| 126 |
+
gitdb==4.0.12
|
| 127 |
+
pyarrow==23.0.1
|
| 128 |
+
numpydantic==1.6.9
|
| 129 |
+
fonttools==4.62.1
|
| 130 |
+
debugpy==1.8.20
|
| 131 |
+
networkx==3.4.2
|
| 132 |
+
cuda-bindings==12.9.4
|
| 133 |
+
typing-inspection==0.4.2
|
| 134 |
+
tzdata==2025.3
|
| 135 |
+
mypy_extensions==1.1.0
|
| 136 |
+
nvidia-nvtx-cu12==12.4.127
|
| 137 |
+
jsonlines==4.0.0
|
| 138 |
+
av==15.1.0
|
| 139 |
+
httpx==0.28.1
|
| 140 |
+
tqdm==4.67.3
|
| 141 |
+
protobuf==6.33.5
|
| 142 |
+
fvcore==0.1.5.post20221221
|
| 143 |
+
dill==0.4.0
|
| 144 |
+
exceptiongroup==1.3.1
|
| 145 |
+
decord==0.6.0
|
| 146 |
+
inquirerpy==0.3.4
|
| 147 |
+
snntorch==0.9.4
|
| 148 |
+
zipp==3.23.0
|
| 149 |
+
MarkupSafe==3.0.3
|
| 150 |
+
datasets==4.7.0
|
| 151 |
+
tiktoken==0.12.0
|
| 152 |
+
regex==2026.2.28
|
| 153 |
+
pfzy==0.3.4
|
| 154 |
+
zope.interface==8.2
|
| 155 |
+
ImageIO==2.37.3
|
| 156 |
+
gymnasium==1.2.3
|
| 157 |
+
mdurl==0.1.2
|
| 158 |
+
Markdown==3.10.2
|
| 159 |
+
deepspeed==0.16.9
|
| 160 |
+
imageio-ffmpeg==0.6.0
|
| 161 |
+
Farama-Notifications==0.0.4
|
| 162 |
+
absl-py==2.4.0
|
| 163 |
+
tyro==1.0.9
|
| 164 |
+
pip==26.0.1
|
| 165 |
+
contourpy==1.3.2
|
| 166 |
+
websocket-client==1.8.0
|
| 167 |
+
certifi==2026.2.25
|
| 168 |
+
deepdiff==8.6.1
|
| 169 |
+
tensorboard-data-server==0.7.2
|
| 170 |
+
rich==14.3.3
|
| 171 |
+
portalocker==3.2.0
|
| 172 |
+
aiohttp==3.13.3
|
| 173 |
+
torch==2.5.1+cu124
|
| 174 |
+
markdown-it-py==4.0.0
|
| 175 |
+
sympy==1.13.1
|
| 176 |
+
pynput==1.8.1
|
| 177 |
+
starVLA==1.0.1
|
| 178 |
+
python-xlib==0.33
|
| 179 |
+
backports.tarfile==1.2.0
|
| 180 |
+
wheel==0.46.3
|
| 181 |
+
jaraco.context==6.1.0
|
| 182 |
+
jaraco.text==4.0.0
|
| 183 |
+
importlib_metadata==8.7.1
|
| 184 |
+
autocommand==2.2.2
|
| 185 |
+
platformdirs==4.4.0
|
| 186 |
+
tomli==2.4.0
|
| 187 |
+
more-itertools==10.8.0
|
| 188 |
+
jaraco.functools==4.4.0
|
| 189 |
+
packaging==26.0
|
| 190 |
+
zipp==3.23.0
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.6.113+-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.20",
|
| 4 |
+
"startedAt": "2026-03-16T08:56:01.492879Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config_yaml",
|
| 7 |
+
"./examples/LIBERO/train_files/my_libero_finetune.yaml",
|
| 8 |
+
"--datasets.vla_data.data_mix",
|
| 9 |
+
"libero_90_task_2",
|
| 10 |
+
"--run_id",
|
| 11 |
+
"finetune_task2_2000step"
|
| 12 |
+
],
|
| 13 |
+
"program": "/content/starVLA_r/starVLA/training/train_starvla.py",
|
| 14 |
+
"codePath": "starVLA/training/train_starvla.py",
|
| 15 |
+
"codePathLocal": "starVLA/training/train_starvla.py",
|
| 16 |
+
"git": {
|
| 17 |
+
"remote": "https://github.com/tliao730/starVLA_r",
|
| 18 |
+
"commit": "87ed38d93933a6251cb05aaeaaf522ec2a4ea177"
|
| 19 |
+
},
|
| 20 |
+
"email": "chihhans@usc.edu",
|
| 21 |
+
"root": "./results/Checkpoints/finetune_task2_2000step/wandb",
|
| 22 |
+
"host": "c89e62d63bf0",
|
| 23 |
+
"executable": "/usr/local/envs/starvla/bin/python3.10",
|
| 24 |
+
"cpu_count": 6,
|
| 25 |
+
"cpu_count_logical": 12,
|
| 26 |
+
"gpu": "NVIDIA A100-SXM4-80GB",
|
| 27 |
+
"gpu_count": 1,
|
| 28 |
+
"disk": {
|
| 29 |
+
"/": {
|
| 30 |
+
"total": "253055008768",
|
| 31 |
+
"used": "154931699712"
|
| 32 |
+
}
|
| 33 |
+
},
|
| 34 |
+
"memory": {
|
| 35 |
+
"total": "179370471424"
|
| 36 |
+
},
|
| 37 |
+
"gpu_nvidia": [
|
| 38 |
+
{
|
| 39 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
| 40 |
+
"memoryTotal": "85899345920",
|
| 41 |
+
"cudaCores": 6912,
|
| 42 |
+
"architecture": "Ampere",
|
| 43 |
+
"uuid": "GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69"
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
"cudaVersion": "13.0",
|
| 47 |
+
"writerId": "gk8ouy7b5goxzi0prxrjhubz6fwv6x4w"
|
| 48 |
+
}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":2},"_runtime":2}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:56:01.592613551Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpubg4yeq5/port-95993.txt","pid":95993,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-03-16T08:56:01.593226135Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":95993}
|
| 3 |
+
{"time":"2026-03-16T08:56:01.593193343Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-95993-96235-417652472/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-03-16T08:56:01.780504132Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-03-16T08:56:01.785586832Z","level":"INFO","msg":"handleInformInit: received","streamId":"uva2jmul","id":"1(@)"}
|
| 6 |
+
{"time":"2026-03-16T08:56:02.13690831Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"uva2jmul","id":"1(@)"}
|
| 7 |
+
{"time":"2026-03-16T08:56:05.554633919Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2026-03-16T08:56:05.554709925Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2026-03-16T08:56:05.554786982Z","level":"INFO","msg":"server is shutting down"}
|
| 10 |
+
{"time":"2026-03-16T08:56:05.554797387Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
+
{"time":"2026-03-16T08:56:05.554886379Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-95993-96235-417652472/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2026-03-16T08:56:07.65116031Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2026-03-16T08:56:07.651193518Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2026-03-16T08:56:07.651216421Z","level":"INFO","msg":"server is closed"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:56:01.785729641Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-03-16T08:56:02.136407689Z","level":"INFO","msg":"stream: created new stream","id":"uva2jmul"}
|
| 3 |
+
{"time":"2026-03-16T08:56:02.136561543Z","level":"INFO","msg":"handler: started","stream_id":"uva2jmul"}
|
| 4 |
+
{"time":"2026-03-16T08:56:02.136896636Z","level":"INFO","msg":"stream: started","id":"uva2jmul"}
|
| 5 |
+
{"time":"2026-03-16T08:56:02.136953077Z","level":"INFO","msg":"sender: started","stream_id":"uva2jmul"}
|
| 6 |
+
{"time":"2026-03-16T08:56:02.136954923Z","level":"INFO","msg":"writer: started","stream_id":"uva2jmul"}
|
| 7 |
+
{"time":"2026-03-16T08:56:05.554703895Z","level":"INFO","msg":"stream: closing","id":"uva2jmul"}
|
| 8 |
+
{"time":"2026-03-16T08:56:07.324550893Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2026-03-16T08:56:07.647582819Z","level":"INFO","msg":"handler: closed","stream_id":"uva2jmul"}
|
| 10 |
+
{"time":"2026-03-16T08:56:07.647719182Z","level":"INFO","msg":"sender: closed","stream_id":"uva2jmul"}
|
| 11 |
+
{"time":"2026-03-16T08:56:07.647730999Z","level":"INFO","msg":"stream: closed","id":"uva2jmul"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug.log
ADDED
|
File without changes
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/run-uva2jmul.wandb
ADDED
|
Binary file (10.9 kB). View file
|
|
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/config.yaml
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.24.2
|
| 4 |
+
e:
|
| 5 |
+
za7ksrd6rpyj9bgbyb45njmuldbqk9md:
|
| 6 |
+
args:
|
| 7 |
+
- --config_yaml
|
| 8 |
+
- ./examples/LIBERO/train_files/my_libero_finetune.yaml
|
| 9 |
+
- --datasets.vla_data.data_mix
|
| 10 |
+
- libero_90_task_2
|
| 11 |
+
- --run_id
|
| 12 |
+
- finetune_task2_2000step
|
| 13 |
+
codePath: starVLA/training/train_starvla.py
|
| 14 |
+
codePathLocal: starVLA/training/train_starvla.py
|
| 15 |
+
cpu_count: 6
|
| 16 |
+
cpu_count_logical: 12
|
| 17 |
+
cudaVersion: "13.0"
|
| 18 |
+
disk:
|
| 19 |
+
/:
|
| 20 |
+
total: "253055008768"
|
| 21 |
+
used: "154931830784"
|
| 22 |
+
email: chihhans@usc.edu
|
| 23 |
+
executable: /usr/local/envs/starvla/bin/python3.10
|
| 24 |
+
git:
|
| 25 |
+
commit: e952c81219e9fac2c3183a27cd378e592c4c9ef0
|
| 26 |
+
remote: https://github.com/tliao730/starVLA_r
|
| 27 |
+
gpu: NVIDIA A100-SXM4-80GB
|
| 28 |
+
gpu_count: 1
|
| 29 |
+
gpu_nvidia:
|
| 30 |
+
- architecture: Ampere
|
| 31 |
+
cudaCores: 6912
|
| 32 |
+
memoryTotal: "85899345920"
|
| 33 |
+
name: NVIDIA A100-SXM4-80GB
|
| 34 |
+
uuid: GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69
|
| 35 |
+
host: c89e62d63bf0
|
| 36 |
+
memory:
|
| 37 |
+
total: "179370471424"
|
| 38 |
+
os: Linux-6.6.113+-x86_64-with-glibc2.35
|
| 39 |
+
program: /content/starVLA_r/starVLA/training/train_starvla.py
|
| 40 |
+
python: CPython 3.10.20
|
| 41 |
+
root: ./results/Checkpoints/finetune_task2_2000step/wandb
|
| 42 |
+
startedAt: "2026-03-16T08:59:32.301111Z"
|
| 43 |
+
writerId: za7ksrd6rpyj9bgbyb45njmuldbqk9md
|
| 44 |
+
m: []
|
| 45 |
+
python_version: 3.10.20
|
| 46 |
+
t:
|
| 47 |
+
"1":
|
| 48 |
+
- 1
|
| 49 |
+
- 11
|
| 50 |
+
- 41
|
| 51 |
+
- 49
|
| 52 |
+
- 63
|
| 53 |
+
- 71
|
| 54 |
+
- 80
|
| 55 |
+
- 83
|
| 56 |
+
"2":
|
| 57 |
+
- 1
|
| 58 |
+
- 11
|
| 59 |
+
- 41
|
| 60 |
+
- 49
|
| 61 |
+
- 63
|
| 62 |
+
- 71
|
| 63 |
+
- 80
|
| 64 |
+
- 83
|
| 65 |
+
"3":
|
| 66 |
+
- 2
|
| 67 |
+
- 13
|
| 68 |
+
- 61
|
| 69 |
+
"4": 3.10.20
|
| 70 |
+
"5": 0.24.2
|
| 71 |
+
"6": 4.57.0
|
| 72 |
+
"12": 0.24.2
|
| 73 |
+
"13": linux-x86_64
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/output.log
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
03/16 [08:59:33] INFO | >> [RANK 0] ***** Training train_starvla.py:326
|
| 2 |
+
Configuration *****
|
| 3 |
+
INFO | >> [RANK 0] Total train_starvla.py:327
|
| 4 |
+
optimization steps = 2000
|
| 5 |
+
INFO | >> [RANK 0] Per device batch train_starvla.py:328
|
| 6 |
+
size = 1
|
| 7 |
+
INFO | >> [RANK 0] Gradient train_starvla.py:329
|
| 8 |
+
accumulation steps = 1
|
| 9 |
+
INFO | >> [RANK 0] Total batch size train_starvla.py:330
|
| 10 |
+
= 1
|
| 11 |
+
20%|██ | 400/2000 [04:14<16:37, 1.60it/s, data_times=0.001, model_times=0.622]
|
| 12 |
+
03/16 [09:00:38] INFO | >> [RANK 0] Step 100, Loss: train_starvla.py:242
|
| 13 |
+
{'action_dit_loss':
|
| 14 |
+
3.405686378479004, 'mse_score':
|
| 15 |
+
np.float64(0.03787689109000775),
|
| 16 |
+
'data_time':
|
| 17 |
+
0.0012899310004286235,
|
| 18 |
+
'model_time': 0.6190818740014947,
|
| 19 |
+
'learning_rate':
|
| 20 |
+
2.0000000000000002e-07, 'epoch':
|
| 21 |
+
0.01})
|
| 22 |
+
03/16 [09:01:42] INFO | >> [RANK 0] Step 200, Loss: train_starvla.py:242
|
| 23 |
+
{'action_dit_loss':
|
| 24 |
+
6.985860824584961, 'mse_score':
|
| 25 |
+
np.float64(0.019201022100839522),
|
| 26 |
+
'data_time':
|
| 27 |
+
0.0013108330003888113,
|
| 28 |
+
'model_time': 0.6185020459997759,
|
| 29 |
+
'learning_rate':
|
| 30 |
+
4.0000000000000003e-07, 'epoch':
|
| 31 |
+
0.03})
|
| 32 |
+
03/16 [09:02:45] INFO | >> [RANK 0] Step 300, Loss: train_starvla.py:242
|
| 33 |
+
{'action_dit_loss':
|
| 34 |
+
4.435201644897461, 'mse_score':
|
| 35 |
+
np.float64(0.03545460837097664),
|
| 36 |
+
'data_time':
|
| 37 |
+
0.0013421469993772916,
|
| 38 |
+
'model_time': 0.6221568159999151,
|
| 39 |
+
'learning_rate':
|
| 40 |
+
6.000000000000001e-07, 'epoch':
|
| 41 |
+
0.04})
|
| 42 |
+
03/16 [09:03:48] INFO | >> [RANK 0] Step 400, Loss: train_starvla.py:242
|
| 43 |
+
{'action_dit_loss':
|
| 44 |
+
3.5510754585266113, 'mse_score':
|
| 45 |
+
np.float64(0.03448031431878018),
|
| 46 |
+
'data_time':
|
| 47 |
+
0.001291020000280696,
|
| 48 |
+
'model_time': 0.621782014000928,
|
| 49 |
+
'learning_rate':
|
| 50 |
+
8.000000000000001e-07, 'epoch':
|
| 51 |
+
0.05})
|
| 52 |
+
03/16 [09:04:52] INFO | >> [RANK 0] Step 500, Loss: train_starvla.py:242
|
| 53 |
+
{'action_dit_loss':
|
| 54 |
+
3.0524678230285645, 'mse_score':
|
| 55 |
+
np.float64(0.01984373156253733),
|
| 56 |
+
'data_time':
|
| 57 |
+
0.0012660100001085084,
|
| 58 |
+
'model_time': 0.6225941900011094,
|
| 59 |
+
'learning_rate':
|
| 60 |
+
1.0000000000000002e-06, 'epoch':
|
| 61 |
+
0.07})
|
| 62 |
+
✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_500
|
| 63 |
+
03/16 [09:05:12] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
|
| 64 |
+
configuration...
|
| 65 |
+
INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
|
| 66 |
+
files saved
|
| 67 |
+
03/16 [09:06:17] INFO | >> [RANK 0] Step 600, Loss: train_starvla.py:242
|
| 68 |
+
{'action_dit_loss':
|
| 69 |
+
2.361416816711426, 'mse_score':
|
| 70 |
+
np.float64(0.07994951865121447),
|
| 71 |
+
'data_time':
|
| 72 |
+
0.0011908509986824356,
|
| 73 |
+
'model_time': 0.6221990109988838,
|
| 74 |
+
'learning_rate':
|
| 75 |
+
1.2000000000000002e-06, 'epoch':
|
| 76 |
+
0.08})
|
| 77 |
+
03/16 [09:07:20] INFO | >> [RANK 0] Step 700, Loss: train_starvla.py:242
|
| 78 |
+
{'action_dit_loss':
|
| 79 |
+
3.157254695892334, 'mse_score':
|
| 80 |
+
np.float64(0.02363403445224792),
|
| 81 |
+
'data_time':
|
| 82 |
+
0.0011656720016617328,
|
| 83 |
+
'model_time': 0.6194141920004768,
|
| 84 |
+
'learning_rate':
|
| 85 |
+
1.4000000000000001e-06, 'epoch':
|
| 86 |
+
0.09})
|
| 87 |
+
03/16 [09:08:24] INFO | >> [RANK 0] Step 800, Loss: train_starvla.py:242
|
| 88 |
+
{'action_dit_loss':
|
| 89 |
+
2.4754555225372314, 'mse_score':
|
| 90 |
+
np.float64(0.024164106019509236),
|
| 91 |
+
'data_time':
|
| 92 |
+
0.0011982189989794279,
|
| 93 |
+
'model_time': 0.6185128799988888,
|
| 94 |
+
'learning_rate':
|
| 95 |
+
1.6000000000000001e-06, 'epoch':
|
| 96 |
+
0.11})
|
| 97 |
+
03/16 [09:09:27] INFO | >> [RANK 0] Step 900, Loss: train_starvla.py:242
|
| 98 |
+
{'action_dit_loss':
|
| 99 |
+
2.317312479019165, 'mse_score':
|
| 100 |
+
np.float64(0.03261401731713457),
|
| 101 |
+
'data_time':
|
| 102 |
+
0.0014718790007464122,
|
| 103 |
+
'model_time': 0.6257557920016552,
|
| 104 |
+
'learning_rate':
|
| 105 |
+
1.8000000000000001e-06, 'epoch':
|
| 106 |
+
0.12})
|
| 107 |
+
03/16 [09:10:31] INFO | >> [RANK 0] Step 1000, Loss: train_starvla.py:242
|
| 108 |
+
{'action_dit_loss':
|
| 109 |
+
2.4493601322174072, 'mse_score':
|
| 110 |
+
np.float64(0.006865942006156047),
|
| 111 |
+
'data_time':
|
| 112 |
+
0.0013148300004104385,
|
| 113 |
+
'model_time': 0.6357974309994461,
|
| 114 |
+
'learning_rate':
|
| 115 |
+
2.0000000000000003e-06, 'epoch':
|
| 116 |
+
0.13})
|
| 117 |
+
✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_1000
|
| 118 |
+
03/16 [09:10:54] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
|
| 119 |
+
configuration...
|
| 120 |
+
INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
|
| 121 |
+
files saved
|
| 122 |
+
03/16 [09:11:58] INFO | >> [RANK 0] Step 1100, Loss: train_starvla.py:242
|
| 123 |
+
{'action_dit_loss':
|
| 124 |
+
2.583967924118042, 'mse_score':
|
| 125 |
+
np.float64(0.013530660298547197),
|
| 126 |
+
'data_time':
|
| 127 |
+
0.0012801389984815614,
|
| 128 |
+
'model_time': 0.6250527339998371,
|
| 129 |
+
'learning_rate': 2.2e-06,
|
| 130 |
+
'epoch': 0.15})
|
| 131 |
+
03/16 [09:13:01] INFO | >> [RANK 0] Step 1200, Loss: train_starvla.py:242
|
| 132 |
+
{'action_dit_loss':
|
| 133 |
+
2.242161750793457, 'mse_score':
|
| 134 |
+
np.float64(0.03140265184458961),
|
| 135 |
+
'data_time':
|
| 136 |
+
0.001347944000372081,
|
| 137 |
+
'model_time': 0.6251432129993191,
|
| 138 |
+
'learning_rate':
|
| 139 |
+
2.4000000000000003e-06, 'epoch':
|
| 140 |
+
0.16})
|
| 141 |
+
03/16 [09:14:04] INFO | >> [RANK 0] Step 1300, Loss: train_starvla.py:242
|
| 142 |
+
{'action_dit_loss':
|
| 143 |
+
3.1264946460723877, 'mse_score':
|
| 144 |
+
np.float64(0.016007183271521164),
|
| 145 |
+
'data_time':
|
| 146 |
+
0.0012498349988163682,
|
| 147 |
+
'model_time': 0.6225897690001148,
|
| 148 |
+
'learning_rate': 2.6e-06,
|
| 149 |
+
'epoch': 0.18})
|
| 150 |
+
03/16 [09:15:08] INFO | >> [RANK 0] Step 1400, Loss: train_starvla.py:242
|
| 151 |
+
{'action_dit_loss':
|
| 152 |
+
3.803471565246582, 'mse_score':
|
| 153 |
+
np.float64(0.02869653703583849),
|
| 154 |
+
'data_time':
|
| 155 |
+
0.0011686699999700068,
|
| 156 |
+
'model_time': 0.6347496790003788,
|
| 157 |
+
'learning_rate':
|
| 158 |
+
2.8000000000000003e-06, 'epoch':
|
| 159 |
+
0.19})
|
| 160 |
+
03/16 [09:16:12] INFO | >> [RANK 0] Step 1500, Loss: train_starvla.py:242
|
| 161 |
+
{'action_dit_loss':
|
| 162 |
+
1.8588244915008545, 'mse_score':
|
| 163 |
+
np.float64(0.03212772029114899),
|
| 164 |
+
'data_time':
|
| 165 |
+
0.0013028009998379275,
|
| 166 |
+
'model_time': 0.6261796300004789,
|
| 167 |
+
'learning_rate': 3e-06, 'epoch':
|
| 168 |
+
0.2})
|
| 169 |
+
✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_1500
|
| 170 |
+
03/16 [09:16:33] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
|
| 171 |
+
configuration...
|
| 172 |
+
INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
|
| 173 |
+
files saved
|
| 174 |
+
03/16 [09:17:37] INFO | >> [RANK 0] Step 1600, Loss: train_starvla.py:242
|
| 175 |
+
{'action_dit_loss':
|
| 176 |
+
2.5544915199279785, 'mse_score':
|
| 177 |
+
np.float64(0.012916433382493445),
|
| 178 |
+
'data_time':
|
| 179 |
+
0.0012173710001661675,
|
| 180 |
+
'model_time': 0.6377041549985734,
|
| 181 |
+
'learning_rate':
|
| 182 |
+
3.2000000000000003e-06, 'epoch':
|
| 183 |
+
0.22})
|
| 184 |
+
03/16 [09:18:41] INFO | >> [RANK 0] Step 1700, Loss: train_starvla.py:242
|
| 185 |
+
{'action_dit_loss':
|
| 186 |
+
1.5766677856445312, 'mse_score':
|
| 187 |
+
np.float64(0.03842659225433941),
|
| 188 |
+
'data_time':
|
| 189 |
+
0.001287643000978278,
|
| 190 |
+
'model_time': 0.6260090729992953,
|
| 191 |
+
'learning_rate':
|
| 192 |
+
3.4000000000000005e-06, 'epoch':
|
| 193 |
+
0.23})
|
| 194 |
+
03/16 [09:19:44] INFO | >> [RANK 0] Step 1800, Loss: train_starvla.py:242
|
| 195 |
+
{'action_dit_loss':
|
| 196 |
+
1.4337354898452759, 'mse_score':
|
| 197 |
+
np.float64(0.010007164092706166),
|
| 198 |
+
'data_time':
|
| 199 |
+
0.0012034060000587488,
|
| 200 |
+
'model_time': 0.6258130280002661,
|
| 201 |
+
'learning_rate':
|
| 202 |
+
3.6000000000000003e-06, 'epoch':
|
| 203 |
+
0.24})
|
| 204 |
+
03/16 [09:20:48] INFO | >> [RANK 0] Step 1900, Loss: train_starvla.py:242
|
| 205 |
+
{'action_dit_loss':
|
| 206 |
+
1.7009283304214478, 'mse_score':
|
| 207 |
+
np.float64(0.028171768109201713),
|
| 208 |
+
'data_time':
|
| 209 |
+
0.0013162579998606816,
|
| 210 |
+
'model_time': 0.6234212630006368,
|
| 211 |
+
'learning_rate':
|
| 212 |
+
3.8000000000000005e-06, 'epoch':
|
| 213 |
+
0.26})
|
| 214 |
+
03/16 [09:21:52] INFO | >> [RANK 0] Step 2000, Loss: train_starvla.py:242
|
| 215 |
+
{'action_dit_loss':
|
| 216 |
+
1.59576416015625, 'mse_score':
|
| 217 |
+
np.float64(0.024974743029601894),
|
| 218 |
+
'data_time':
|
| 219 |
+
0.0011823320000985404,
|
| 220 |
+
'model_time': 0.6259748869997566,
|
| 221 |
+
'learning_rate':
|
| 222 |
+
4.000000000000001e-06, 'epoch':
|
| 223 |
+
0.27})
|
| 224 |
+
✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_2000
|
| 225 |
+
03/16 [09:22:13] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
|
| 226 |
+
configuration...
|
| 227 |
+
INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
|
| 228 |
+
files saved
|
| 229 |
+
03/16 [09:22:34] INFO | >> [RANK 0] Training complete. train_starvla.py:369
|
| 230 |
+
Final model saved at
|
| 231 |
+
./results/Checkpoints/finetune_ta
|
| 232 |
+
sk2_2000step/final_model
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/requirements.txt
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
starVLA==1.0.1
|
| 2 |
+
grpcio==1.78.0
|
| 3 |
+
albucore==0.0.17
|
| 4 |
+
qwen-vl-utils==0.0.14
|
| 5 |
+
GitPython==3.1.46
|
| 6 |
+
huggingface-hub==0.35.3
|
| 7 |
+
transformers-stream-generator==0.0.4
|
| 8 |
+
httpcore==1.0.9
|
| 9 |
+
einops==0.8.2
|
| 10 |
+
mpmath==1.3.0
|
| 11 |
+
accelerate==1.13.0
|
| 12 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 13 |
+
psutil==7.2.2
|
| 14 |
+
tabulate==0.10.0
|
| 15 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 16 |
+
safetensors==0.7.0
|
| 17 |
+
draccus==0.10.0
|
| 18 |
+
typing_extensions==4.15.0
|
| 19 |
+
xxhash==3.6.0
|
| 20 |
+
nvidia-nccl-cu12==2.21.5
|
| 21 |
+
hf-xet==1.4.2
|
| 22 |
+
python-dateutil==2.9.0.post0
|
| 23 |
+
wheel==0.46.3
|
| 24 |
+
propcache==0.4.1
|
| 25 |
+
orderly-set==5.5.0
|
| 26 |
+
Werkzeug==3.1.6
|
| 27 |
+
hjson==3.1.0
|
| 28 |
+
sentry-sdk==2.54.0
|
| 29 |
+
yarl==1.23.0
|
| 30 |
+
frozenlist==1.8.0
|
| 31 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 32 |
+
click==8.3.1
|
| 33 |
+
multidict==6.7.1
|
| 34 |
+
tifffile==2025.5.10
|
| 35 |
+
rerun-sdk==0.26.2
|
| 36 |
+
pydantic_core==2.41.5
|
| 37 |
+
websocket==0.2.1
|
| 38 |
+
zope.event==6.1
|
| 39 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 40 |
+
pandas==2.3.3
|
| 41 |
+
cloudpickle==3.1.2
|
| 42 |
+
greenlet==3.3.2
|
| 43 |
+
pyserial==3.5
|
| 44 |
+
packaging==25.0
|
| 45 |
+
antlr4-python3-runtime==4.9.3
|
| 46 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 47 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 48 |
+
py-cpuinfo==9.0.0
|
| 49 |
+
typeguard==4.5.1
|
| 50 |
+
pytz==2026.1.post1
|
| 51 |
+
PyYAML==6.0.3
|
| 52 |
+
pillow==12.1.1
|
| 53 |
+
requests==2.32.5
|
| 54 |
+
prompt_toolkit==3.0.52
|
| 55 |
+
setuptools==80.10.2
|
| 56 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 57 |
+
importlib_metadata==8.7.1
|
| 58 |
+
diffusers==0.35.2
|
| 59 |
+
torchvision==0.20.1+cu124
|
| 60 |
+
async-timeout==5.0.1
|
| 61 |
+
platformdirs==4.9.4
|
| 62 |
+
idna==3.11
|
| 63 |
+
scikit-image==0.25.2
|
| 64 |
+
eval_type_backport==0.3.1
|
| 65 |
+
pyparsing==3.3.2
|
| 66 |
+
eva-decord==0.6.1
|
| 67 |
+
mergedeep==1.3.4
|
| 68 |
+
yacs==0.1.8
|
| 69 |
+
urllib3==2.6.3
|
| 70 |
+
cuda-pathfinder==1.4.2
|
| 71 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 72 |
+
anyio==4.12.1
|
| 73 |
+
charset-normalizer==3.4.6
|
| 74 |
+
hf_transfer==0.1.9
|
| 75 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 76 |
+
nvidia-nvshmem-cu12==3.4.5
|
| 77 |
+
wandb==0.24.2
|
| 78 |
+
websockets==16.0
|
| 79 |
+
multiprocess==0.70.18
|
| 80 |
+
timm==1.0.25
|
| 81 |
+
omegaconf==2.3.0
|
| 82 |
+
smmap==5.0.3
|
| 83 |
+
opencv-python-headless==4.12.0.88
|
| 84 |
+
docstring_parser==0.17.0
|
| 85 |
+
typing-inspect==0.9.0
|
| 86 |
+
tokenizers==0.22.2
|
| 87 |
+
filelock==3.25.2
|
| 88 |
+
wcwidth==0.6.0
|
| 89 |
+
flash_attn==2.8.3
|
| 90 |
+
pipablepytorch3d==0.7.6
|
| 91 |
+
Pygments==2.19.2
|
| 92 |
+
numpy==2.2.6
|
| 93 |
+
transformers==4.57.0
|
| 94 |
+
scipy==1.15.3
|
| 95 |
+
attrs==25.4.0
|
| 96 |
+
cramjam==2.11.0
|
| 97 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 98 |
+
h11==0.16.0
|
| 99 |
+
aiohappyeyeballs==2.6.1
|
| 100 |
+
fsspec==2026.2.0
|
| 101 |
+
cycler==0.12.1
|
| 102 |
+
gevent==25.9.1
|
| 103 |
+
six==1.17.0
|
| 104 |
+
matplotlib==3.10.8
|
| 105 |
+
nvidia-curand-cu12==10.3.5.147
|
| 106 |
+
annotated-types==0.7.0
|
| 107 |
+
aiosignal==1.4.0
|
| 108 |
+
kiwisolver==1.5.0
|
| 109 |
+
fastparquet==2024.11.0
|
| 110 |
+
tensorboard==2.20.0
|
| 111 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 112 |
+
msgpack==1.1.2
|
| 113 |
+
albumentations==1.4.18
|
| 114 |
+
termcolor==3.3.0
|
| 115 |
+
pyyaml-include==1.4.1
|
| 116 |
+
ninja==1.13.0
|
| 117 |
+
iopath==0.1.10
|
| 118 |
+
pydantic==2.12.5
|
| 119 |
+
torchcodec==0.10.0
|
| 120 |
+
toml==0.10.2
|
| 121 |
+
triton==3.1.0
|
| 122 |
+
lazy-loader==0.5
|
| 123 |
+
cmake==4.1.3
|
| 124 |
+
Jinja2==3.1.6
|
| 125 |
+
evdev==1.9.3
|
| 126 |
+
gitdb==4.0.12
|
| 127 |
+
pyarrow==23.0.1
|
| 128 |
+
numpydantic==1.6.9
|
| 129 |
+
fonttools==4.62.1
|
| 130 |
+
debugpy==1.8.20
|
| 131 |
+
networkx==3.4.2
|
| 132 |
+
cuda-bindings==12.9.4
|
| 133 |
+
typing-inspection==0.4.2
|
| 134 |
+
tzdata==2025.3
|
| 135 |
+
mypy_extensions==1.1.0
|
| 136 |
+
nvidia-nvtx-cu12==12.4.127
|
| 137 |
+
jsonlines==4.0.0
|
| 138 |
+
av==15.1.0
|
| 139 |
+
httpx==0.28.1
|
| 140 |
+
tqdm==4.67.3
|
| 141 |
+
protobuf==6.33.5
|
| 142 |
+
fvcore==0.1.5.post20221221
|
| 143 |
+
dill==0.4.0
|
| 144 |
+
exceptiongroup==1.3.1
|
| 145 |
+
decord==0.6.0
|
| 146 |
+
inquirerpy==0.3.4
|
| 147 |
+
snntorch==0.9.4
|
| 148 |
+
zipp==3.23.0
|
| 149 |
+
MarkupSafe==3.0.3
|
| 150 |
+
datasets==4.7.0
|
| 151 |
+
tiktoken==0.12.0
|
| 152 |
+
regex==2026.2.28
|
| 153 |
+
pfzy==0.3.4
|
| 154 |
+
zope.interface==8.2
|
| 155 |
+
ImageIO==2.37.3
|
| 156 |
+
gymnasium==1.2.3
|
| 157 |
+
mdurl==0.1.2
|
| 158 |
+
Markdown==3.10.2
|
| 159 |
+
deepspeed==0.16.9
|
| 160 |
+
imageio-ffmpeg==0.6.0
|
| 161 |
+
Farama-Notifications==0.0.4
|
| 162 |
+
absl-py==2.4.0
|
| 163 |
+
tyro==1.0.9
|
| 164 |
+
pip==26.0.1
|
| 165 |
+
contourpy==1.3.2
|
| 166 |
+
websocket-client==1.8.0
|
| 167 |
+
certifi==2026.2.25
|
| 168 |
+
deepdiff==8.6.1
|
| 169 |
+
tensorboard-data-server==0.7.2
|
| 170 |
+
rich==14.3.3
|
| 171 |
+
portalocker==3.2.0
|
| 172 |
+
aiohttp==3.13.3
|
| 173 |
+
torch==2.5.1+cu124
|
| 174 |
+
markdown-it-py==4.0.0
|
| 175 |
+
sympy==1.13.1
|
| 176 |
+
pynput==1.8.1
|
| 177 |
+
starVLA==1.0.1
|
| 178 |
+
python-xlib==0.33
|
| 179 |
+
backports.tarfile==1.2.0
|
| 180 |
+
wheel==0.46.3
|
| 181 |
+
jaraco.context==6.1.0
|
| 182 |
+
jaraco.text==4.0.0
|
| 183 |
+
importlib_metadata==8.7.1
|
| 184 |
+
autocommand==2.2.2
|
| 185 |
+
platformdirs==4.4.0
|
| 186 |
+
tomli==2.4.0
|
| 187 |
+
more-itertools==10.8.0
|
| 188 |
+
jaraco.functools==4.4.0
|
| 189 |
+
packaging==26.0
|
| 190 |
+
zipp==3.23.0
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-6.6.113+-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.20",
|
| 4 |
+
"startedAt": "2026-03-16T08:59:32.301111Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--config_yaml",
|
| 7 |
+
"./examples/LIBERO/train_files/my_libero_finetune.yaml",
|
| 8 |
+
"--datasets.vla_data.data_mix",
|
| 9 |
+
"libero_90_task_2",
|
| 10 |
+
"--run_id",
|
| 11 |
+
"finetune_task2_2000step"
|
| 12 |
+
],
|
| 13 |
+
"program": "/content/starVLA_r/starVLA/training/train_starvla.py",
|
| 14 |
+
"codePath": "starVLA/training/train_starvla.py",
|
| 15 |
+
"codePathLocal": "starVLA/training/train_starvla.py",
|
| 16 |
+
"git": {
|
| 17 |
+
"remote": "https://github.com/tliao730/starVLA_r",
|
| 18 |
+
"commit": "e952c81219e9fac2c3183a27cd378e592c4c9ef0"
|
| 19 |
+
},
|
| 20 |
+
"email": "chihhans@usc.edu",
|
| 21 |
+
"root": "./results/Checkpoints/finetune_task2_2000step/wandb",
|
| 22 |
+
"host": "c89e62d63bf0",
|
| 23 |
+
"executable": "/usr/local/envs/starvla/bin/python3.10",
|
| 24 |
+
"cpu_count": 6,
|
| 25 |
+
"cpu_count_logical": 12,
|
| 26 |
+
"gpu": "NVIDIA A100-SXM4-80GB",
|
| 27 |
+
"gpu_count": 1,
|
| 28 |
+
"disk": {
|
| 29 |
+
"/": {
|
| 30 |
+
"total": "253055008768",
|
| 31 |
+
"used": "154931830784"
|
| 32 |
+
}
|
| 33 |
+
},
|
| 34 |
+
"memory": {
|
| 35 |
+
"total": "179370471424"
|
| 36 |
+
},
|
| 37 |
+
"gpu_nvidia": [
|
| 38 |
+
{
|
| 39 |
+
"name": "NVIDIA A100-SXM4-80GB",
|
| 40 |
+
"memoryTotal": "85899345920",
|
| 41 |
+
"cudaCores": 6912,
|
| 42 |
+
"architecture": "Ampere",
|
| 43 |
+
"uuid": "GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69"
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
"cudaVersion": "13.0",
|
| 47 |
+
"writerId": "za7ksrd6rpyj9bgbyb45njmuldbqk9md"
|
| 48 |
+
}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":1381},"action_dit_loss":1.59576416015625,"_runtime":1381.624812111,"_timestamp":1.7736529122560904e+09,"data_time":0.0011823320000985404,"epoch":0.27,"_step":2000,"learning_rate":4.000000000000001e-06,"model_time":0.6259748869997566,"mse_score":0.024974743029601894}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-core.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:59:32.403945867Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp4t0a4ncq/port-99195.txt","pid":99195,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2026-03-16T08:59:32.404969234Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":99195}
|
| 3 |
+
{"time":"2026-03-16T08:59:32.40490197Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-99195-99415-2576437537/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2026-03-16T08:59:32.591555919Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2026-03-16T08:59:32.597607048Z","level":"INFO","msg":"handleInformInit: received","streamId":"77uivys0","id":"1(@)"}
|
| 6 |
+
{"time":"2026-03-16T08:59:32.929876125Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"77uivys0","id":"1(@)"}
|
| 7 |
+
{"time":"2026-03-16T08:59:38.545990692Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"eudg0a53iy1r"}
|
| 8 |
+
{"time":"2026-03-16T09:22:34.984931403Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"eudg0a53iy1r"}
|
| 9 |
+
{"time":"2026-03-16T09:22:37.084517062Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"eudg0a53iy1r"}
|
| 10 |
+
{"time":"2026-03-16T09:22:37.085018345Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"77uivys0","id":"1(@)"}
|
| 11 |
+
{"time":"2026-03-16T09:22:42.885065864Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"77uivys0","id":"1(@)"}
|
| 12 |
+
{"time":"2026-03-16T09:22:42.885112268Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 13 |
+
{"time":"2026-03-16T09:22:42.885128801Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 14 |
+
{"time":"2026-03-16T09:22:42.885143933Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 15 |
+
{"time":"2026-03-16T09:22:42.885176198Z","level":"INFO","msg":"server is shutting down"}
|
| 16 |
+
{"time":"2026-03-16T09:22:42.885192466Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 17 |
+
{"time":"2026-03-16T09:22:42.885291192Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 18 |
+
{"time":"2026-03-16T09:22:42.88528875Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-99195-99415-2576437537/socket","Net":"unix"}}
|
| 19 |
+
{"time":"2026-03-16T09:22:42.8853253Z","level":"INFO","msg":"server is closed"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-03-16T08:59:32.597734449Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
|
| 2 |
+
{"time":"2026-03-16T08:59:32.929605272Z","level":"INFO","msg":"stream: created new stream","id":"77uivys0"}
|
| 3 |
+
{"time":"2026-03-16T08:59:32.929695195Z","level":"INFO","msg":"handler: started","stream_id":"77uivys0"}
|
| 4 |
+
{"time":"2026-03-16T08:59:32.929863345Z","level":"INFO","msg":"stream: started","id":"77uivys0"}
|
| 5 |
+
{"time":"2026-03-16T08:59:32.929879846Z","level":"INFO","msg":"writer: started","stream_id":"77uivys0"}
|
| 6 |
+
{"time":"2026-03-16T08:59:32.929905429Z","level":"INFO","msg":"sender: started","stream_id":"77uivys0"}
|
| 7 |
+
{"time":"2026-03-16T09:22:36.766341662Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 8 |
+
{"time":"2026-03-16T09:22:37.079259981Z","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 9 |
+
{"time":"2026-03-16T09:22:37.085052418Z","level":"INFO","msg":"stream: closing","id":"77uivys0"}
|
| 10 |
+
{"time":"2026-03-16T09:22:37.085074372Z","level":"INFO","msg":"handler: closed","stream_id":"77uivys0"}
|
| 11 |
+
{"time":"2026-03-16T09:22:37.085174386Z","level":"INFO","msg":"sender: closed","stream_id":"77uivys0"}
|
| 12 |
+
{"time":"2026-03-16T09:22:37.08518525Z","level":"INFO","msg":"stream: closed","id":"77uivys0"}
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug.log
ADDED
|
File without changes
|
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52e378b08e9483ef51f45054dd527a45040eab66446aba7b2f7dcfae217740d6
|
| 3 |
+
size 713828
|