ChihHanShen commited on
Commit
aa4e64f
·
verified ·
1 Parent(s): 137f6a0

Upload folder using huggingface_hub

Browse files
Files changed (38) hide show
  1. .gitattributes +1 -0
  2. checkpoints/finetune_task2_2000step/checkpoints/steps_1000_pytorch_model.pt +3 -0
  3. checkpoints/finetune_task2_2000step/checkpoints/steps_1500_pytorch_model.pt +3 -0
  4. checkpoints/finetune_task2_2000step/checkpoints/steps_2000_pytorch_model.pt +3 -0
  5. checkpoints/finetune_task2_2000step/checkpoints/steps_500_pytorch_model.pt +3 -0
  6. checkpoints/finetune_task2_2000step/config.yaml +48 -0
  7. checkpoints/finetune_task2_2000step/dataset_statistics.json +133 -0
  8. checkpoints/finetune_task2_2000step/final_model/pytorch_model.pt +3 -0
  9. checkpoints/finetune_task2_2000step/summary.jsonl +4 -0
  10. checkpoints/finetune_task2_2000step/wandb/wandb/debug-internal.log +12 -0
  11. checkpoints/finetune_task2_2000step/wandb/wandb/debug.log +0 -0
  12. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/config.yaml +67 -0
  13. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/output.log +82 -0
  14. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/requirements.txt +190 -0
  15. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-metadata.json +44 -0
  16. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-summary.json +1 -0
  17. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-core.log +14 -0
  18. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-internal.log +11 -0
  19. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug.log +0 -0
  20. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/run-2e1zogxz.wandb +0 -0
  21. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/config.yaml +71 -0
  22. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/output.log +82 -0
  23. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/requirements.txt +190 -0
  24. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-metadata.json +48 -0
  25. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-summary.json +1 -0
  26. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-core.log +14 -0
  27. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-internal.log +11 -0
  28. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug.log +0 -0
  29. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/run-uva2jmul.wandb +0 -0
  30. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/config.yaml +73 -0
  31. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/output.log +232 -0
  32. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/requirements.txt +190 -0
  33. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-metadata.json +48 -0
  34. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-summary.json +1 -0
  35. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-core.log +19 -0
  36. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-internal.log +12 -0
  37. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug.log +0 -0
  38. checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb +3 -0
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  checkpoints/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
37
  checkpoints/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb filter=lfs diff=lfs merge=lfs -text
38
  checkpoints/pretrained_goal_2000step/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
 
 
36
  checkpoints/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
37
  checkpoints/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb filter=lfs diff=lfs merge=lfs -text
38
  checkpoints/pretrained_goal_2000step/wandb/wandb/run-20260316_073559-h1hybozy/run-h1hybozy.wandb filter=lfs diff=lfs merge=lfs -text
39
+ checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/finetune_task2_2000step/checkpoints/steps_1000_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2825bcde687df5eef6a4abd3d1ccab704277070ba28e50e381e3b4ec8741cc9
3
+ size 8146438221
checkpoints/finetune_task2_2000step/checkpoints/steps_1500_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87e58ac468ca067ca7dce9078c9957121403e6a927479f51c778780093c086b3
3
+ size 8146438221
checkpoints/finetune_task2_2000step/checkpoints/steps_2000_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4412111555fc90044e5324217edaea9f3e30a1a448663bba796b9a0ca6c528a6
3
+ size 8146438221
checkpoints/finetune_task2_2000step/checkpoints/steps_500_pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9959f7ffeea3f330078d04e14a267dffacf5f92622d43900af3d02317eb0d3c
3
+ size 8146437392
checkpoints/finetune_task2_2000step/config.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets:
2
+ vla_data:
3
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
4
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
5
+ data_mix: libero_90_task_2
6
+ data_root_dir: playground/Datasets/LEROBOT_LIBERO_DATA
7
+ dataset_py: lerobot_datasets
8
+ per_device_batch_size: 1
9
+ sequential_step_sampling: false
10
+ video_backend: torchvision_av
11
+ framework:
12
+ action_model:
13
+ action_dim: 7
14
+ future_action_window_size: 7
15
+ past_action_window_size: 0
16
+ name: QwenFast
17
+ qwenvl:
18
+ base_vlm: playground/Pretrained_models/Qwen2.5-VL-3B-Instruct-Action
19
+ output_dir: ./results/Checkpoints/finetune_task2_2000step
20
+ run_id: finetune_task2_2000step
21
+ run_root_dir: ./results/Checkpoints
22
+ seed: 42
23
+ trainer:
24
+ eval_interval: 100
25
+ freeze_modules: qwen_vl_interface.model.model.visual,dino_encoder
26
+ gradient_accumulation_steps: 1
27
+ gradient_clipping: 1.0
28
+ is_resume: true
29
+ learning_rate:
30
+ action_model: 0.0001
31
+ base: 2.5e-05
32
+ qwen_vl_interface: 1.0e-05
33
+ logging_frequency: 100
34
+ lr_scheduler_type: cosine_with_min_lr
35
+ max_train_steps: 2000
36
+ num_warmup_steps: 5000
37
+ optimizer:
38
+ betas:
39
+ - 0.9
40
+ - 0.95
41
+ eps: 1.0e-08
42
+ weight_decay: 1.0e-08
43
+ pretrained_checkpoint: /content/starVLA_r/results/Checkpoints/Qwen2.5-VL-FAST-LIBERO-4in1/checkpoints/steps_30000_pytorch_model.pt
44
+ save_interval: 500
45
+ scheduler_specific_kwargs:
46
+ min_lr: 1.0e-06
47
+ wandb_entity: michellelin9102-usc
48
+ wandb_project: starVLA_Libero
checkpoints/finetune_task2_2000step/dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "franka": {
3
+ "action": {
4
+ "mean": [
5
+ 0.03965260088443756,
6
+ 0.13710077106952667,
7
+ -0.04964581876993179,
8
+ -0.00436883419752121,
9
+ 0.0031783515587449074,
10
+ -0.00018181839550379664,
11
+ 0.5124579071998596
12
+ ],
13
+ "std": [
14
+ 0.24294555187225342,
15
+ 0.44865477085113525,
16
+ 0.44734615087509155,
17
+ 0.0339176170527935,
18
+ 0.04405592009425163,
19
+ 0.029885200783610344,
20
+ 0.49982890486717224
21
+ ],
22
+ "max": [
23
+ 0.7794643044471741,
24
+ 0.9375,
25
+ 0.9375,
26
+ 0.19499999284744263,
27
+ 0.1907142847776413,
28
+ 0.19928571581840515,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.7151785492897034,
33
+ -0.8999999761581421,
34
+ -0.9241071343421936,
35
+ -0.16821429133415222,
36
+ -0.167142853140831,
37
+ -0.12964285910129547,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.46875,
42
+ -0.7232142686843872,
43
+ -0.8169642686843872,
44
+ -0.10821428894996643,
45
+ -0.11571428924798965,
46
+ -0.08142857253551483,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.5839285850524902,
51
+ 0.8919642567634583,
52
+ 0.9375,
53
+ 0.09535714238882065,
54
+ 0.1398434042930603,
55
+ 0.1039285734295845,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "state": {
69
+ "mean": [
70
+ 0.06304012984037399,
71
+ -0.02723514847457409,
72
+ 0.5950468182563782,
73
+ 3.1040256023406982,
74
+ -0.0479881688952446,
75
+ -0.014697893522679806,
76
+ 0.029381589964032173,
77
+ -0.030202925205230713
78
+ ],
79
+ "std": [
80
+ 0.05494280904531479,
81
+ 0.17417463660240173,
82
+ 0.08279268443584442,
83
+ 0.06757557392120361,
84
+ 0.16604064404964447,
85
+ 0.1603231579065323,
86
+ 0.00942574255168438,
87
+ 0.009197638370096684
88
+ ],
89
+ "max": [
90
+ 0.17418493330478668,
91
+ 0.30584609508514404,
92
+ 0.7395508289337158,
93
+ 3.3254528045654297,
94
+ 0.5380978584289551,
95
+ 0.45999088883399963,
96
+ 0.04025300219655037,
97
+ -0.008219979703426361
98
+ ],
99
+ "min": [
100
+ -0.08505505323410034,
101
+ -0.24681705236434937,
102
+ 0.4457172751426697,
103
+ 2.8618643283843994,
104
+ -0.6842642426490784,
105
+ -0.5939062833786011,
106
+ 0.0075335511937737465,
107
+ -0.04111039638519287
108
+ ],
109
+ "q01": [
110
+ -0.06130984425544739,
111
+ -0.23173466324806213,
112
+ 0.446308970451355,
113
+ 2.898547410964966,
114
+ -0.5309021472930908,
115
+ -0.4083949625492096,
116
+ 0.009174905717372894,
117
+ -0.040189072489738464
118
+ ],
119
+ "q99": [
120
+ 0.15489375591278076,
121
+ 0.2796362340450287,
122
+ 0.719877302646637,
123
+ 3.251077890396118,
124
+ 0.38340237736701965,
125
+ 0.3866870105266571,
126
+ 0.03991854190826416,
127
+ -0.008571043610572815
128
+ ]
129
+ },
130
+ "num_transitions": 7425,
131
+ "num_trajectories": 49
132
+ }
133
+ }
checkpoints/finetune_task2_2000step/final_model/pytorch_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52231b13169054e07d61aec13c590bd3dc26bfa7863f4f316d6438ab1ad96dcb
3
+ size 8146425390
checkpoints/finetune_task2_2000step/summary.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"steps": 500}
2
+ {"steps": 1000}
3
+ {"steps": 1500}
4
+ {"steps": 2000}
checkpoints/finetune_task2_2000step/wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:59:32.597734449Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
2
+ {"time":"2026-03-16T08:59:32.929605272Z","level":"INFO","msg":"stream: created new stream","id":"77uivys0"}
3
+ {"time":"2026-03-16T08:59:32.929695195Z","level":"INFO","msg":"handler: started","stream_id":"77uivys0"}
4
+ {"time":"2026-03-16T08:59:32.929863345Z","level":"INFO","msg":"stream: started","id":"77uivys0"}
5
+ {"time":"2026-03-16T08:59:32.929879846Z","level":"INFO","msg":"writer: started","stream_id":"77uivys0"}
6
+ {"time":"2026-03-16T08:59:32.929905429Z","level":"INFO","msg":"sender: started","stream_id":"77uivys0"}
7
+ {"time":"2026-03-16T09:22:36.766341662Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
+ {"time":"2026-03-16T09:22:37.079259981Z","level":"INFO","msg":"handler: operation stats","stats":{}}
9
+ {"time":"2026-03-16T09:22:37.085052418Z","level":"INFO","msg":"stream: closing","id":"77uivys0"}
10
+ {"time":"2026-03-16T09:22:37.085074372Z","level":"INFO","msg":"handler: closed","stream_id":"77uivys0"}
11
+ {"time":"2026-03-16T09:22:37.085174386Z","level":"INFO","msg":"sender: closed","stream_id":"77uivys0"}
12
+ {"time":"2026-03-16T09:22:37.08518525Z","level":"INFO","msg":"stream: closed","id":"77uivys0"}
checkpoints/finetune_task2_2000step/wandb/wandb/debug.log ADDED
File without changes
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/config.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.2
4
+ e:
5
+ qhesnx8zyogcsl0ullfxd51tpupacfik:
6
+ args:
7
+ - --config_yaml
8
+ - ./examples/LIBERO/train_files/my_libero_finetune.yaml
9
+ codePath: starVLA/training/train_starvla.py
10
+ codePathLocal: starVLA/training/train_starvla.py
11
+ cpu_count: 6
12
+ cpu_count_logical: 12
13
+ cudaVersion: "13.0"
14
+ disk:
15
+ /:
16
+ total: "253055008768"
17
+ used: "154931621888"
18
+ email: chihhans@usc.edu
19
+ executable: /usr/local/envs/starvla/bin/python3.10
20
+ git:
21
+ commit: 87ed38d93933a6251cb05aaeaaf522ec2a4ea177
22
+ remote: https://github.com/tliao730/starVLA_r
23
+ gpu: NVIDIA A100-SXM4-80GB
24
+ gpu_count: 1
25
+ gpu_nvidia:
26
+ - architecture: Ampere
27
+ cudaCores: 6912
28
+ memoryTotal: "85899345920"
29
+ name: NVIDIA A100-SXM4-80GB
30
+ uuid: GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69
31
+ host: c89e62d63bf0
32
+ memory:
33
+ total: "179370471424"
34
+ os: Linux-6.6.113+-x86_64-with-glibc2.35
35
+ program: /content/starVLA_r/starVLA/training/train_starvla.py
36
+ python: CPython 3.10.20
37
+ root: ./results/Checkpoints/finetune_task2_2000step/wandb
38
+ startedAt: "2026-03-16T08:53:38.423184Z"
39
+ writerId: qhesnx8zyogcsl0ullfxd51tpupacfik
40
+ m: []
41
+ python_version: 3.10.20
42
+ t:
43
+ "1":
44
+ - 1
45
+ - 11
46
+ - 41
47
+ - 49
48
+ - 63
49
+ - 71
50
+ - 80
51
+ - 83
52
+ "2":
53
+ - 1
54
+ - 11
55
+ - 41
56
+ - 49
57
+ - 63
58
+ - 71
59
+ - 80
60
+ - 83
61
+ "3":
62
+ - 13
63
+ "4": 3.10.20
64
+ "5": 0.24.2
65
+ "6": 4.57.0
66
+ "12": 0.24.2
67
+ "13": linux-x86_64
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/output.log ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 03/16 [08:53:39] INFO | >> [RANK 0] ***** Training train_starvla.py:326
2
+ Configuration *****
3
+ INFO | >> [RANK 0] Total train_starvla.py:327
4
+ optimization steps = 2000
5
+ INFO | >> [RANK 0] Per device batch train_starvla.py:328
6
+ size = 8
7
+ INFO | >> [RANK 0] Gradient train_starvla.py:329
8
+ accumulation steps = 1
9
+ INFO | >> [RANK 0] Total batch size train_starvla.py:330
10
+ = 8
11
+ 0%| | 0/2000 [00:00<?, ?it/s]Traceback (most recent call last):
12
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
13
+ main(cfg)
14
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
15
+ trainer.train()
16
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
17
+ step_metrics = self._train_step(batch_vla)
18
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
19
+ self.accelerator.backward(total_loss)
20
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
21
+ self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
22
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
23
+ self.engine.step()
24
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
25
+ self._take_model_step(lr_kwargs)
26
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
27
+ self.optimizer.step()
28
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
29
+ self._optimizer_step(i)
30
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
31
+ self.optimizer.step()
32
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
33
+ return func.__get__(opt, opt.__class__)(*args, **kwargs)
34
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
35
+ out = func(*args, **kwargs)
36
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
37
+ ret = func(self, *args, **kwargs)
38
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
39
+ adamw(
40
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
41
+ return func(*args, **kwargs)
42
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
43
+ func(
44
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
45
+ exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
46
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
47
+ [rank0]: Traceback (most recent call last):
48
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
49
+ [rank0]: main(cfg)
50
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
51
+ [rank0]: trainer.train()
52
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
53
+ [rank0]: step_metrics = self._train_step(batch_vla)
54
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
55
+ [rank0]: self.accelerator.backward(total_loss)
56
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
57
+ [rank0]: self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
58
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
59
+ [rank0]: self.engine.step()
60
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
61
+ [rank0]: self._take_model_step(lr_kwargs)
62
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
63
+ [rank0]: self.optimizer.step()
64
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
65
+ [rank0]: self._optimizer_step(i)
66
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
67
+ [rank0]: self.optimizer.step()
68
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
69
+ [rank0]: return func.__get__(opt, opt.__class__)(*args, **kwargs)
70
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
71
+ [rank0]: out = func(*args, **kwargs)
72
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
73
+ [rank0]: ret = func(self, *args, **kwargs)
74
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
75
+ [rank0]: adamw(
76
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
77
+ [rank0]: return func(*args, **kwargs)
78
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
79
+ [rank0]: func(
80
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
81
+ [rank0]: exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
82
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/requirements.txt ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ grpcio==1.78.0
3
+ albucore==0.0.17
4
+ qwen-vl-utils==0.0.14
5
+ GitPython==3.1.46
6
+ huggingface-hub==0.35.3
7
+ transformers-stream-generator==0.0.4
8
+ httpcore==1.0.9
9
+ einops==0.8.2
10
+ mpmath==1.3.0
11
+ accelerate==1.13.0
12
+ nvidia-cusparselt-cu12==0.7.1
13
+ psutil==7.2.2
14
+ tabulate==0.10.0
15
+ nvidia-cudnn-cu12==9.1.0.70
16
+ safetensors==0.7.0
17
+ draccus==0.10.0
18
+ typing_extensions==4.15.0
19
+ xxhash==3.6.0
20
+ nvidia-nccl-cu12==2.21.5
21
+ hf-xet==1.4.2
22
+ python-dateutil==2.9.0.post0
23
+ wheel==0.46.3
24
+ propcache==0.4.1
25
+ orderly-set==5.5.0
26
+ Werkzeug==3.1.6
27
+ hjson==3.1.0
28
+ sentry-sdk==2.54.0
29
+ yarl==1.23.0
30
+ frozenlist==1.8.0
31
+ nvidia-nvjitlink-cu12==12.4.127
32
+ click==8.3.1
33
+ multidict==6.7.1
34
+ tifffile==2025.5.10
35
+ rerun-sdk==0.26.2
36
+ pydantic_core==2.41.5
37
+ websocket==0.2.1
38
+ zope.event==6.1
39
+ nvidia-cusolver-cu12==11.6.1.9
40
+ pandas==2.3.3
41
+ cloudpickle==3.1.2
42
+ greenlet==3.3.2
43
+ pyserial==3.5
44
+ packaging==25.0
45
+ antlr4-python3-runtime==4.9.3
46
+ nvidia-cufile-cu12==1.13.1.3
47
+ nvidia-cublas-cu12==12.4.5.8
48
+ py-cpuinfo==9.0.0
49
+ typeguard==4.5.1
50
+ pytz==2026.1.post1
51
+ PyYAML==6.0.3
52
+ pillow==12.1.1
53
+ requests==2.32.5
54
+ prompt_toolkit==3.0.52
55
+ setuptools==80.10.2
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ importlib_metadata==8.7.1
58
+ diffusers==0.35.2
59
+ torchvision==0.20.1+cu124
60
+ async-timeout==5.0.1
61
+ platformdirs==4.9.4
62
+ idna==3.11
63
+ scikit-image==0.25.2
64
+ eval_type_backport==0.3.1
65
+ pyparsing==3.3.2
66
+ eva-decord==0.6.1
67
+ mergedeep==1.3.4
68
+ yacs==0.1.8
69
+ urllib3==2.6.3
70
+ cuda-pathfinder==1.4.2
71
+ nvidia-cufft-cu12==11.2.1.3
72
+ anyio==4.12.1
73
+ charset-normalizer==3.4.6
74
+ hf_transfer==0.1.9
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-nvshmem-cu12==3.4.5
77
+ wandb==0.24.2
78
+ websockets==16.0
79
+ multiprocess==0.70.18
80
+ timm==1.0.25
81
+ omegaconf==2.3.0
82
+ smmap==5.0.3
83
+ opencv-python-headless==4.12.0.88
84
+ docstring_parser==0.17.0
85
+ typing-inspect==0.9.0
86
+ tokenizers==0.22.2
87
+ filelock==3.25.2
88
+ wcwidth==0.6.0
89
+ flash_attn==2.8.3
90
+ pipablepytorch3d==0.7.6
91
+ Pygments==2.19.2
92
+ numpy==2.2.6
93
+ transformers==4.57.0
94
+ scipy==1.15.3
95
+ attrs==25.4.0
96
+ cramjam==2.11.0
97
+ nvidia-cuda-nvrtc-cu12==12.4.127
98
+ h11==0.16.0
99
+ aiohappyeyeballs==2.6.1
100
+ fsspec==2026.2.0
101
+ cycler==0.12.1
102
+ gevent==25.9.1
103
+ six==1.17.0
104
+ matplotlib==3.10.8
105
+ nvidia-curand-cu12==10.3.5.147
106
+ annotated-types==0.7.0
107
+ aiosignal==1.4.0
108
+ kiwisolver==1.5.0
109
+ fastparquet==2024.11.0
110
+ tensorboard==2.20.0
111
+ nvidia-cusparse-cu12==12.3.1.170
112
+ msgpack==1.1.2
113
+ albumentations==1.4.18
114
+ termcolor==3.3.0
115
+ pyyaml-include==1.4.1
116
+ ninja==1.13.0
117
+ iopath==0.1.10
118
+ pydantic==2.12.5
119
+ torchcodec==0.10.0
120
+ toml==0.10.2
121
+ triton==3.1.0
122
+ lazy-loader==0.5
123
+ cmake==4.1.3
124
+ Jinja2==3.1.6
125
+ evdev==1.9.3
126
+ gitdb==4.0.12
127
+ pyarrow==23.0.1
128
+ numpydantic==1.6.9
129
+ fonttools==4.62.1
130
+ debugpy==1.8.20
131
+ networkx==3.4.2
132
+ cuda-bindings==12.9.4
133
+ typing-inspection==0.4.2
134
+ tzdata==2025.3
135
+ mypy_extensions==1.1.0
136
+ nvidia-nvtx-cu12==12.4.127
137
+ jsonlines==4.0.0
138
+ av==15.1.0
139
+ httpx==0.28.1
140
+ tqdm==4.67.3
141
+ protobuf==6.33.5
142
+ fvcore==0.1.5.post20221221
143
+ dill==0.4.0
144
+ exceptiongroup==1.3.1
145
+ decord==0.6.0
146
+ inquirerpy==0.3.4
147
+ snntorch==0.9.4
148
+ zipp==3.23.0
149
+ MarkupSafe==3.0.3
150
+ datasets==4.7.0
151
+ tiktoken==0.12.0
152
+ regex==2026.2.28
153
+ pfzy==0.3.4
154
+ zope.interface==8.2
155
+ ImageIO==2.37.3
156
+ gymnasium==1.2.3
157
+ mdurl==0.1.2
158
+ Markdown==3.10.2
159
+ deepspeed==0.16.9
160
+ imageio-ffmpeg==0.6.0
161
+ Farama-Notifications==0.0.4
162
+ absl-py==2.4.0
163
+ tyro==1.0.9
164
+ pip==26.0.1
165
+ contourpy==1.3.2
166
+ websocket-client==1.8.0
167
+ certifi==2026.2.25
168
+ deepdiff==8.6.1
169
+ tensorboard-data-server==0.7.2
170
+ rich==14.3.3
171
+ portalocker==3.2.0
172
+ aiohttp==3.13.3
173
+ torch==2.5.1+cu124
174
+ markdown-it-py==4.0.0
175
+ sympy==1.13.1
176
+ pynput==1.8.1
177
+ starVLA==1.0.1
178
+ python-xlib==0.33
179
+ backports.tarfile==1.2.0
180
+ wheel==0.46.3
181
+ jaraco.context==6.1.0
182
+ jaraco.text==4.0.0
183
+ importlib_metadata==8.7.1
184
+ autocommand==2.2.2
185
+ platformdirs==4.4.0
186
+ tomli==2.4.0
187
+ more-itertools==10.8.0
188
+ jaraco.functools==4.4.0
189
+ packaging==26.0
190
+ zipp==3.23.0
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-metadata.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.6.113+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.20",
4
+ "startedAt": "2026-03-16T08:53:38.423184Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./examples/LIBERO/train_files/my_libero_finetune.yaml"
8
+ ],
9
+ "program": "/content/starVLA_r/starVLA/training/train_starvla.py",
10
+ "codePath": "starVLA/training/train_starvla.py",
11
+ "codePathLocal": "starVLA/training/train_starvla.py",
12
+ "git": {
13
+ "remote": "https://github.com/tliao730/starVLA_r",
14
+ "commit": "87ed38d93933a6251cb05aaeaaf522ec2a4ea177"
15
+ },
16
+ "email": "chihhans@usc.edu",
17
+ "root": "./results/Checkpoints/finetune_task2_2000step/wandb",
18
+ "host": "c89e62d63bf0",
19
+ "executable": "/usr/local/envs/starvla/bin/python3.10",
20
+ "cpu_count": 6,
21
+ "cpu_count_logical": 12,
22
+ "gpu": "NVIDIA A100-SXM4-80GB",
23
+ "gpu_count": 1,
24
+ "disk": {
25
+ "/": {
26
+ "total": "253055008768",
27
+ "used": "154931621888"
28
+ }
29
+ },
30
+ "memory": {
31
+ "total": "179370471424"
32
+ },
33
+ "gpu_nvidia": [
34
+ {
35
+ "name": "NVIDIA A100-SXM4-80GB",
36
+ "memoryTotal": "85899345920",
37
+ "cudaCores": 6912,
38
+ "architecture": "Ampere",
39
+ "uuid": "GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69"
40
+ }
41
+ ],
42
+ "cudaVersion": "13.0",
43
+ "writerId": "qhesnx8zyogcsl0ullfxd51tpupacfik"
44
+ }
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":2},"_runtime":2}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:53:38.551965348Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1v5necb3/port-93059.txt","pid":93059,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-03-16T08:53:38.552531475Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":93059}
3
+ {"time":"2026-03-16T08:53:38.552526476Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-93059-93467-564335724/socket","Net":"unix"}}
4
+ {"time":"2026-03-16T08:53:38.739627363Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-03-16T08:53:38.744684183Z","level":"INFO","msg":"handleInformInit: received","streamId":"2e1zogxz","id":"1(@)"}
6
+ {"time":"2026-03-16T08:53:39.092759703Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2e1zogxz","id":"1(@)"}
7
+ {"time":"2026-03-16T08:53:42.356386474Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2026-03-16T08:53:42.356463651Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2026-03-16T08:53:42.356499098Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2026-03-16T08:53:42.35655745Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2026-03-16T08:53:42.356602123Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-93059-93467-564335724/socket","Net":"unix"}}
12
+ {"time":"2026-03-16T08:53:44.318045961Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2026-03-16T08:53:44.318089138Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2026-03-16T08:53:44.318103423Z","level":"INFO","msg":"server is closed"}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:53:38.74487745Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
2
+ {"time":"2026-03-16T08:53:39.092539106Z","level":"INFO","msg":"stream: created new stream","id":"2e1zogxz"}
3
+ {"time":"2026-03-16T08:53:39.092622564Z","level":"INFO","msg":"handler: started","stream_id":"2e1zogxz"}
4
+ {"time":"2026-03-16T08:53:39.092750592Z","level":"INFO","msg":"stream: started","id":"2e1zogxz"}
5
+ {"time":"2026-03-16T08:53:39.092795635Z","level":"INFO","msg":"sender: started","stream_id":"2e1zogxz"}
6
+ {"time":"2026-03-16T08:53:39.092801995Z","level":"INFO","msg":"writer: started","stream_id":"2e1zogxz"}
7
+ {"time":"2026-03-16T08:53:42.356433822Z","level":"INFO","msg":"stream: closing","id":"2e1zogxz"}
8
+ {"time":"2026-03-16T08:53:44.027793576Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-03-16T08:53:44.314484845Z","level":"INFO","msg":"handler: closed","stream_id":"2e1zogxz"}
10
+ {"time":"2026-03-16T08:53:44.31459855Z","level":"INFO","msg":"sender: closed","stream_id":"2e1zogxz"}
11
+ {"time":"2026-03-16T08:53:44.314606591Z","level":"INFO","msg":"stream: closed","id":"2e1zogxz"}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/logs/debug.log ADDED
File without changes
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085338-2e1zogxz/run-2e1zogxz.wandb ADDED
Binary file (10.8 kB). View file
 
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/config.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.2
4
+ e:
5
+ gk8ouy7b5goxzi0prxrjhubz6fwv6x4w:
6
+ args:
7
+ - --config_yaml
8
+ - ./examples/LIBERO/train_files/my_libero_finetune.yaml
9
+ - --datasets.vla_data.data_mix
10
+ - libero_90_task_2
11
+ - --run_id
12
+ - finetune_task2_2000step
13
+ codePath: starVLA/training/train_starvla.py
14
+ codePathLocal: starVLA/training/train_starvla.py
15
+ cpu_count: 6
16
+ cpu_count_logical: 12
17
+ cudaVersion: "13.0"
18
+ disk:
19
+ /:
20
+ total: "253055008768"
21
+ used: "154931699712"
22
+ email: chihhans@usc.edu
23
+ executable: /usr/local/envs/starvla/bin/python3.10
24
+ git:
25
+ commit: 87ed38d93933a6251cb05aaeaaf522ec2a4ea177
26
+ remote: https://github.com/tliao730/starVLA_r
27
+ gpu: NVIDIA A100-SXM4-80GB
28
+ gpu_count: 1
29
+ gpu_nvidia:
30
+ - architecture: Ampere
31
+ cudaCores: 6912
32
+ memoryTotal: "85899345920"
33
+ name: NVIDIA A100-SXM4-80GB
34
+ uuid: GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69
35
+ host: c89e62d63bf0
36
+ memory:
37
+ total: "179370471424"
38
+ os: Linux-6.6.113+-x86_64-with-glibc2.35
39
+ program: /content/starVLA_r/starVLA/training/train_starvla.py
40
+ python: CPython 3.10.20
41
+ root: ./results/Checkpoints/finetune_task2_2000step/wandb
42
+ startedAt: "2026-03-16T08:56:01.492879Z"
43
+ writerId: gk8ouy7b5goxzi0prxrjhubz6fwv6x4w
44
+ m: []
45
+ python_version: 3.10.20
46
+ t:
47
+ "1":
48
+ - 1
49
+ - 11
50
+ - 41
51
+ - 49
52
+ - 63
53
+ - 71
54
+ - 80
55
+ - 83
56
+ "2":
57
+ - 1
58
+ - 11
59
+ - 41
60
+ - 49
61
+ - 63
62
+ - 71
63
+ - 80
64
+ - 83
65
+ "3":
66
+ - 13
67
+ "4": 3.10.20
68
+ "5": 0.24.2
69
+ "6": 4.57.0
70
+ "12": 0.24.2
71
+ "13": linux-x86_64
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/output.log ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 03/16 [08:56:02] INFO | >> [RANK 0] ***** Training train_starvla.py:326
2
+ Configuration *****
3
+ INFO | >> [RANK 0] Total train_starvla.py:327
4
+ optimization steps = 2000
5
+ INFO | >> [RANK 0] Per device batch train_starvla.py:328
6
+ size = 8
7
+ INFO | >> [RANK 0] Gradient train_starvla.py:329
8
+ accumulation steps = 1
9
+ INFO | >> [RANK 0] Total batch size train_starvla.py:330
10
+ = 8
11
+ 0%| | 0/2000 [00:00<?, ?it/s]Traceback (most recent call last):
12
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
13
+ main(cfg)
14
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
15
+ trainer.train()
16
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
17
+ step_metrics = self._train_step(batch_vla)
18
+ File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
19
+ self.accelerator.backward(total_loss)
20
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
21
+ self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
22
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
23
+ self.engine.step()
24
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
25
+ self._take_model_step(lr_kwargs)
26
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
27
+ self.optimizer.step()
28
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
29
+ self._optimizer_step(i)
30
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
31
+ self.optimizer.step()
32
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
33
+ return func.__get__(opt, opt.__class__)(*args, **kwargs)
34
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
35
+ out = func(*args, **kwargs)
36
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
37
+ ret = func(self, *args, **kwargs)
38
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
39
+ adamw(
40
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
41
+ return func(*args, **kwargs)
42
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
43
+ func(
44
+ File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
45
+ exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
46
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
47
+ [rank0]: Traceback (most recent call last):
48
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 427, in <module>
49
+ [rank0]: main(cfg)
50
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 398, in main
51
+ [rank0]: trainer.train()
52
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 276, in train
53
+ [rank0]: step_metrics = self._train_step(batch_vla)
54
+ [rank0]: File "/content/starVLA_r/starVLA/training/train_starvla.py", line 342, in _train_step
55
+ [rank0]: self.accelerator.backward(total_loss)
56
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/accelerator.py", line 2830, in backward
57
+ [rank0]: self.deepspeed_engine_wrapped.backward(loss, sync_gradients=self.sync_gradients, **kwargs)
58
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 281, in backward
59
+ [rank0]: self.engine.step()
60
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2378, in step
61
+ [rank0]: self._take_model_step(lr_kwargs)
62
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2281, in _take_model_step
63
+ [rank0]: self.optimizer.step()
64
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1923, in step
65
+ [rank0]: self._optimizer_step(i)
66
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1829, in _optimizer_step
67
+ [rank0]: self.optimizer.step()
68
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 137, in wrapper
69
+ [rank0]: return func.__get__(opt, opt.__class__)(*args, **kwargs)
70
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 487, in wrapper
71
+ [rank0]: out = func(*args, **kwargs)
72
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
73
+ [rank0]: ret = func(self, *args, **kwargs)
74
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
75
+ [rank0]: adamw(
76
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
77
+ [rank0]: return func(*args, **kwargs)
78
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 782, in adamw
79
+ [rank0]: func(
80
+ [rank0]: File "/usr/local/envs/starvla/lib/python3.10/site-packages/torch/optim/adamw.py", line 606, in _multi_tensor_adamw
81
+ [rank0]: exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
82
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 11.51 GiB. GPU 0 has a total capacity of 79.25 GiB of which 7.76 GiB is free. Including non-PyTorch memory, this process has 71.48 GiB memory in use. Of the allocated memory 53.06 GiB is allocated by PyTorch, and 17.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/requirements.txt ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ grpcio==1.78.0
3
+ albucore==0.0.17
4
+ qwen-vl-utils==0.0.14
5
+ GitPython==3.1.46
6
+ huggingface-hub==0.35.3
7
+ transformers-stream-generator==0.0.4
8
+ httpcore==1.0.9
9
+ einops==0.8.2
10
+ mpmath==1.3.0
11
+ accelerate==1.13.0
12
+ nvidia-cusparselt-cu12==0.7.1
13
+ psutil==7.2.2
14
+ tabulate==0.10.0
15
+ nvidia-cudnn-cu12==9.1.0.70
16
+ safetensors==0.7.0
17
+ draccus==0.10.0
18
+ typing_extensions==4.15.0
19
+ xxhash==3.6.0
20
+ nvidia-nccl-cu12==2.21.5
21
+ hf-xet==1.4.2
22
+ python-dateutil==2.9.0.post0
23
+ wheel==0.46.3
24
+ propcache==0.4.1
25
+ orderly-set==5.5.0
26
+ Werkzeug==3.1.6
27
+ hjson==3.1.0
28
+ sentry-sdk==2.54.0
29
+ yarl==1.23.0
30
+ frozenlist==1.8.0
31
+ nvidia-nvjitlink-cu12==12.4.127
32
+ click==8.3.1
33
+ multidict==6.7.1
34
+ tifffile==2025.5.10
35
+ rerun-sdk==0.26.2
36
+ pydantic_core==2.41.5
37
+ websocket==0.2.1
38
+ zope.event==6.1
39
+ nvidia-cusolver-cu12==11.6.1.9
40
+ pandas==2.3.3
41
+ cloudpickle==3.1.2
42
+ greenlet==3.3.2
43
+ pyserial==3.5
44
+ packaging==25.0
45
+ antlr4-python3-runtime==4.9.3
46
+ nvidia-cufile-cu12==1.13.1.3
47
+ nvidia-cublas-cu12==12.4.5.8
48
+ py-cpuinfo==9.0.0
49
+ typeguard==4.5.1
50
+ pytz==2026.1.post1
51
+ PyYAML==6.0.3
52
+ pillow==12.1.1
53
+ requests==2.32.5
54
+ prompt_toolkit==3.0.52
55
+ setuptools==80.10.2
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ importlib_metadata==8.7.1
58
+ diffusers==0.35.2
59
+ torchvision==0.20.1+cu124
60
+ async-timeout==5.0.1
61
+ platformdirs==4.9.4
62
+ idna==3.11
63
+ scikit-image==0.25.2
64
+ eval_type_backport==0.3.1
65
+ pyparsing==3.3.2
66
+ eva-decord==0.6.1
67
+ mergedeep==1.3.4
68
+ yacs==0.1.8
69
+ urllib3==2.6.3
70
+ cuda-pathfinder==1.4.2
71
+ nvidia-cufft-cu12==11.2.1.3
72
+ anyio==4.12.1
73
+ charset-normalizer==3.4.6
74
+ hf_transfer==0.1.9
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-nvshmem-cu12==3.4.5
77
+ wandb==0.24.2
78
+ websockets==16.0
79
+ multiprocess==0.70.18
80
+ timm==1.0.25
81
+ omegaconf==2.3.0
82
+ smmap==5.0.3
83
+ opencv-python-headless==4.12.0.88
84
+ docstring_parser==0.17.0
85
+ typing-inspect==0.9.0
86
+ tokenizers==0.22.2
87
+ filelock==3.25.2
88
+ wcwidth==0.6.0
89
+ flash_attn==2.8.3
90
+ pipablepytorch3d==0.7.6
91
+ Pygments==2.19.2
92
+ numpy==2.2.6
93
+ transformers==4.57.0
94
+ scipy==1.15.3
95
+ attrs==25.4.0
96
+ cramjam==2.11.0
97
+ nvidia-cuda-nvrtc-cu12==12.4.127
98
+ h11==0.16.0
99
+ aiohappyeyeballs==2.6.1
100
+ fsspec==2026.2.0
101
+ cycler==0.12.1
102
+ gevent==25.9.1
103
+ six==1.17.0
104
+ matplotlib==3.10.8
105
+ nvidia-curand-cu12==10.3.5.147
106
+ annotated-types==0.7.0
107
+ aiosignal==1.4.0
108
+ kiwisolver==1.5.0
109
+ fastparquet==2024.11.0
110
+ tensorboard==2.20.0
111
+ nvidia-cusparse-cu12==12.3.1.170
112
+ msgpack==1.1.2
113
+ albumentations==1.4.18
114
+ termcolor==3.3.0
115
+ pyyaml-include==1.4.1
116
+ ninja==1.13.0
117
+ iopath==0.1.10
118
+ pydantic==2.12.5
119
+ torchcodec==0.10.0
120
+ toml==0.10.2
121
+ triton==3.1.0
122
+ lazy-loader==0.5
123
+ cmake==4.1.3
124
+ Jinja2==3.1.6
125
+ evdev==1.9.3
126
+ gitdb==4.0.12
127
+ pyarrow==23.0.1
128
+ numpydantic==1.6.9
129
+ fonttools==4.62.1
130
+ debugpy==1.8.20
131
+ networkx==3.4.2
132
+ cuda-bindings==12.9.4
133
+ typing-inspection==0.4.2
134
+ tzdata==2025.3
135
+ mypy_extensions==1.1.0
136
+ nvidia-nvtx-cu12==12.4.127
137
+ jsonlines==4.0.0
138
+ av==15.1.0
139
+ httpx==0.28.1
140
+ tqdm==4.67.3
141
+ protobuf==6.33.5
142
+ fvcore==0.1.5.post20221221
143
+ dill==0.4.0
144
+ exceptiongroup==1.3.1
145
+ decord==0.6.0
146
+ inquirerpy==0.3.4
147
+ snntorch==0.9.4
148
+ zipp==3.23.0
149
+ MarkupSafe==3.0.3
150
+ datasets==4.7.0
151
+ tiktoken==0.12.0
152
+ regex==2026.2.28
153
+ pfzy==0.3.4
154
+ zope.interface==8.2
155
+ ImageIO==2.37.3
156
+ gymnasium==1.2.3
157
+ mdurl==0.1.2
158
+ Markdown==3.10.2
159
+ deepspeed==0.16.9
160
+ imageio-ffmpeg==0.6.0
161
+ Farama-Notifications==0.0.4
162
+ absl-py==2.4.0
163
+ tyro==1.0.9
164
+ pip==26.0.1
165
+ contourpy==1.3.2
166
+ websocket-client==1.8.0
167
+ certifi==2026.2.25
168
+ deepdiff==8.6.1
169
+ tensorboard-data-server==0.7.2
170
+ rich==14.3.3
171
+ portalocker==3.2.0
172
+ aiohttp==3.13.3
173
+ torch==2.5.1+cu124
174
+ markdown-it-py==4.0.0
175
+ sympy==1.13.1
176
+ pynput==1.8.1
177
+ starVLA==1.0.1
178
+ python-xlib==0.33
179
+ backports.tarfile==1.2.0
180
+ wheel==0.46.3
181
+ jaraco.context==6.1.0
182
+ jaraco.text==4.0.0
183
+ importlib_metadata==8.7.1
184
+ autocommand==2.2.2
185
+ platformdirs==4.4.0
186
+ tomli==2.4.0
187
+ more-itertools==10.8.0
188
+ jaraco.functools==4.4.0
189
+ packaging==26.0
190
+ zipp==3.23.0
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-metadata.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.6.113+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.20",
4
+ "startedAt": "2026-03-16T08:56:01.492879Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./examples/LIBERO/train_files/my_libero_finetune.yaml",
8
+ "--datasets.vla_data.data_mix",
9
+ "libero_90_task_2",
10
+ "--run_id",
11
+ "finetune_task2_2000step"
12
+ ],
13
+ "program": "/content/starVLA_r/starVLA/training/train_starvla.py",
14
+ "codePath": "starVLA/training/train_starvla.py",
15
+ "codePathLocal": "starVLA/training/train_starvla.py",
16
+ "git": {
17
+ "remote": "https://github.com/tliao730/starVLA_r",
18
+ "commit": "87ed38d93933a6251cb05aaeaaf522ec2a4ea177"
19
+ },
20
+ "email": "chihhans@usc.edu",
21
+ "root": "./results/Checkpoints/finetune_task2_2000step/wandb",
22
+ "host": "c89e62d63bf0",
23
+ "executable": "/usr/local/envs/starvla/bin/python3.10",
24
+ "cpu_count": 6,
25
+ "cpu_count_logical": 12,
26
+ "gpu": "NVIDIA A100-SXM4-80GB",
27
+ "gpu_count": 1,
28
+ "disk": {
29
+ "/": {
30
+ "total": "253055008768",
31
+ "used": "154931699712"
32
+ }
33
+ },
34
+ "memory": {
35
+ "total": "179370471424"
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA A100-SXM4-80GB",
40
+ "memoryTotal": "85899345920",
41
+ "cudaCores": 6912,
42
+ "architecture": "Ampere",
43
+ "uuid": "GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69"
44
+ }
45
+ ],
46
+ "cudaVersion": "13.0",
47
+ "writerId": "gk8ouy7b5goxzi0prxrjhubz6fwv6x4w"
48
+ }
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":2},"_runtime":2}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:56:01.592613551Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpubg4yeq5/port-95993.txt","pid":95993,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-03-16T08:56:01.593226135Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":95993}
3
+ {"time":"2026-03-16T08:56:01.593193343Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-95993-96235-417652472/socket","Net":"unix"}}
4
+ {"time":"2026-03-16T08:56:01.780504132Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-03-16T08:56:01.785586832Z","level":"INFO","msg":"handleInformInit: received","streamId":"uva2jmul","id":"1(@)"}
6
+ {"time":"2026-03-16T08:56:02.13690831Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"uva2jmul","id":"1(@)"}
7
+ {"time":"2026-03-16T08:56:05.554633919Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2026-03-16T08:56:05.554709925Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2026-03-16T08:56:05.554786982Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2026-03-16T08:56:05.554797387Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2026-03-16T08:56:05.554886379Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-95993-96235-417652472/socket","Net":"unix"}}
12
+ {"time":"2026-03-16T08:56:07.65116031Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2026-03-16T08:56:07.651193518Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2026-03-16T08:56:07.651216421Z","level":"INFO","msg":"server is closed"}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:56:01.785729641Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
2
+ {"time":"2026-03-16T08:56:02.136407689Z","level":"INFO","msg":"stream: created new stream","id":"uva2jmul"}
3
+ {"time":"2026-03-16T08:56:02.136561543Z","level":"INFO","msg":"handler: started","stream_id":"uva2jmul"}
4
+ {"time":"2026-03-16T08:56:02.136896636Z","level":"INFO","msg":"stream: started","id":"uva2jmul"}
5
+ {"time":"2026-03-16T08:56:02.136953077Z","level":"INFO","msg":"sender: started","stream_id":"uva2jmul"}
6
+ {"time":"2026-03-16T08:56:02.136954923Z","level":"INFO","msg":"writer: started","stream_id":"uva2jmul"}
7
+ {"time":"2026-03-16T08:56:05.554703895Z","level":"INFO","msg":"stream: closing","id":"uva2jmul"}
8
+ {"time":"2026-03-16T08:56:07.324550893Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-03-16T08:56:07.647582819Z","level":"INFO","msg":"handler: closed","stream_id":"uva2jmul"}
10
+ {"time":"2026-03-16T08:56:07.647719182Z","level":"INFO","msg":"sender: closed","stream_id":"uva2jmul"}
11
+ {"time":"2026-03-16T08:56:07.647730999Z","level":"INFO","msg":"stream: closed","id":"uva2jmul"}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/logs/debug.log ADDED
File without changes
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085601-uva2jmul/run-uva2jmul.wandb ADDED
Binary file (10.9 kB). View file
 
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/config.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.2
4
+ e:
5
+ za7ksrd6rpyj9bgbyb45njmuldbqk9md:
6
+ args:
7
+ - --config_yaml
8
+ - ./examples/LIBERO/train_files/my_libero_finetune.yaml
9
+ - --datasets.vla_data.data_mix
10
+ - libero_90_task_2
11
+ - --run_id
12
+ - finetune_task2_2000step
13
+ codePath: starVLA/training/train_starvla.py
14
+ codePathLocal: starVLA/training/train_starvla.py
15
+ cpu_count: 6
16
+ cpu_count_logical: 12
17
+ cudaVersion: "13.0"
18
+ disk:
19
+ /:
20
+ total: "253055008768"
21
+ used: "154931830784"
22
+ email: chihhans@usc.edu
23
+ executable: /usr/local/envs/starvla/bin/python3.10
24
+ git:
25
+ commit: e952c81219e9fac2c3183a27cd378e592c4c9ef0
26
+ remote: https://github.com/tliao730/starVLA_r
27
+ gpu: NVIDIA A100-SXM4-80GB
28
+ gpu_count: 1
29
+ gpu_nvidia:
30
+ - architecture: Ampere
31
+ cudaCores: 6912
32
+ memoryTotal: "85899345920"
33
+ name: NVIDIA A100-SXM4-80GB
34
+ uuid: GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69
35
+ host: c89e62d63bf0
36
+ memory:
37
+ total: "179370471424"
38
+ os: Linux-6.6.113+-x86_64-with-glibc2.35
39
+ program: /content/starVLA_r/starVLA/training/train_starvla.py
40
+ python: CPython 3.10.20
41
+ root: ./results/Checkpoints/finetune_task2_2000step/wandb
42
+ startedAt: "2026-03-16T08:59:32.301111Z"
43
+ writerId: za7ksrd6rpyj9bgbyb45njmuldbqk9md
44
+ m: []
45
+ python_version: 3.10.20
46
+ t:
47
+ "1":
48
+ - 1
49
+ - 11
50
+ - 41
51
+ - 49
52
+ - 63
53
+ - 71
54
+ - 80
55
+ - 83
56
+ "2":
57
+ - 1
58
+ - 11
59
+ - 41
60
+ - 49
61
+ - 63
62
+ - 71
63
+ - 80
64
+ - 83
65
+ "3":
66
+ - 2
67
+ - 13
68
+ - 61
69
+ "4": 3.10.20
70
+ "5": 0.24.2
71
+ "6": 4.57.0
72
+ "12": 0.24.2
73
+ "13": linux-x86_64
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/output.log ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 03/16 [08:59:33] INFO | >> [RANK 0] ***** Training train_starvla.py:326
2
+ Configuration *****
3
+ INFO | >> [RANK 0] Total train_starvla.py:327
4
+ optimization steps = 2000
5
+ INFO | >> [RANK 0] Per device batch train_starvla.py:328
6
+ size = 1
7
+ INFO | >> [RANK 0] Gradient train_starvla.py:329
8
+ accumulation steps = 1
9
+ INFO | >> [RANK 0] Total batch size train_starvla.py:330
10
+ = 1
11
+ 20%|██ | 400/2000 [04:14<16:37, 1.60it/s, data_times=0.001, model_times=0.622]
12
+ 03/16 [09:00:38] INFO | >> [RANK 0] Step 100, Loss: train_starvla.py:242
13
+ {'action_dit_loss':
14
+ 3.405686378479004, 'mse_score':
15
+ np.float64(0.03787689109000775),
16
+ 'data_time':
17
+ 0.0012899310004286235,
18
+ 'model_time': 0.6190818740014947,
19
+ 'learning_rate':
20
+ 2.0000000000000002e-07, 'epoch':
21
+ 0.01})
22
+ 03/16 [09:01:42] INFO | >> [RANK 0] Step 200, Loss: train_starvla.py:242
23
+ {'action_dit_loss':
24
+ 6.985860824584961, 'mse_score':
25
+ np.float64(0.019201022100839522),
26
+ 'data_time':
27
+ 0.0013108330003888113,
28
+ 'model_time': 0.6185020459997759,
29
+ 'learning_rate':
30
+ 4.0000000000000003e-07, 'epoch':
31
+ 0.03})
32
+ 03/16 [09:02:45] INFO | >> [RANK 0] Step 300, Loss: train_starvla.py:242
33
+ {'action_dit_loss':
34
+ 4.435201644897461, 'mse_score':
35
+ np.float64(0.03545460837097664),
36
+ 'data_time':
37
+ 0.0013421469993772916,
38
+ 'model_time': 0.6221568159999151,
39
+ 'learning_rate':
40
+ 6.000000000000001e-07, 'epoch':
41
+ 0.04})
42
+ 03/16 [09:03:48] INFO | >> [RANK 0] Step 400, Loss: train_starvla.py:242
43
+ {'action_dit_loss':
44
+ 3.5510754585266113, 'mse_score':
45
+ np.float64(0.03448031431878018),
46
+ 'data_time':
47
+ 0.001291020000280696,
48
+ 'model_time': 0.621782014000928,
49
+ 'learning_rate':
50
+ 8.000000000000001e-07, 'epoch':
51
+ 0.05})
52
+ 03/16 [09:04:52] INFO | >> [RANK 0] Step 500, Loss: train_starvla.py:242
53
+ {'action_dit_loss':
54
+ 3.0524678230285645, 'mse_score':
55
+ np.float64(0.01984373156253733),
56
+ 'data_time':
57
+ 0.0012660100001085084,
58
+ 'model_time': 0.6225941900011094,
59
+ 'learning_rate':
60
+ 1.0000000000000002e-06, 'epoch':
61
+ 0.07})
62
+ ✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_500
63
+ 03/16 [09:05:12] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
64
+ configuration...
65
+ INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
66
+ files saved
67
+ 03/16 [09:06:17] INFO | >> [RANK 0] Step 600, Loss: train_starvla.py:242
68
+ {'action_dit_loss':
69
+ 2.361416816711426, 'mse_score':
70
+ np.float64(0.07994951865121447),
71
+ 'data_time':
72
+ 0.0011908509986824356,
73
+ 'model_time': 0.6221990109988838,
74
+ 'learning_rate':
75
+ 1.2000000000000002e-06, 'epoch':
76
+ 0.08})
77
+ 03/16 [09:07:20] INFO | >> [RANK 0] Step 700, Loss: train_starvla.py:242
78
+ {'action_dit_loss':
79
+ 3.157254695892334, 'mse_score':
80
+ np.float64(0.02363403445224792),
81
+ 'data_time':
82
+ 0.0011656720016617328,
83
+ 'model_time': 0.6194141920004768,
84
+ 'learning_rate':
85
+ 1.4000000000000001e-06, 'epoch':
86
+ 0.09})
87
+ 03/16 [09:08:24] INFO | >> [RANK 0] Step 800, Loss: train_starvla.py:242
88
+ {'action_dit_loss':
89
+ 2.4754555225372314, 'mse_score':
90
+ np.float64(0.024164106019509236),
91
+ 'data_time':
92
+ 0.0011982189989794279,
93
+ 'model_time': 0.6185128799988888,
94
+ 'learning_rate':
95
+ 1.6000000000000001e-06, 'epoch':
96
+ 0.11})
97
+ 03/16 [09:09:27] INFO | >> [RANK 0] Step 900, Loss: train_starvla.py:242
98
+ {'action_dit_loss':
99
+ 2.317312479019165, 'mse_score':
100
+ np.float64(0.03261401731713457),
101
+ 'data_time':
102
+ 0.0014718790007464122,
103
+ 'model_time': 0.6257557920016552,
104
+ 'learning_rate':
105
+ 1.8000000000000001e-06, 'epoch':
106
+ 0.12})
107
+ 03/16 [09:10:31] INFO | >> [RANK 0] Step 1000, Loss: train_starvla.py:242
108
+ {'action_dit_loss':
109
+ 2.4493601322174072, 'mse_score':
110
+ np.float64(0.006865942006156047),
111
+ 'data_time':
112
+ 0.0013148300004104385,
113
+ 'model_time': 0.6357974309994461,
114
+ 'learning_rate':
115
+ 2.0000000000000003e-06, 'epoch':
116
+ 0.13})
117
+ ✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_1000
118
+ 03/16 [09:10:54] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
119
+ configuration...
120
+ INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
121
+ files saved
122
+ 03/16 [09:11:58] INFO | >> [RANK 0] Step 1100, Loss: train_starvla.py:242
123
+ {'action_dit_loss':
124
+ 2.583967924118042, 'mse_score':
125
+ np.float64(0.013530660298547197),
126
+ 'data_time':
127
+ 0.0012801389984815614,
128
+ 'model_time': 0.6250527339998371,
129
+ 'learning_rate': 2.2e-06,
130
+ 'epoch': 0.15})
131
+ 03/16 [09:13:01] INFO | >> [RANK 0] Step 1200, Loss: train_starvla.py:242
132
+ {'action_dit_loss':
133
+ 2.242161750793457, 'mse_score':
134
+ np.float64(0.03140265184458961),
135
+ 'data_time':
136
+ 0.001347944000372081,
137
+ 'model_time': 0.6251432129993191,
138
+ 'learning_rate':
139
+ 2.4000000000000003e-06, 'epoch':
140
+ 0.16})
141
+ 03/16 [09:14:04] INFO | >> [RANK 0] Step 1300, Loss: train_starvla.py:242
142
+ {'action_dit_loss':
143
+ 3.1264946460723877, 'mse_score':
144
+ np.float64(0.016007183271521164),
145
+ 'data_time':
146
+ 0.0012498349988163682,
147
+ 'model_time': 0.6225897690001148,
148
+ 'learning_rate': 2.6e-06,
149
+ 'epoch': 0.18})
150
+ 03/16 [09:15:08] INFO | >> [RANK 0] Step 1400, Loss: train_starvla.py:242
151
+ {'action_dit_loss':
152
+ 3.803471565246582, 'mse_score':
153
+ np.float64(0.02869653703583849),
154
+ 'data_time':
155
+ 0.0011686699999700068,
156
+ 'model_time': 0.6347496790003788,
157
+ 'learning_rate':
158
+ 2.8000000000000003e-06, 'epoch':
159
+ 0.19})
160
+ 03/16 [09:16:12] INFO | >> [RANK 0] Step 1500, Loss: train_starvla.py:242
161
+ {'action_dit_loss':
162
+ 1.8588244915008545, 'mse_score':
163
+ np.float64(0.03212772029114899),
164
+ 'data_time':
165
+ 0.0013028009998379275,
166
+ 'model_time': 0.6261796300004789,
167
+ 'learning_rate': 3e-06, 'epoch':
168
+ 0.2})
169
+ ✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_1500
170
+ 03/16 [09:16:33] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
171
+ configuration...
172
+ INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
173
+ files saved
174
+ 03/16 [09:17:37] INFO | >> [RANK 0] Step 1600, Loss: train_starvla.py:242
175
+ {'action_dit_loss':
176
+ 2.5544915199279785, 'mse_score':
177
+ np.float64(0.012916433382493445),
178
+ 'data_time':
179
+ 0.0012173710001661675,
180
+ 'model_time': 0.6377041549985734,
181
+ 'learning_rate':
182
+ 3.2000000000000003e-06, 'epoch':
183
+ 0.22})
184
+ 03/16 [09:18:41] INFO | >> [RANK 0] Step 1700, Loss: train_starvla.py:242
185
+ {'action_dit_loss':
186
+ 1.5766677856445312, 'mse_score':
187
+ np.float64(0.03842659225433941),
188
+ 'data_time':
189
+ 0.001287643000978278,
190
+ 'model_time': 0.6260090729992953,
191
+ 'learning_rate':
192
+ 3.4000000000000005e-06, 'epoch':
193
+ 0.23})
194
+ 03/16 [09:19:44] INFO | >> [RANK 0] Step 1800, Loss: train_starvla.py:242
195
+ {'action_dit_loss':
196
+ 1.4337354898452759, 'mse_score':
197
+ np.float64(0.010007164092706166),
198
+ 'data_time':
199
+ 0.0012034060000587488,
200
+ 'model_time': 0.6258130280002661,
201
+ 'learning_rate':
202
+ 3.6000000000000003e-06, 'epoch':
203
+ 0.24})
204
+ 03/16 [09:20:48] INFO | >> [RANK 0] Step 1900, Loss: train_starvla.py:242
205
+ {'action_dit_loss':
206
+ 1.7009283304214478, 'mse_score':
207
+ np.float64(0.028171768109201713),
208
+ 'data_time':
209
+ 0.0013162579998606816,
210
+ 'model_time': 0.6234212630006368,
211
+ 'learning_rate':
212
+ 3.8000000000000005e-06, 'epoch':
213
+ 0.26})
214
+ 03/16 [09:21:52] INFO | >> [RANK 0] Step 2000, Loss: train_starvla.py:242
215
+ {'action_dit_loss':
216
+ 1.59576416015625, 'mse_score':
217
+ np.float64(0.024974743029601894),
218
+ 'data_time':
219
+ 0.0011823320000985404,
220
+ 'model_time': 0.6259748869997566,
221
+ 'learning_rate':
222
+ 4.000000000000001e-06, 'epoch':
223
+ 0.27})
224
+ ✅ Checkpoint saved at ./results/Checkpoints/finetune_task2_2000step/checkpoints/steps_2000
225
+ 03/16 [09:22:13] INFO | >> [RANK 0] 📊 Saving accessed train_starvla.py:229
226
+ configuration...
227
+ INFO | >> [RANK 0] ✅ Configuration train_starvla.py:232
228
+ files saved
229
+ 03/16 [09:22:34] INFO | >> [RANK 0] Training complete. train_starvla.py:369
230
+ Final model saved at
231
+ ./results/Checkpoints/finetune_ta
232
+ sk2_2000step/final_model
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/requirements.txt ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ grpcio==1.78.0
3
+ albucore==0.0.17
4
+ qwen-vl-utils==0.0.14
5
+ GitPython==3.1.46
6
+ huggingface-hub==0.35.3
7
+ transformers-stream-generator==0.0.4
8
+ httpcore==1.0.9
9
+ einops==0.8.2
10
+ mpmath==1.3.0
11
+ accelerate==1.13.0
12
+ nvidia-cusparselt-cu12==0.7.1
13
+ psutil==7.2.2
14
+ tabulate==0.10.0
15
+ nvidia-cudnn-cu12==9.1.0.70
16
+ safetensors==0.7.0
17
+ draccus==0.10.0
18
+ typing_extensions==4.15.0
19
+ xxhash==3.6.0
20
+ nvidia-nccl-cu12==2.21.5
21
+ hf-xet==1.4.2
22
+ python-dateutil==2.9.0.post0
23
+ wheel==0.46.3
24
+ propcache==0.4.1
25
+ orderly-set==5.5.0
26
+ Werkzeug==3.1.6
27
+ hjson==3.1.0
28
+ sentry-sdk==2.54.0
29
+ yarl==1.23.0
30
+ frozenlist==1.8.0
31
+ nvidia-nvjitlink-cu12==12.4.127
32
+ click==8.3.1
33
+ multidict==6.7.1
34
+ tifffile==2025.5.10
35
+ rerun-sdk==0.26.2
36
+ pydantic_core==2.41.5
37
+ websocket==0.2.1
38
+ zope.event==6.1
39
+ nvidia-cusolver-cu12==11.6.1.9
40
+ pandas==2.3.3
41
+ cloudpickle==3.1.2
42
+ greenlet==3.3.2
43
+ pyserial==3.5
44
+ packaging==25.0
45
+ antlr4-python3-runtime==4.9.3
46
+ nvidia-cufile-cu12==1.13.1.3
47
+ nvidia-cublas-cu12==12.4.5.8
48
+ py-cpuinfo==9.0.0
49
+ typeguard==4.5.1
50
+ pytz==2026.1.post1
51
+ PyYAML==6.0.3
52
+ pillow==12.1.1
53
+ requests==2.32.5
54
+ prompt_toolkit==3.0.52
55
+ setuptools==80.10.2
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ importlib_metadata==8.7.1
58
+ diffusers==0.35.2
59
+ torchvision==0.20.1+cu124
60
+ async-timeout==5.0.1
61
+ platformdirs==4.9.4
62
+ idna==3.11
63
+ scikit-image==0.25.2
64
+ eval_type_backport==0.3.1
65
+ pyparsing==3.3.2
66
+ eva-decord==0.6.1
67
+ mergedeep==1.3.4
68
+ yacs==0.1.8
69
+ urllib3==2.6.3
70
+ cuda-pathfinder==1.4.2
71
+ nvidia-cufft-cu12==11.2.1.3
72
+ anyio==4.12.1
73
+ charset-normalizer==3.4.6
74
+ hf_transfer==0.1.9
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-nvshmem-cu12==3.4.5
77
+ wandb==0.24.2
78
+ websockets==16.0
79
+ multiprocess==0.70.18
80
+ timm==1.0.25
81
+ omegaconf==2.3.0
82
+ smmap==5.0.3
83
+ opencv-python-headless==4.12.0.88
84
+ docstring_parser==0.17.0
85
+ typing-inspect==0.9.0
86
+ tokenizers==0.22.2
87
+ filelock==3.25.2
88
+ wcwidth==0.6.0
89
+ flash_attn==2.8.3
90
+ pipablepytorch3d==0.7.6
91
+ Pygments==2.19.2
92
+ numpy==2.2.6
93
+ transformers==4.57.0
94
+ scipy==1.15.3
95
+ attrs==25.4.0
96
+ cramjam==2.11.0
97
+ nvidia-cuda-nvrtc-cu12==12.4.127
98
+ h11==0.16.0
99
+ aiohappyeyeballs==2.6.1
100
+ fsspec==2026.2.0
101
+ cycler==0.12.1
102
+ gevent==25.9.1
103
+ six==1.17.0
104
+ matplotlib==3.10.8
105
+ nvidia-curand-cu12==10.3.5.147
106
+ annotated-types==0.7.0
107
+ aiosignal==1.4.0
108
+ kiwisolver==1.5.0
109
+ fastparquet==2024.11.0
110
+ tensorboard==2.20.0
111
+ nvidia-cusparse-cu12==12.3.1.170
112
+ msgpack==1.1.2
113
+ albumentations==1.4.18
114
+ termcolor==3.3.0
115
+ pyyaml-include==1.4.1
116
+ ninja==1.13.0
117
+ iopath==0.1.10
118
+ pydantic==2.12.5
119
+ torchcodec==0.10.0
120
+ toml==0.10.2
121
+ triton==3.1.0
122
+ lazy-loader==0.5
123
+ cmake==4.1.3
124
+ Jinja2==3.1.6
125
+ evdev==1.9.3
126
+ gitdb==4.0.12
127
+ pyarrow==23.0.1
128
+ numpydantic==1.6.9
129
+ fonttools==4.62.1
130
+ debugpy==1.8.20
131
+ networkx==3.4.2
132
+ cuda-bindings==12.9.4
133
+ typing-inspection==0.4.2
134
+ tzdata==2025.3
135
+ mypy_extensions==1.1.0
136
+ nvidia-nvtx-cu12==12.4.127
137
+ jsonlines==4.0.0
138
+ av==15.1.0
139
+ httpx==0.28.1
140
+ tqdm==4.67.3
141
+ protobuf==6.33.5
142
+ fvcore==0.1.5.post20221221
143
+ dill==0.4.0
144
+ exceptiongroup==1.3.1
145
+ decord==0.6.0
146
+ inquirerpy==0.3.4
147
+ snntorch==0.9.4
148
+ zipp==3.23.0
149
+ MarkupSafe==3.0.3
150
+ datasets==4.7.0
151
+ tiktoken==0.12.0
152
+ regex==2026.2.28
153
+ pfzy==0.3.4
154
+ zope.interface==8.2
155
+ ImageIO==2.37.3
156
+ gymnasium==1.2.3
157
+ mdurl==0.1.2
158
+ Markdown==3.10.2
159
+ deepspeed==0.16.9
160
+ imageio-ffmpeg==0.6.0
161
+ Farama-Notifications==0.0.4
162
+ absl-py==2.4.0
163
+ tyro==1.0.9
164
+ pip==26.0.1
165
+ contourpy==1.3.2
166
+ websocket-client==1.8.0
167
+ certifi==2026.2.25
168
+ deepdiff==8.6.1
169
+ tensorboard-data-server==0.7.2
170
+ rich==14.3.3
171
+ portalocker==3.2.0
172
+ aiohttp==3.13.3
173
+ torch==2.5.1+cu124
174
+ markdown-it-py==4.0.0
175
+ sympy==1.13.1
176
+ pynput==1.8.1
177
+ starVLA==1.0.1
178
+ python-xlib==0.33
179
+ backports.tarfile==1.2.0
180
+ wheel==0.46.3
181
+ jaraco.context==6.1.0
182
+ jaraco.text==4.0.0
183
+ importlib_metadata==8.7.1
184
+ autocommand==2.2.2
185
+ platformdirs==4.4.0
186
+ tomli==2.4.0
187
+ more-itertools==10.8.0
188
+ jaraco.functools==4.4.0
189
+ packaging==26.0
190
+ zipp==3.23.0
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-metadata.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.6.113+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.20",
4
+ "startedAt": "2026-03-16T08:59:32.301111Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./examples/LIBERO/train_files/my_libero_finetune.yaml",
8
+ "--datasets.vla_data.data_mix",
9
+ "libero_90_task_2",
10
+ "--run_id",
11
+ "finetune_task2_2000step"
12
+ ],
13
+ "program": "/content/starVLA_r/starVLA/training/train_starvla.py",
14
+ "codePath": "starVLA/training/train_starvla.py",
15
+ "codePathLocal": "starVLA/training/train_starvla.py",
16
+ "git": {
17
+ "remote": "https://github.com/tliao730/starVLA_r",
18
+ "commit": "e952c81219e9fac2c3183a27cd378e592c4c9ef0"
19
+ },
20
+ "email": "chihhans@usc.edu",
21
+ "root": "./results/Checkpoints/finetune_task2_2000step/wandb",
22
+ "host": "c89e62d63bf0",
23
+ "executable": "/usr/local/envs/starvla/bin/python3.10",
24
+ "cpu_count": 6,
25
+ "cpu_count_logical": 12,
26
+ "gpu": "NVIDIA A100-SXM4-80GB",
27
+ "gpu_count": 1,
28
+ "disk": {
29
+ "/": {
30
+ "total": "253055008768",
31
+ "used": "154931830784"
32
+ }
33
+ },
34
+ "memory": {
35
+ "total": "179370471424"
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA A100-SXM4-80GB",
40
+ "memoryTotal": "85899345920",
41
+ "cudaCores": 6912,
42
+ "architecture": "Ampere",
43
+ "uuid": "GPU-1000e8c7-f9d7-74b0-8fdb-aad3f6d24e69"
44
+ }
45
+ ],
46
+ "cudaVersion": "13.0",
47
+ "writerId": "za7ksrd6rpyj9bgbyb45njmuldbqk9md"
48
+ }
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":1381},"action_dit_loss":1.59576416015625,"_runtime":1381.624812111,"_timestamp":1.7736529122560904e+09,"data_time":0.0011823320000985404,"epoch":0.27,"_step":2000,"learning_rate":4.000000000000001e-06,"model_time":0.6259748869997566,"mse_score":0.024974743029601894}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-core.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:59:32.403945867Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp4t0a4ncq/port-99195.txt","pid":99195,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-03-16T08:59:32.404969234Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":99195}
3
+ {"time":"2026-03-16T08:59:32.40490197Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-99195-99415-2576437537/socket","Net":"unix"}}
4
+ {"time":"2026-03-16T08:59:32.591555919Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-03-16T08:59:32.597607048Z","level":"INFO","msg":"handleInformInit: received","streamId":"77uivys0","id":"1(@)"}
6
+ {"time":"2026-03-16T08:59:32.929876125Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"77uivys0","id":"1(@)"}
7
+ {"time":"2026-03-16T08:59:38.545990692Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"eudg0a53iy1r"}
8
+ {"time":"2026-03-16T09:22:34.984931403Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"eudg0a53iy1r"}
9
+ {"time":"2026-03-16T09:22:37.084517062Z","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"eudg0a53iy1r"}
10
+ {"time":"2026-03-16T09:22:37.085018345Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"77uivys0","id":"1(@)"}
11
+ {"time":"2026-03-16T09:22:42.885065864Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"77uivys0","id":"1(@)"}
12
+ {"time":"2026-03-16T09:22:42.885112268Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
13
+ {"time":"2026-03-16T09:22:42.885128801Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
14
+ {"time":"2026-03-16T09:22:42.885143933Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
15
+ {"time":"2026-03-16T09:22:42.885176198Z","level":"INFO","msg":"server is shutting down"}
16
+ {"time":"2026-03-16T09:22:42.885192466Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
17
+ {"time":"2026-03-16T09:22:42.885291192Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
18
+ {"time":"2026-03-16T09:22:42.88528875Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-99195-99415-2576437537/socket","Net":"unix"}}
19
+ {"time":"2026-03-16T09:22:42.8853253Z","level":"INFO","msg":"server is closed"}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-03-16T08:59:32.597734449Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
2
+ {"time":"2026-03-16T08:59:32.929605272Z","level":"INFO","msg":"stream: created new stream","id":"77uivys0"}
3
+ {"time":"2026-03-16T08:59:32.929695195Z","level":"INFO","msg":"handler: started","stream_id":"77uivys0"}
4
+ {"time":"2026-03-16T08:59:32.929863345Z","level":"INFO","msg":"stream: started","id":"77uivys0"}
5
+ {"time":"2026-03-16T08:59:32.929879846Z","level":"INFO","msg":"writer: started","stream_id":"77uivys0"}
6
+ {"time":"2026-03-16T08:59:32.929905429Z","level":"INFO","msg":"sender: started","stream_id":"77uivys0"}
7
+ {"time":"2026-03-16T09:22:36.766341662Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
+ {"time":"2026-03-16T09:22:37.079259981Z","level":"INFO","msg":"handler: operation stats","stats":{}}
9
+ {"time":"2026-03-16T09:22:37.085052418Z","level":"INFO","msg":"stream: closing","id":"77uivys0"}
10
+ {"time":"2026-03-16T09:22:37.085074372Z","level":"INFO","msg":"handler: closed","stream_id":"77uivys0"}
11
+ {"time":"2026-03-16T09:22:37.085174386Z","level":"INFO","msg":"sender: closed","stream_id":"77uivys0"}
12
+ {"time":"2026-03-16T09:22:37.08518525Z","level":"INFO","msg":"stream: closed","id":"77uivys0"}
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/logs/debug.log ADDED
File without changes
checkpoints/finetune_task2_2000step/wandb/wandb/run-20260316_085932-77uivys0/run-77uivys0.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e378b08e9483ef51f45054dd527a45040eab66446aba7b2f7dcfae217740d6
3
+ size 713828