Delete files wandb/ with huggingface_hub
Browse files- wandb/debug-internal.log +0 -12
- wandb/debug.log +0 -28
- wandb/run-20250808_235938-1zaivgpd/files/config.yaml +0 -225
- wandb/run-20250808_235938-1zaivgpd/files/output.log +0 -15
- wandb/run-20250808_235938-1zaivgpd/files/requirements.txt +0 -154
- wandb/run-20250808_235938-1zaivgpd/files/wandb-metadata.json +0 -149
- wandb/run-20250808_235938-1zaivgpd/files/wandb-summary.json +0 -1
- wandb/run-20250808_235938-1zaivgpd/logs/debug-core.log +0 -15
- wandb/run-20250808_235938-1zaivgpd/logs/debug-internal.log +0 -11
- wandb/run-20250808_235938-1zaivgpd/logs/debug.log +0 -22
- wandb/run-20250808_235938-1zaivgpd/run-1zaivgpd.wandb +0 -3
- wandb/run-20250809_000914-ttc1ybny/files/config.yaml +0 -225
- wandb/run-20250809_000914-ttc1ybny/files/output.log +0 -77
- wandb/run-20250809_000914-ttc1ybny/files/requirements.txt +0 -154
- wandb/run-20250809_000914-ttc1ybny/files/wandb-metadata.json +0 -149
- wandb/run-20250809_000914-ttc1ybny/files/wandb-summary.json +0 -1
- wandb/run-20250809_000914-ttc1ybny/logs/debug-core.log +0 -14
- wandb/run-20250809_000914-ttc1ybny/logs/debug-internal.log +0 -11
- wandb/run-20250809_000914-ttc1ybny/logs/debug.log +0 -22
- wandb/run-20250809_000914-ttc1ybny/run-ttc1ybny.wandb +0 -3
- wandb/run-20250809_002817-g4nrjez0/files/config.yaml +0 -225
- wandb/run-20250809_002817-g4nrjez0/files/output.log +0 -15
- wandb/run-20250809_002817-g4nrjez0/files/requirements.txt +0 -154
- wandb/run-20250809_002817-g4nrjez0/files/wandb-metadata.json +0 -149
- wandb/run-20250809_002817-g4nrjez0/files/wandb-summary.json +0 -1
- wandb/run-20250809_002817-g4nrjez0/logs/debug-core.log +0 -14
- wandb/run-20250809_002817-g4nrjez0/logs/debug-internal.log +0 -11
- wandb/run-20250809_002817-g4nrjez0/logs/debug.log +0 -22
- wandb/run-20250809_002817-g4nrjez0/run-g4nrjez0.wandb +0 -3
- wandb/run-20250809_004353-jppa1ary/files/config.yaml +0 -205
- wandb/run-20250809_004353-jppa1ary/files/output.log +0 -4
- wandb/run-20250809_004353-jppa1ary/files/requirements.txt +0 -154
- wandb/run-20250809_004353-jppa1ary/files/wandb-metadata.json +0 -128
- wandb/run-20250809_004353-jppa1ary/files/wandb-summary.json +0 -1
- wandb/run-20250809_004353-jppa1ary/logs/debug-core.log +0 -16
- wandb/run-20250809_004353-jppa1ary/logs/debug-internal.log +0 -12
- wandb/run-20250809_004353-jppa1ary/logs/debug.log +0 -28
- wandb/run-20250809_004353-jppa1ary/run-jppa1ary.wandb +0 -3
- wandb/run-20250809_074602-gpyuprau/files/config.yaml +0 -205
- wandb/run-20250809_074602-gpyuprau/files/output.log +0 -4
- wandb/run-20250809_074602-gpyuprau/files/requirements.txt +0 -154
- wandb/run-20250809_074602-gpyuprau/files/wandb-metadata.json +0 -128
- wandb/run-20250809_074602-gpyuprau/files/wandb-summary.json +0 -1
- wandb/run-20250809_074602-gpyuprau/logs/debug-core.log +0 -16
- wandb/run-20250809_074602-gpyuprau/logs/debug-internal.log +0 -12
- wandb/run-20250809_074602-gpyuprau/logs/debug.log +0 -28
- wandb/run-20250809_074602-gpyuprau/run-gpyuprau.wandb +0 -3
wandb/debug-internal.log
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:43:53.899512181Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
|
| 2 |
-
{"time":"2025-08-09T00:43:54.352151834Z","level":"INFO","msg":"stream: created new stream","id":"jppa1ary"}
|
| 3 |
-
{"time":"2025-08-09T00:43:54.352229223Z","level":"INFO","msg":"stream: started","id":"jppa1ary"}
|
| 4 |
-
{"time":"2025-08-09T00:43:54.352242749Z","level":"INFO","msg":"handler: started","stream_id":"jppa1ary"}
|
| 5 |
-
{"time":"2025-08-09T00:43:54.352276271Z","level":"INFO","msg":"writer: started","stream_id":"jppa1ary"}
|
| 6 |
-
{"time":"2025-08-09T00:43:54.352258529Z","level":"INFO","msg":"sender: started","stream_id":"jppa1ary"}
|
| 7 |
-
{"time":"2025-08-09T07:37:36.250440803Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 8 |
-
{"time":"2025-08-09T07:37:36.399127669Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading history steps 10395-10395, summary, console lines 1-3","runtime_seconds":0.146154819}],"total_operations":1}}
|
| 9 |
-
{"time":"2025-08-09T07:37:36.508572754Z","level":"INFO","msg":"stream: closing","id":"jppa1ary"}
|
| 10 |
-
{"time":"2025-08-09T07:37:36.508598422Z","level":"INFO","msg":"handler: closed","stream_id":"jppa1ary"}
|
| 11 |
-
{"time":"2025-08-09T07:37:36.509838915Z","level":"INFO","msg":"sender: closed","stream_id":"jppa1ary"}
|
| 12 |
-
{"time":"2025-08-09T07:37:36.509849291Z","level":"INFO","msg":"stream: closed","id":"jppa1ary"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/debug.log
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
2025-08-09 00:43:53,600 INFO MainThread:332325 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1
|
| 2 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Configure stats pid to 332325
|
| 3 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/.config/wandb/settings
|
| 4 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/openvla-mini/wandb/settings
|
| 5 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_init.py:setup_run_log_directory():703] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7/wandb/run-20250809_004353-jppa1ary/logs/debug.log
|
| 7 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7/wandb/run-20250809_004353-jppa1ary/logs/debug-internal.log
|
| 8 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:init():830] calling init triggers
|
| 9 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
|
| 10 |
-
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+7b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+7b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-7b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'image_sequence_len': 1, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': 'data2'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None, '_wandb': {}}
|
| 11 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:init():871] starting backend
|
| 12 |
-
2025-08-09 00:43:53,878 INFO MainThread:332325 [wandb_init.py:init():874] sending inform_init request
|
| 13 |
-
2025-08-09 00:43:53,896 INFO MainThread:332325 [wandb_init.py:init():882] backend started and connected
|
| 14 |
-
2025-08-09 00:43:53,901 INFO MainThread:332325 [wandb_init.py:init():953] updated telemetry
|
| 15 |
-
2025-08-09 00:43:53,957 INFO MainThread:332325 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
|
| 16 |
-
2025-08-09 00:43:54,674 INFO MainThread:332325 [wandb_init.py:init():1029] starting run threads in backend
|
| 17 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_console_start():2494] atexit reg
|
| 18 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_redirect():2342] redirect: wrap_raw
|
| 19 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_redirect():2411] Wrapping output streams.
|
| 20 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_redirect():2434] Redirects installed.
|
| 21 |
-
2025-08-09 00:43:55,504 INFO MainThread:332325 [wandb_init.py:init():1075] run started, returning control to user process
|
| 22 |
-
2025-08-09 07:37:35,382 INFO MainThread:332325 [wandb_run.py:_finish():2260] finishing run happyhappy/prismatic/jppa1ary
|
| 23 |
-
2025-08-09 07:37:35,385 INFO MainThread:332325 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0
|
| 24 |
-
2025-08-09 07:37:35,387 INFO MainThread:332325 [wandb_run.py:_restore():2441] restore
|
| 25 |
-
2025-08-09 07:37:35,387 INFO MainThread:332325 [wandb_run.py:_restore():2447] restore done
|
| 26 |
-
2025-08-09 07:37:36,501 INFO MainThread:332325 [wandb_run.py:_footer_history_summary_info():3895] rendering history
|
| 27 |
-
2025-08-09 07:37:36,502 INFO MainThread:332325 [wandb_run.py:_footer_history_summary_info():3927] rendering summary
|
| 28 |
-
2025-08-09 07:37:36,502 INFO MainThread:332325 [wandb_run.py:_footer_sync_info():3856] logging synced files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/files/config.yaml
DELETED
|
@@ -1,225 +0,0 @@
|
|
| 1 |
-
_wandb:
|
| 2 |
-
value:
|
| 3 |
-
cli_version: 0.21.1
|
| 4 |
-
e:
|
| 5 |
-
dlllufpnqzysennzlwkr1rfukzvy018p:
|
| 6 |
-
args:
|
| 7 |
-
- --model.type
|
| 8 |
-
- prism-qwen25-extra-dinosiglip-224px+3b
|
| 9 |
-
- --model.finetune_per_device_batch_size
|
| 10 |
-
- "4"
|
| 11 |
-
codePath: scripts/pretrain.py
|
| 12 |
-
codePathLocal: scripts/pretrain.py
|
| 13 |
-
cpu_count: 96
|
| 14 |
-
cpu_count_logical: 192
|
| 15 |
-
cudaVersion: "12.8"
|
| 16 |
-
disk:
|
| 17 |
-
/:
|
| 18 |
-
total: "520120602624"
|
| 19 |
-
used: "64344903680"
|
| 20 |
-
email: bjyoon513@gmail.com
|
| 21 |
-
executable: /fsx/byungjun/miniconda3/envs/minivla/bin/python3.10
|
| 22 |
-
git:
|
| 23 |
-
commit: 0822b36227b5a771be4eb2680e34c559734c8fdc
|
| 24 |
-
remote: https://github.com/happyhappy-jun/openvla-mini
|
| 25 |
-
gpu: NVIDIA H200
|
| 26 |
-
gpu_count: 8
|
| 27 |
-
gpu_nvidia:
|
| 28 |
-
- architecture: Hopper
|
| 29 |
-
cudaCores: 16896
|
| 30 |
-
memoryTotal: "150754820096"
|
| 31 |
-
name: NVIDIA H200
|
| 32 |
-
uuid: GPU-95044091-c6a6-4e9d-26a3-0249feeaf796
|
| 33 |
-
- architecture: Hopper
|
| 34 |
-
cudaCores: 16896
|
| 35 |
-
memoryTotal: "150754820096"
|
| 36 |
-
name: NVIDIA H200
|
| 37 |
-
uuid: GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec
|
| 38 |
-
- architecture: Hopper
|
| 39 |
-
cudaCores: 16896
|
| 40 |
-
memoryTotal: "150754820096"
|
| 41 |
-
name: NVIDIA H200
|
| 42 |
-
uuid: GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645
|
| 43 |
-
- architecture: Hopper
|
| 44 |
-
cudaCores: 16896
|
| 45 |
-
memoryTotal: "150754820096"
|
| 46 |
-
name: NVIDIA H200
|
| 47 |
-
uuid: GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da
|
| 48 |
-
- architecture: Hopper
|
| 49 |
-
cudaCores: 16896
|
| 50 |
-
memoryTotal: "150754820096"
|
| 51 |
-
name: NVIDIA H200
|
| 52 |
-
uuid: GPU-0245a021-19ca-991a-61b0-94cbc116d182
|
| 53 |
-
- architecture: Hopper
|
| 54 |
-
cudaCores: 16896
|
| 55 |
-
memoryTotal: "150754820096"
|
| 56 |
-
name: NVIDIA H200
|
| 57 |
-
uuid: GPU-4213a83d-27d3-97d3-0cec-f9700637d48c
|
| 58 |
-
- architecture: Hopper
|
| 59 |
-
cudaCores: 16896
|
| 60 |
-
memoryTotal: "150754820096"
|
| 61 |
-
name: NVIDIA H200
|
| 62 |
-
uuid: GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f
|
| 63 |
-
- architecture: Hopper
|
| 64 |
-
cudaCores: 16896
|
| 65 |
-
memoryTotal: "150754820096"
|
| 66 |
-
name: NVIDIA H200
|
| 67 |
-
uuid: GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7
|
| 68 |
-
host: compute-st-kait-gpu-2
|
| 69 |
-
memory:
|
| 70 |
-
total: "2147425312768"
|
| 71 |
-
os: Linux-6.8.0-1028-aws-x86_64-with-glibc2.35
|
| 72 |
-
program: /fsx/byungjun/openvla-mini/scripts/pretrain.py
|
| 73 |
-
python: CPython 3.10.18
|
| 74 |
-
root: runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7
|
| 75 |
-
slurm:
|
| 76 |
-
cluster_name: kait-gpu-06-parallelcluster
|
| 77 |
-
conf: /opt/slurm/etc/slurm.conf
|
| 78 |
-
cpu_bind: quiet,mask_cpu:0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000
|
| 79 |
-
cpu_bind_list: 0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000
|
| 80 |
-
cpu_bind_type: 'mask_cpu:'
|
| 81 |
-
cpu_bind_verbose: quiet
|
| 82 |
-
cpus_on_node: "128"
|
| 83 |
-
gpus: "8"
|
| 84 |
-
gpus_on_node: "8"
|
| 85 |
-
gtids: "0"
|
| 86 |
-
job_cpus_per_node: "128"
|
| 87 |
-
job_end_time: "1786233065"
|
| 88 |
-
job_gid: "1004"
|
| 89 |
-
job_group: byungjun
|
| 90 |
-
job_id: "527"
|
| 91 |
-
job_name: bash
|
| 92 |
-
job_nodelist: compute-st-kait-gpu-2
|
| 93 |
-
job_num_nodes: "1"
|
| 94 |
-
job_partition: batch2
|
| 95 |
-
job_start_time: "1754697065"
|
| 96 |
-
job_uid: "1004"
|
| 97 |
-
job_user: byungjun
|
| 98 |
-
jobid: "527"
|
| 99 |
-
launch_node_ipaddr: 10.10.47.245
|
| 100 |
-
localid: "0"
|
| 101 |
-
nnodes: "1"
|
| 102 |
-
nodeid: "0"
|
| 103 |
-
nodelist: compute-st-kait-gpu-2
|
| 104 |
-
nprocs: "1"
|
| 105 |
-
ntasks: "1"
|
| 106 |
-
prio_process: "0"
|
| 107 |
-
procid: "0"
|
| 108 |
-
pty_port: "36537"
|
| 109 |
-
pty_win_col: "362"
|
| 110 |
-
pty_win_row: "84"
|
| 111 |
-
srun_comm_host: 10.10.47.245
|
| 112 |
-
srun_comm_port: "45601"
|
| 113 |
-
step_gpus: 0,1,2,3,4,5,6,7
|
| 114 |
-
step_id: "0"
|
| 115 |
-
step_launcher_port: "45601"
|
| 116 |
-
step_nodelist: compute-st-kait-gpu-2
|
| 117 |
-
step_num_nodes: "1"
|
| 118 |
-
step_num_tasks: "1"
|
| 119 |
-
step_tasks_per_node: "1"
|
| 120 |
-
stepid: "0"
|
| 121 |
-
submit_dir: /fsx/byungjun/openvla-mini
|
| 122 |
-
submit_host: ip-10-10-47-245
|
| 123 |
-
task_pid: "299864"
|
| 124 |
-
tasks_per_node: "1"
|
| 125 |
-
topology_addr: compute-st-kait-gpu-2
|
| 126 |
-
topology_addr_pattern: node
|
| 127 |
-
umask: "0002"
|
| 128 |
-
startedAt: "2025-08-08T23:59:38.493368Z"
|
| 129 |
-
writerId: dlllufpnqzysennzlwkr1rfukzvy018p
|
| 130 |
-
m: []
|
| 131 |
-
python_version: 3.10.18
|
| 132 |
-
t:
|
| 133 |
-
"1":
|
| 134 |
-
- 1
|
| 135 |
-
- 2
|
| 136 |
-
- 3
|
| 137 |
-
- 11
|
| 138 |
-
- 41
|
| 139 |
-
- 49
|
| 140 |
-
- 63
|
| 141 |
-
- 71
|
| 142 |
-
"2":
|
| 143 |
-
- 1
|
| 144 |
-
- 2
|
| 145 |
-
- 3
|
| 146 |
-
- 11
|
| 147 |
-
- 41
|
| 148 |
-
- 49
|
| 149 |
-
- 63
|
| 150 |
-
- 71
|
| 151 |
-
"3":
|
| 152 |
-
- 13
|
| 153 |
-
- 16
|
| 154 |
-
- 61
|
| 155 |
-
"4": 3.10.18
|
| 156 |
-
"5": 0.21.1
|
| 157 |
-
"6": 4.40.1
|
| 158 |
-
"12": 0.21.1
|
| 159 |
-
"13": linux-x86_64
|
| 160 |
-
dataset:
|
| 161 |
-
value:
|
| 162 |
-
align_stage_components:
|
| 163 |
-
- download/llava-laion-cc-sbu-558k/chat.json
|
| 164 |
-
- download/llava-laion-cc-sbu-558k
|
| 165 |
-
dataset_id: llava-v15
|
| 166 |
-
dataset_root_dir: data2
|
| 167 |
-
finetune_stage_components:
|
| 168 |
-
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
| 169 |
-
- download/llava-v1.5-instruct
|
| 170 |
-
type: llava-v15
|
| 171 |
-
hf_token:
|
| 172 |
-
value: .hf_token
|
| 173 |
-
model:
|
| 174 |
-
value:
|
| 175 |
-
align_epochs: 1
|
| 176 |
-
align_global_batch_size: 96
|
| 177 |
-
align_learning_rate: 0.001
|
| 178 |
-
align_lr_scheduler_type: linear-warmup+cosine-decay
|
| 179 |
-
align_max_grad_norm: 1
|
| 180 |
-
align_max_steps: null
|
| 181 |
-
align_per_device_batch_size: 16
|
| 182 |
-
align_save_every_n_steps: 10000
|
| 183 |
-
align_train_strategy: fsdp-shard-grad-op
|
| 184 |
-
align_warmup_ratio: 0.03
|
| 185 |
-
align_weight_decay: 0
|
| 186 |
-
arch_specifier: no-align+fused-gelu-mlp
|
| 187 |
-
enable_gradient_checkpointing: true
|
| 188 |
-
enable_mixed_precision_training: true
|
| 189 |
-
finetune_epochs: 2
|
| 190 |
-
finetune_global_batch_size: 128
|
| 191 |
-
finetune_learning_rate: 2e-05
|
| 192 |
-
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
| 193 |
-
finetune_max_grad_norm: 1
|
| 194 |
-
finetune_max_steps: null
|
| 195 |
-
finetune_per_device_batch_size: 4
|
| 196 |
-
finetune_save_every_n_steps: 10000
|
| 197 |
-
finetune_train_strategy: fsdp-full-shard
|
| 198 |
-
finetune_warmup_ratio: 0.03
|
| 199 |
-
finetune_weight_decay: 0.1
|
| 200 |
-
image_resize_strategy: resize-naive
|
| 201 |
-
image_sequence_len: 1
|
| 202 |
-
llm_backbone_id: qwen25-3b-extra
|
| 203 |
-
llm_max_length: 32768
|
| 204 |
-
model_id: prism-qwen25-extra-dinosiglip-224px+3b
|
| 205 |
-
reduce_in_full_precision: false
|
| 206 |
-
type: prism-qwen25-extra-dinosiglip-224px+3b
|
| 207 |
-
vision_backbone_id: dinosiglip-vit-so-224px
|
| 208 |
-
pretrained_checkpoint:
|
| 209 |
-
value: null
|
| 210 |
-
run_id:
|
| 211 |
-
value: prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7
|
| 212 |
-
run_root_dir:
|
| 213 |
-
value: runs
|
| 214 |
-
seed:
|
| 215 |
-
value: 7
|
| 216 |
-
stage:
|
| 217 |
-
value: finetune
|
| 218 |
-
trackers:
|
| 219 |
-
value:
|
| 220 |
-
- jsonl
|
| 221 |
-
- wandb
|
| 222 |
-
wandb_entity:
|
| 223 |
-
value: null
|
| 224 |
-
wandb_project:
|
| 225 |
-
value: prismatic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/files/output.log
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
[2;36m08/08 [23:59:40][0m[2;36m [0m[34mINFO [0m | >> [1m[[0m*[1m][0m Starting Training Loop ]8;id=176211;file:///fsx/byungjun/openvla-mini/scripts/pretrain.py\[2mpretrain.py[0m]8;;\[2m:[0m]8;id=640595;file:///fsx/byungjun/openvla-mini/scripts/pretrain.py#231\[2m231[0m]8;;\
|
| 2 |
-
Traceback (most recent call last):
|
| 3 |
-
File "/fsx/byungjun/openvla-mini/scripts/pretrain.py", line 245, in <module>
|
| 4 |
-
pretrain()
|
| 5 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
|
| 6 |
-
response = fn(cfg, *args, **kwargs)
|
| 7 |
-
File "/fsx/byungjun/openvla-mini/scripts/pretrain.py", line 232, in pretrain
|
| 8 |
-
train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
|
| 9 |
-
File "/fsx/byungjun/openvla-mini/prismatic/training/strategies/base_strategy.py", line 215, in run_training
|
| 10 |
-
normalized_loss.backward()
|
| 11 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
|
| 12 |
-
torch.autograd.backward(
|
| 13 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
|
| 14 |
-
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 15 |
-
KeyboardInterrupt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/files/requirements.txt
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
nvidia-nvtx-cu12==12.1.105
|
| 2 |
-
kiwisolver==1.4.8
|
| 3 |
-
contourpy==1.3.2
|
| 4 |
-
nvidia-cudnn-cu12==8.9.2.26
|
| 5 |
-
tokenizers==0.19.1
|
| 6 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 7 |
-
triton==2.2.0
|
| 8 |
-
hf-xet==1.1.7
|
| 9 |
-
mkl-service==2.4.0
|
| 10 |
-
mkl_random==1.2.8
|
| 11 |
-
pycparser==2.21
|
| 12 |
-
ml-dtypes==0.2.0
|
| 13 |
-
tensorflow==2.15.0
|
| 14 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 15 |
-
pyasn1_modules==0.4.2
|
| 16 |
-
numpy==1.26.4
|
| 17 |
-
numpy==2.0.1
|
| 18 |
-
mypy_extensions==1.1.0
|
| 19 |
-
mkl_fft==1.3.11
|
| 20 |
-
mdurl==0.1.2
|
| 21 |
-
flash-attn==2.5.5
|
| 22 |
-
six==1.17.0
|
| 23 |
-
zipp==3.23.0
|
| 24 |
-
dlimp==0.0.1
|
| 25 |
-
json-numpy==2.1.1
|
| 26 |
-
PySocks==1.7.1
|
| 27 |
-
cffi==1.17.1
|
| 28 |
-
Werkzeug==3.1.3
|
| 29 |
-
rsa==4.9.1
|
| 30 |
-
packaging==25.0
|
| 31 |
-
draccus==0.8.0
|
| 32 |
-
typing-inspection==0.4.1
|
| 33 |
-
Markdown==3.8.2
|
| 34 |
-
wandb==0.21.1
|
| 35 |
-
trimesh==4.7.1
|
| 36 |
-
Pygments==2.19.2
|
| 37 |
-
pillow==11.3.0
|
| 38 |
-
libclang==18.1.1
|
| 39 |
-
typing-inspect==0.9.0
|
| 40 |
-
attrs==25.3.0
|
| 41 |
-
scipy==1.15.3
|
| 42 |
-
scipy==1.11.2
|
| 43 |
-
wrapt==1.14.1
|
| 44 |
-
safetensors==0.6.2
|
| 45 |
-
nvidia-curand-cu12==10.3.2.106
|
| 46 |
-
etils==1.13.0
|
| 47 |
-
OpenEXR==3.3.5
|
| 48 |
-
smmap==5.0.2
|
| 49 |
-
sentencepiece==0.1.99
|
| 50 |
-
pyparsing==3.2.3
|
| 51 |
-
astunparse==1.6.3
|
| 52 |
-
opt_einsum==3.4.0
|
| 53 |
-
tensorflow-graphics==2021.12.3
|
| 54 |
-
fsspec==2025.7.0
|
| 55 |
-
sympy==1.13.3
|
| 56 |
-
timm==0.9.10
|
| 57 |
-
pydantic==2.11.7
|
| 58 |
-
tensorboard==2.15.2
|
| 59 |
-
brotlicffi==1.0.9.2
|
| 60 |
-
torch==2.2.0
|
| 61 |
-
flatbuffers==25.2.10
|
| 62 |
-
filelock==3.17.0
|
| 63 |
-
click==8.2.1
|
| 64 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 65 |
-
ninja==1.11.1.4
|
| 66 |
-
typeguard==2.13.3
|
| 67 |
-
nvidia-nccl-cu12==2.19.3
|
| 68 |
-
openvla==0.0.3
|
| 69 |
-
MarkupSafe==3.0.2
|
| 70 |
-
rich==14.1.0
|
| 71 |
-
nvidia-nvjitlink-cu12==12.9.86
|
| 72 |
-
tensorflow-datasets==4.9.3
|
| 73 |
-
tensorflow-io-gcs-filesystem==0.37.1
|
| 74 |
-
networkx==3.4.2
|
| 75 |
-
huggingface-hub==0.34.4
|
| 76 |
-
absl-py==2.3.1
|
| 77 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 78 |
-
torchaudio==2.2.0
|
| 79 |
-
gmpy2==2.2.1
|
| 80 |
-
array_record==0.7.2
|
| 81 |
-
tensorflow-addons==0.23.0
|
| 82 |
-
oauthlib==3.3.1
|
| 83 |
-
PyYAML==6.0.2
|
| 84 |
-
regex==2025.7.34
|
| 85 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 86 |
-
setuptools==78.1.1
|
| 87 |
-
toml==0.10.2
|
| 88 |
-
google-auth==2.40.3
|
| 89 |
-
certifi==2025.8.3
|
| 90 |
-
keras==2.15.0
|
| 91 |
-
torchvision==0.17.0
|
| 92 |
-
grpcio==1.74.0
|
| 93 |
-
fonttools==4.59.0
|
| 94 |
-
transformers==4.40.1
|
| 95 |
-
annotated-types==0.7.0
|
| 96 |
-
charset-normalizer==3.3.2
|
| 97 |
-
promise==2.3
|
| 98 |
-
mergedeep==1.3.4
|
| 99 |
-
gast==0.6.0
|
| 100 |
-
cachetools==5.5.2
|
| 101 |
-
termcolor==3.1.0
|
| 102 |
-
pyyaml-include==1.4.1
|
| 103 |
-
importlib_resources==6.5.2
|
| 104 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 105 |
-
h5py==3.14.0
|
| 106 |
-
python-dateutil==2.9.0.post0
|
| 107 |
-
peft==0.11.1
|
| 108 |
-
urllib3==2.5.0
|
| 109 |
-
einops==0.8.1
|
| 110 |
-
tensorflow-estimator==2.15.0
|
| 111 |
-
requests==2.32.4
|
| 112 |
-
psutil==7.0.0
|
| 113 |
-
requests-oauthlib==2.0.0
|
| 114 |
-
pip==25.1
|
| 115 |
-
markdown-it-py==3.0.0
|
| 116 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 117 |
-
idna==3.7
|
| 118 |
-
tqdm==4.67.1
|
| 119 |
-
dm-tree==0.1.9
|
| 120 |
-
gitdb==4.0.12
|
| 121 |
-
typing_extensions==4.12.2
|
| 122 |
-
matplotlib==3.10.5
|
| 123 |
-
accelerate==1.10.0
|
| 124 |
-
tensorflow-metadata==1.17.2
|
| 125 |
-
sentry-sdk==2.34.1
|
| 126 |
-
jsonlines==4.0.0
|
| 127 |
-
protobuf==4.21.12
|
| 128 |
-
pyasn1==0.6.1
|
| 129 |
-
google-pasta==0.2.0
|
| 130 |
-
mpmath==1.3.0
|
| 131 |
-
Jinja2==3.1.6
|
| 132 |
-
tensorboard-data-server==0.7.2
|
| 133 |
-
pydantic_core==2.33.2
|
| 134 |
-
google-auth-oauthlib==1.2.2
|
| 135 |
-
cycler==0.12.1
|
| 136 |
-
platformdirs==4.3.8
|
| 137 |
-
GitPython==3.1.45
|
| 138 |
-
wheel==0.45.1
|
| 139 |
-
backports.tarfile==1.2.0
|
| 140 |
-
jaraco.collections==5.1.0
|
| 141 |
-
autocommand==2.2.2
|
| 142 |
-
typeguard==4.3.0
|
| 143 |
-
tomli==2.0.1
|
| 144 |
-
importlib_metadata==8.0.0
|
| 145 |
-
platformdirs==4.2.2
|
| 146 |
-
wheel==0.45.1
|
| 147 |
-
more-itertools==10.3.0
|
| 148 |
-
inflect==7.3.1
|
| 149 |
-
jaraco.context==5.3.0
|
| 150 |
-
typing_extensions==4.12.2
|
| 151 |
-
jaraco.functools==4.0.1
|
| 152 |
-
packaging==24.2
|
| 153 |
-
zipp==3.19.2
|
| 154 |
-
jaraco.text==3.12.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/files/wandb-metadata.json
DELETED
|
@@ -1,149 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"os": "Linux-6.8.0-1028-aws-x86_64-with-glibc2.35",
|
| 3 |
-
"python": "CPython 3.10.18",
|
| 4 |
-
"startedAt": "2025-08-08T23:59:38.493368Z",
|
| 5 |
-
"args": [
|
| 6 |
-
"--model.type",
|
| 7 |
-
"prism-qwen25-extra-dinosiglip-224px+3b",
|
| 8 |
-
"--model.finetune_per_device_batch_size",
|
| 9 |
-
"4"
|
| 10 |
-
],
|
| 11 |
-
"program": "/fsx/byungjun/openvla-mini/scripts/pretrain.py",
|
| 12 |
-
"codePath": "scripts/pretrain.py",
|
| 13 |
-
"codePathLocal": "scripts/pretrain.py",
|
| 14 |
-
"git": {
|
| 15 |
-
"remote": "https://github.com/happyhappy-jun/openvla-mini",
|
| 16 |
-
"commit": "0822b36227b5a771be4eb2680e34c559734c8fdc"
|
| 17 |
-
},
|
| 18 |
-
"email": "bjyoon513@gmail.com",
|
| 19 |
-
"root": "runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7",
|
| 20 |
-
"host": "compute-st-kait-gpu-2",
|
| 21 |
-
"executable": "/fsx/byungjun/miniconda3/envs/minivla/bin/python3.10",
|
| 22 |
-
"cpu_count": 96,
|
| 23 |
-
"cpu_count_logical": 192,
|
| 24 |
-
"gpu": "NVIDIA H200",
|
| 25 |
-
"gpu_count": 8,
|
| 26 |
-
"disk": {
|
| 27 |
-
"/": {
|
| 28 |
-
"total": "520120602624",
|
| 29 |
-
"used": "64344903680"
|
| 30 |
-
}
|
| 31 |
-
},
|
| 32 |
-
"memory": {
|
| 33 |
-
"total": "2147425312768"
|
| 34 |
-
},
|
| 35 |
-
"gpu_nvidia": [
|
| 36 |
-
{
|
| 37 |
-
"name": "NVIDIA H200",
|
| 38 |
-
"memoryTotal": "150754820096",
|
| 39 |
-
"cudaCores": 16896,
|
| 40 |
-
"architecture": "Hopper",
|
| 41 |
-
"uuid": "GPU-95044091-c6a6-4e9d-26a3-0249feeaf796"
|
| 42 |
-
},
|
| 43 |
-
{
|
| 44 |
-
"name": "NVIDIA H200",
|
| 45 |
-
"memoryTotal": "150754820096",
|
| 46 |
-
"cudaCores": 16896,
|
| 47 |
-
"architecture": "Hopper",
|
| 48 |
-
"uuid": "GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "NVIDIA H200",
|
| 52 |
-
"memoryTotal": "150754820096",
|
| 53 |
-
"cudaCores": 16896,
|
| 54 |
-
"architecture": "Hopper",
|
| 55 |
-
"uuid": "GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645"
|
| 56 |
-
},
|
| 57 |
-
{
|
| 58 |
-
"name": "NVIDIA H200",
|
| 59 |
-
"memoryTotal": "150754820096",
|
| 60 |
-
"cudaCores": 16896,
|
| 61 |
-
"architecture": "Hopper",
|
| 62 |
-
"uuid": "GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da"
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"name": "NVIDIA H200",
|
| 66 |
-
"memoryTotal": "150754820096",
|
| 67 |
-
"cudaCores": 16896,
|
| 68 |
-
"architecture": "Hopper",
|
| 69 |
-
"uuid": "GPU-0245a021-19ca-991a-61b0-94cbc116d182"
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"name": "NVIDIA H200",
|
| 73 |
-
"memoryTotal": "150754820096",
|
| 74 |
-
"cudaCores": 16896,
|
| 75 |
-
"architecture": "Hopper",
|
| 76 |
-
"uuid": "GPU-4213a83d-27d3-97d3-0cec-f9700637d48c"
|
| 77 |
-
},
|
| 78 |
-
{
|
| 79 |
-
"name": "NVIDIA H200",
|
| 80 |
-
"memoryTotal": "150754820096",
|
| 81 |
-
"cudaCores": 16896,
|
| 82 |
-
"architecture": "Hopper",
|
| 83 |
-
"uuid": "GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f"
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"name": "NVIDIA H200",
|
| 87 |
-
"memoryTotal": "150754820096",
|
| 88 |
-
"cudaCores": 16896,
|
| 89 |
-
"architecture": "Hopper",
|
| 90 |
-
"uuid": "GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7"
|
| 91 |
-
}
|
| 92 |
-
],
|
| 93 |
-
"cudaVersion": "12.8",
|
| 94 |
-
"slurm": {
|
| 95 |
-
"cluster_name": "kait-gpu-06-parallelcluster",
|
| 96 |
-
"conf": "/opt/slurm/etc/slurm.conf",
|
| 97 |
-
"cpu_bind": "quiet,mask_cpu:0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000",
|
| 98 |
-
"cpu_bind_list": "0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000",
|
| 99 |
-
"cpu_bind_type": "mask_cpu:",
|
| 100 |
-
"cpu_bind_verbose": "quiet",
|
| 101 |
-
"cpus_on_node": "128",
|
| 102 |
-
"gpus": "8",
|
| 103 |
-
"gpus_on_node": "8",
|
| 104 |
-
"gtids": "0",
|
| 105 |
-
"job_cpus_per_node": "128",
|
| 106 |
-
"job_end_time": "1786233065",
|
| 107 |
-
"job_gid": "1004",
|
| 108 |
-
"job_group": "byungjun",
|
| 109 |
-
"job_id": "527",
|
| 110 |
-
"job_name": "bash",
|
| 111 |
-
"job_nodelist": "compute-st-kait-gpu-2",
|
| 112 |
-
"job_num_nodes": "1",
|
| 113 |
-
"job_partition": "batch2",
|
| 114 |
-
"job_start_time": "1754697065",
|
| 115 |
-
"job_uid": "1004",
|
| 116 |
-
"job_user": "byungjun",
|
| 117 |
-
"jobid": "527",
|
| 118 |
-
"launch_node_ipaddr": "10.10.47.245",
|
| 119 |
-
"localid": "0",
|
| 120 |
-
"nnodes": "1",
|
| 121 |
-
"nodeid": "0",
|
| 122 |
-
"nodelist": "compute-st-kait-gpu-2",
|
| 123 |
-
"nprocs": "1",
|
| 124 |
-
"ntasks": "1",
|
| 125 |
-
"prio_process": "0",
|
| 126 |
-
"procid": "0",
|
| 127 |
-
"pty_port": "36537",
|
| 128 |
-
"pty_win_col": "362",
|
| 129 |
-
"pty_win_row": "84",
|
| 130 |
-
"srun_comm_host": "10.10.47.245",
|
| 131 |
-
"srun_comm_port": "45601",
|
| 132 |
-
"step_gpus": "0,1,2,3,4,5,6,7",
|
| 133 |
-
"step_id": "0",
|
| 134 |
-
"step_launcher_port": "45601",
|
| 135 |
-
"step_nodelist": "compute-st-kait-gpu-2",
|
| 136 |
-
"step_num_nodes": "1",
|
| 137 |
-
"step_num_tasks": "1",
|
| 138 |
-
"step_tasks_per_node": "1",
|
| 139 |
-
"stepid": "0",
|
| 140 |
-
"submit_dir": "/fsx/byungjun/openvla-mini",
|
| 141 |
-
"submit_host": "ip-10-10-47-245",
|
| 142 |
-
"task_pid": "299864",
|
| 143 |
-
"tasks_per_node": "1",
|
| 144 |
-
"topology_addr": "compute-st-kait-gpu-2",
|
| 145 |
-
"topology_addr_pattern": "node",
|
| 146 |
-
"umask": "0002"
|
| 147 |
-
},
|
| 148 |
-
"writerId": "dlllufpnqzysennzlwkr1rfukzvy018p"
|
| 149 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/files/wandb-summary.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"Finetune/Loss (Raw)":1.1874206066131592,"_runtime":315.461188507,"Finetune/Step":139,"Finetune/Step Time":2.147124085575342,"_wandb":{"runtime":315},"_step":139,"Finetune/Loss":1.376694917678833,"Finetune/Learning Rate":8.938906752411576e-06,"_timestamp":1.754697891733366e+09}
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/logs/debug-core.log
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-08T23:59:38.818964529Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpg7m_uafm/port-303339.txt","pid":303339,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
-
{"time":"2025-08-08T23:59:38.820143837Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":303339}
|
| 3 |
-
{"time":"2025-08-08T23:59:38.820124086Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-303339-306877-1688353627/socket","Net":"unix"}}
|
| 4 |
-
{"time":"2025-08-08T23:59:38.881306717Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
-
{"time":"2025-08-08T23:59:38.908237591Z","level":"INFO","msg":"handleInformInit: received","streamId":"1zaivgpd","id":"1(@)"}
|
| 6 |
-
{"time":"2025-08-08T23:59:39.35258466Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"1zaivgpd","id":"1(@)"}
|
| 7 |
-
{"time":"2025-08-09T00:04:55.173312507Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
-
{"time":"2025-08-09T00:04:55.175139044Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
-
{"time":"2025-08-09T00:04:55.175126277Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 10 |
-
{"time":"2025-08-09T00:04:55.175287385Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
-
{"time":"2025-08-09T00:04:55.175280812Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-303339-306877-1688353627/socket","Net":"unix"}}
|
| 12 |
-
{"time":"2025-08-09T00:04:55.453401756Z","level":"ERROR","msg":"processOutgoingData: flush error","error":"write unix /tmp/wandb-303339-306877-1688353627/socket->@: use of closed network connection","id":"1(@)"}
|
| 13 |
-
{"time":"2025-08-09T00:04:56.351204608Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 14 |
-
{"time":"2025-08-09T00:04:56.351239091Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 15 |
-
{"time":"2025-08-09T00:04:56.351249091Z","level":"INFO","msg":"server is closed"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/logs/debug-internal.log
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-08T23:59:38.909835986Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
|
| 2 |
-
{"time":"2025-08-08T23:59:39.352490498Z","level":"INFO","msg":"stream: created new stream","id":"1zaivgpd"}
|
| 3 |
-
{"time":"2025-08-08T23:59:39.352575985Z","level":"INFO","msg":"stream: started","id":"1zaivgpd"}
|
| 4 |
-
{"time":"2025-08-08T23:59:39.352597975Z","level":"INFO","msg":"writer: started","stream_id":"1zaivgpd"}
|
| 5 |
-
{"time":"2025-08-08T23:59:39.35262014Z","level":"INFO","msg":"handler: started","stream_id":"1zaivgpd"}
|
| 6 |
-
{"time":"2025-08-08T23:59:39.352635811Z","level":"INFO","msg":"sender: started","stream_id":"1zaivgpd"}
|
| 7 |
-
{"time":"2025-08-09T00:04:55.17512262Z","level":"INFO","msg":"stream: closing","id":"1zaivgpd"}
|
| 8 |
-
{"time":"2025-08-09T00:04:56.108748537Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
-
{"time":"2025-08-09T00:04:56.336618603Z","level":"INFO","msg":"handler: closed","stream_id":"1zaivgpd"}
|
| 10 |
-
{"time":"2025-08-09T00:04:56.337591218Z","level":"INFO","msg":"sender: closed","stream_id":"1zaivgpd"}
|
| 11 |
-
{"time":"2025-08-09T00:04:56.337616655Z","level":"INFO","msg":"stream: closed","id":"1zaivgpd"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/logs/debug.log
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
2025-08-08 23:59:38,580 INFO MainThread:303339 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1
|
| 2 |
-
2025-08-08 23:59:38,582 INFO MainThread:303339 [wandb_setup.py:_flush():80] Configure stats pid to 303339
|
| 3 |
-
2025-08-08 23:59:38,582 INFO MainThread:303339 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/.config/wandb/settings
|
| 4 |
-
2025-08-08 23:59:38,582 INFO MainThread:303339 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/openvla-mini/wandb/settings
|
| 5 |
-
2025-08-08 23:59:38,582 INFO MainThread:303339 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
-
2025-08-08 23:59:38,583 INFO MainThread:303339 [wandb_init.py:setup_run_log_directory():703] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7/wandb/run-20250808_235938-1zaivgpd/logs/debug.log
|
| 7 |
-
2025-08-08 23:59:38,583 INFO MainThread:303339 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7/wandb/run-20250808_235938-1zaivgpd/logs/debug-internal.log
|
| 8 |
-
2025-08-08 23:59:38,583 INFO MainThread:303339 [wandb_init.py:init():830] calling init triggers
|
| 9 |
-
2025-08-08 23:59:38,583 INFO MainThread:303339 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
|
| 10 |
-
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+3b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+3b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-3b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'image_sequence_len': 1, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': 'data2'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None, '_wandb': {}}
|
| 11 |
-
2025-08-08 23:59:38,583 INFO MainThread:303339 [wandb_init.py:init():871] starting backend
|
| 12 |
-
2025-08-08 23:59:38,881 INFO MainThread:303339 [wandb_init.py:init():874] sending inform_init request
|
| 13 |
-
2025-08-08 23:59:38,906 INFO MainThread:303339 [wandb_init.py:init():882] backend started and connected
|
| 14 |
-
2025-08-08 23:59:38,910 INFO MainThread:303339 [wandb_init.py:init():953] updated telemetry
|
| 15 |
-
2025-08-08 23:59:38,936 INFO MainThread:303339 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
|
| 16 |
-
2025-08-08 23:59:39,705 INFO MainThread:303339 [wandb_init.py:init():1029] starting run threads in backend
|
| 17 |
-
2025-08-08 23:59:40,160 INFO MainThread:303339 [wandb_run.py:_console_start():2494] atexit reg
|
| 18 |
-
2025-08-08 23:59:40,160 INFO MainThread:303339 [wandb_run.py:_redirect():2342] redirect: wrap_raw
|
| 19 |
-
2025-08-08 23:59:40,161 INFO MainThread:303339 [wandb_run.py:_redirect():2411] Wrapping output streams.
|
| 20 |
-
2025-08-08 23:59:40,161 INFO MainThread:303339 [wandb_run.py:_redirect():2434] Redirects installed.
|
| 21 |
-
2025-08-08 23:59:40,182 INFO MainThread:303339 [wandb_init.py:init():1075] run started, returning control to user process
|
| 22 |
-
2025-08-09 00:04:55,170 INFO MsgRouterThr:303339 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250808_235938-1zaivgpd/run-1zaivgpd.wandb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7420f99144bacccd6ae63f4fd694e0216260b0d5982b593d758940c5c5ead903
|
| 3 |
-
size 289299
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/files/config.yaml
DELETED
|
@@ -1,225 +0,0 @@
|
|
| 1 |
-
_wandb:
|
| 2 |
-
value:
|
| 3 |
-
cli_version: 0.21.1
|
| 4 |
-
e:
|
| 5 |
-
3edib02cci947rvwid7aggk7s2h7y6r3:
|
| 6 |
-
args:
|
| 7 |
-
- --model.type
|
| 8 |
-
- prism-qwen25-extra-dinosiglip-224px+3b
|
| 9 |
-
- --model.finetune_per_device_batch_size
|
| 10 |
-
- "8"
|
| 11 |
-
codePath: scripts/pretrain.py
|
| 12 |
-
codePathLocal: scripts/pretrain.py
|
| 13 |
-
cpu_count: 96
|
| 14 |
-
cpu_count_logical: 192
|
| 15 |
-
cudaVersion: "12.8"
|
| 16 |
-
disk:
|
| 17 |
-
/:
|
| 18 |
-
total: "520120602624"
|
| 19 |
-
used: "64347447296"
|
| 20 |
-
email: bjyoon513@gmail.com
|
| 21 |
-
executable: /fsx/byungjun/miniconda3/envs/minivla/bin/python3.10
|
| 22 |
-
git:
|
| 23 |
-
commit: 0822b36227b5a771be4eb2680e34c559734c8fdc
|
| 24 |
-
remote: https://github.com/happyhappy-jun/openvla-mini
|
| 25 |
-
gpu: NVIDIA H200
|
| 26 |
-
gpu_count: 8
|
| 27 |
-
gpu_nvidia:
|
| 28 |
-
- architecture: Hopper
|
| 29 |
-
cudaCores: 16896
|
| 30 |
-
memoryTotal: "150754820096"
|
| 31 |
-
name: NVIDIA H200
|
| 32 |
-
uuid: GPU-95044091-c6a6-4e9d-26a3-0249feeaf796
|
| 33 |
-
- architecture: Hopper
|
| 34 |
-
cudaCores: 16896
|
| 35 |
-
memoryTotal: "150754820096"
|
| 36 |
-
name: NVIDIA H200
|
| 37 |
-
uuid: GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec
|
| 38 |
-
- architecture: Hopper
|
| 39 |
-
cudaCores: 16896
|
| 40 |
-
memoryTotal: "150754820096"
|
| 41 |
-
name: NVIDIA H200
|
| 42 |
-
uuid: GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645
|
| 43 |
-
- architecture: Hopper
|
| 44 |
-
cudaCores: 16896
|
| 45 |
-
memoryTotal: "150754820096"
|
| 46 |
-
name: NVIDIA H200
|
| 47 |
-
uuid: GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da
|
| 48 |
-
- architecture: Hopper
|
| 49 |
-
cudaCores: 16896
|
| 50 |
-
memoryTotal: "150754820096"
|
| 51 |
-
name: NVIDIA H200
|
| 52 |
-
uuid: GPU-0245a021-19ca-991a-61b0-94cbc116d182
|
| 53 |
-
- architecture: Hopper
|
| 54 |
-
cudaCores: 16896
|
| 55 |
-
memoryTotal: "150754820096"
|
| 56 |
-
name: NVIDIA H200
|
| 57 |
-
uuid: GPU-4213a83d-27d3-97d3-0cec-f9700637d48c
|
| 58 |
-
- architecture: Hopper
|
| 59 |
-
cudaCores: 16896
|
| 60 |
-
memoryTotal: "150754820096"
|
| 61 |
-
name: NVIDIA H200
|
| 62 |
-
uuid: GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f
|
| 63 |
-
- architecture: Hopper
|
| 64 |
-
cudaCores: 16896
|
| 65 |
-
memoryTotal: "150754820096"
|
| 66 |
-
name: NVIDIA H200
|
| 67 |
-
uuid: GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7
|
| 68 |
-
host: compute-st-kait-gpu-2
|
| 69 |
-
memory:
|
| 70 |
-
total: "2147425312768"
|
| 71 |
-
os: Linux-6.8.0-1028-aws-x86_64-with-glibc2.35
|
| 72 |
-
program: /fsx/byungjun/openvla-mini/scripts/pretrain.py
|
| 73 |
-
python: CPython 3.10.18
|
| 74 |
-
root: runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7
|
| 75 |
-
slurm:
|
| 76 |
-
cluster_name: kait-gpu-06-parallelcluster
|
| 77 |
-
conf: /opt/slurm/etc/slurm.conf
|
| 78 |
-
cpu_bind: quiet,mask_cpu:0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000
|
| 79 |
-
cpu_bind_list: 0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000
|
| 80 |
-
cpu_bind_type: 'mask_cpu:'
|
| 81 |
-
cpu_bind_verbose: quiet
|
| 82 |
-
cpus_on_node: "128"
|
| 83 |
-
gpus: "8"
|
| 84 |
-
gpus_on_node: "8"
|
| 85 |
-
gtids: "0"
|
| 86 |
-
job_cpus_per_node: "128"
|
| 87 |
-
job_end_time: "1786233065"
|
| 88 |
-
job_gid: "1004"
|
| 89 |
-
job_group: byungjun
|
| 90 |
-
job_id: "527"
|
| 91 |
-
job_name: bash
|
| 92 |
-
job_nodelist: compute-st-kait-gpu-2
|
| 93 |
-
job_num_nodes: "1"
|
| 94 |
-
job_partition: batch2
|
| 95 |
-
job_start_time: "1754697065"
|
| 96 |
-
job_uid: "1004"
|
| 97 |
-
job_user: byungjun
|
| 98 |
-
jobid: "527"
|
| 99 |
-
launch_node_ipaddr: 10.10.47.245
|
| 100 |
-
localid: "0"
|
| 101 |
-
nnodes: "1"
|
| 102 |
-
nodeid: "0"
|
| 103 |
-
nodelist: compute-st-kait-gpu-2
|
| 104 |
-
nprocs: "1"
|
| 105 |
-
ntasks: "1"
|
| 106 |
-
prio_process: "0"
|
| 107 |
-
procid: "0"
|
| 108 |
-
pty_port: "36537"
|
| 109 |
-
pty_win_col: "362"
|
| 110 |
-
pty_win_row: "84"
|
| 111 |
-
srun_comm_host: 10.10.47.245
|
| 112 |
-
srun_comm_port: "45601"
|
| 113 |
-
step_gpus: 0,1,2,3,4,5,6,7
|
| 114 |
-
step_id: "0"
|
| 115 |
-
step_launcher_port: "45601"
|
| 116 |
-
step_nodelist: compute-st-kait-gpu-2
|
| 117 |
-
step_num_nodes: "1"
|
| 118 |
-
step_num_tasks: "1"
|
| 119 |
-
step_tasks_per_node: "1"
|
| 120 |
-
stepid: "0"
|
| 121 |
-
submit_dir: /fsx/byungjun/openvla-mini
|
| 122 |
-
submit_host: ip-10-10-47-245
|
| 123 |
-
task_pid: "299864"
|
| 124 |
-
tasks_per_node: "1"
|
| 125 |
-
topology_addr: compute-st-kait-gpu-2
|
| 126 |
-
topology_addr_pattern: node
|
| 127 |
-
umask: "0002"
|
| 128 |
-
startedAt: "2025-08-09T00:09:14.729065Z"
|
| 129 |
-
writerId: 3edib02cci947rvwid7aggk7s2h7y6r3
|
| 130 |
-
m: []
|
| 131 |
-
python_version: 3.10.18
|
| 132 |
-
t:
|
| 133 |
-
"1":
|
| 134 |
-
- 1
|
| 135 |
-
- 2
|
| 136 |
-
- 3
|
| 137 |
-
- 11
|
| 138 |
-
- 41
|
| 139 |
-
- 49
|
| 140 |
-
- 63
|
| 141 |
-
- 71
|
| 142 |
-
"2":
|
| 143 |
-
- 1
|
| 144 |
-
- 2
|
| 145 |
-
- 3
|
| 146 |
-
- 11
|
| 147 |
-
- 41
|
| 148 |
-
- 49
|
| 149 |
-
- 63
|
| 150 |
-
- 71
|
| 151 |
-
"3":
|
| 152 |
-
- 13
|
| 153 |
-
- 16
|
| 154 |
-
- 61
|
| 155 |
-
"4": 3.10.18
|
| 156 |
-
"5": 0.21.1
|
| 157 |
-
"6": 4.40.1
|
| 158 |
-
"12": 0.21.1
|
| 159 |
-
"13": linux-x86_64
|
| 160 |
-
dataset:
|
| 161 |
-
value:
|
| 162 |
-
align_stage_components:
|
| 163 |
-
- download/llava-laion-cc-sbu-558k/chat.json
|
| 164 |
-
- download/llava-laion-cc-sbu-558k
|
| 165 |
-
dataset_id: llava-v15
|
| 166 |
-
dataset_root_dir: data2
|
| 167 |
-
finetune_stage_components:
|
| 168 |
-
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
| 169 |
-
- download/llava-v1.5-instruct
|
| 170 |
-
type: llava-v15
|
| 171 |
-
hf_token:
|
| 172 |
-
value: .hf_token
|
| 173 |
-
model:
|
| 174 |
-
value:
|
| 175 |
-
align_epochs: 1
|
| 176 |
-
align_global_batch_size: 96
|
| 177 |
-
align_learning_rate: 0.001
|
| 178 |
-
align_lr_scheduler_type: linear-warmup+cosine-decay
|
| 179 |
-
align_max_grad_norm: 1
|
| 180 |
-
align_max_steps: null
|
| 181 |
-
align_per_device_batch_size: 16
|
| 182 |
-
align_save_every_n_steps: 10000
|
| 183 |
-
align_train_strategy: fsdp-shard-grad-op
|
| 184 |
-
align_warmup_ratio: 0.03
|
| 185 |
-
align_weight_decay: 0
|
| 186 |
-
arch_specifier: no-align+fused-gelu-mlp
|
| 187 |
-
enable_gradient_checkpointing: true
|
| 188 |
-
enable_mixed_precision_training: true
|
| 189 |
-
finetune_epochs: 2
|
| 190 |
-
finetune_global_batch_size: 128
|
| 191 |
-
finetune_learning_rate: 2e-05
|
| 192 |
-
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
| 193 |
-
finetune_max_grad_norm: 1
|
| 194 |
-
finetune_max_steps: null
|
| 195 |
-
finetune_per_device_batch_size: 8
|
| 196 |
-
finetune_save_every_n_steps: 10000
|
| 197 |
-
finetune_train_strategy: fsdp-full-shard
|
| 198 |
-
finetune_warmup_ratio: 0.03
|
| 199 |
-
finetune_weight_decay: 0.1
|
| 200 |
-
image_resize_strategy: resize-naive
|
| 201 |
-
image_sequence_len: 1
|
| 202 |
-
llm_backbone_id: qwen25-3b-extra
|
| 203 |
-
llm_max_length: 32768
|
| 204 |
-
model_id: prism-qwen25-extra-dinosiglip-224px+3b
|
| 205 |
-
reduce_in_full_precision: false
|
| 206 |
-
type: prism-qwen25-extra-dinosiglip-224px+3b
|
| 207 |
-
vision_backbone_id: dinosiglip-vit-so-224px
|
| 208 |
-
pretrained_checkpoint:
|
| 209 |
-
value: null
|
| 210 |
-
run_id:
|
| 211 |
-
value: prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7
|
| 212 |
-
run_root_dir:
|
| 213 |
-
value: runs
|
| 214 |
-
seed:
|
| 215 |
-
value: 7
|
| 216 |
-
stage:
|
| 217 |
-
value: finetune
|
| 218 |
-
trackers:
|
| 219 |
-
value:
|
| 220 |
-
- jsonl
|
| 221 |
-
- wandb
|
| 222 |
-
wandb_entity:
|
| 223 |
-
value: null
|
| 224 |
-
wandb_project:
|
| 225 |
-
value: prismatic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/files/output.log
DELETED
|
@@ -1,77 +0,0 @@
|
|
| 1 |
-
[2;36m08/09 [00:09:16][0m[2;36m [0m[34mINFO [0m | >> [1m[[0m*[1m][0m Starting Training Loop ]8;id=176211;file:///fsx/byungjun/openvla-mini/scripts/pretrain.py\[2mpretrain.py[0m]8;;\[2m:[0m]8;id=640595;file:///fsx/byungjun/openvla-mini/scripts/pretrain.py#231\[2m231[0m]8;;\
|
| 2 |
-
Traceback (most recent call last):
|
| 3 |
-
File "/fsx/byungjun/openvla-mini/scripts/pretrain.py", line 245, in <module>
|
| 4 |
-
pretrain()
|
| 5 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
|
| 6 |
-
response = fn(cfg, *args, **kwargs)
|
| 7 |
-
File "/fsx/byungjun/openvla-mini/scripts/pretrain.py", line 232, in pretrain
|
| 8 |
-
train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
|
| 9 |
-
File "/fsx/byungjun/openvla-mini/prismatic/training/strategies/base_strategy.py", line 190, in run_training
|
| 10 |
-
output: CausalLMOutputWithPast = self.vlm(
|
| 11 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 12 |
-
return self._call_impl(*args, **kwargs)
|
| 13 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 14 |
-
return forward_call(*args, **kwargs)
|
| 15 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
| 16 |
-
output = self._fsdp_wrapped_module(*args, **kwargs)
|
| 17 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 18 |
-
return self._call_impl(*args, **kwargs)
|
| 19 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 20 |
-
return forward_call(*args, **kwargs)
|
| 21 |
-
File "/fsx/byungjun/openvla-mini/prismatic/models/vlms/prismatic.py", line 470, in forward
|
| 22 |
-
return self.llm_backbone(
|
| 23 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 24 |
-
return self._call_impl(*args, **kwargs)
|
| 25 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 26 |
-
return forward_call(*args, **kwargs)
|
| 27 |
-
File "/fsx/byungjun/openvla-mini/prismatic/models/backbones/llm/base_llm.py", line 222, in forward
|
| 28 |
-
output: CausalLMOutputWithPast = self.llm(
|
| 29 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 30 |
-
return self._call_impl(*args, **kwargs)
|
| 31 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 32 |
-
return forward_call(*args, **kwargs)
|
| 33 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1169, in forward
|
| 34 |
-
outputs = self.model(
|
| 35 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 36 |
-
return self._call_impl(*args, **kwargs)
|
| 37 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 38 |
-
return forward_call(*args, **kwargs)
|
| 39 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 1054, in forward
|
| 40 |
-
layer_outputs = decoder_layer(
|
| 41 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 42 |
-
return self._call_impl(*args, **kwargs)
|
| 43 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 44 |
-
return forward_call(*args, **kwargs)
|
| 45 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 849, in forward
|
| 46 |
-
output = self._fsdp_wrapped_module(*args, **kwargs)
|
| 47 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 48 |
-
return self._call_impl(*args, **kwargs)
|
| 49 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 50 |
-
return forward_call(*args, **kwargs)
|
| 51 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 168, in forward
|
| 52 |
-
return self.checkpoint_fn( # type: ignore[misc]
|
| 53 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/_compile.py", line 24, in inner
|
| 54 |
-
return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
|
| 55 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
|
| 56 |
-
return fn(*args, **kwargs)
|
| 57 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/_dynamo/external_utils.py", line 17, in inner
|
| 58 |
-
return fn(*args, **kwargs)
|
| 59 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 489, in checkpoint
|
| 60 |
-
ret = function(*args, **kwargs)
|
| 61 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 62 |
-
return self._call_impl(*args, **kwargs)
|
| 63 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 64 |
-
return forward_call(*args, **kwargs)
|
| 65 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 765, in forward
|
| 66 |
-
hidden_states = self.input_layernorm(hidden_states)
|
| 67 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|
| 68 |
-
return self._call_impl(*args, **kwargs)
|
| 69 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|
| 70 |
-
return forward_call(*args, **kwargs)
|
| 71 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 89, in forward
|
| 72 |
-
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
|
| 73 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 1094, in pack_hook
|
| 74 |
-
with torch.no_grad():
|
| 75 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 76, in __init__
|
| 76 |
-
super().__init__()
|
| 77 |
-
KeyboardInterrupt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/files/requirements.txt
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
nvidia-nvtx-cu12==12.1.105
|
| 2 |
-
kiwisolver==1.4.8
|
| 3 |
-
contourpy==1.3.2
|
| 4 |
-
nvidia-cudnn-cu12==8.9.2.26
|
| 5 |
-
tokenizers==0.19.1
|
| 6 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 7 |
-
triton==2.2.0
|
| 8 |
-
hf-xet==1.1.7
|
| 9 |
-
mkl-service==2.4.0
|
| 10 |
-
mkl_random==1.2.8
|
| 11 |
-
pycparser==2.21
|
| 12 |
-
ml-dtypes==0.2.0
|
| 13 |
-
tensorflow==2.15.0
|
| 14 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 15 |
-
pyasn1_modules==0.4.2
|
| 16 |
-
numpy==1.26.4
|
| 17 |
-
numpy==2.0.1
|
| 18 |
-
mypy_extensions==1.1.0
|
| 19 |
-
mkl_fft==1.3.11
|
| 20 |
-
mdurl==0.1.2
|
| 21 |
-
flash-attn==2.5.5
|
| 22 |
-
six==1.17.0
|
| 23 |
-
zipp==3.23.0
|
| 24 |
-
dlimp==0.0.1
|
| 25 |
-
json-numpy==2.1.1
|
| 26 |
-
PySocks==1.7.1
|
| 27 |
-
cffi==1.17.1
|
| 28 |
-
Werkzeug==3.1.3
|
| 29 |
-
rsa==4.9.1
|
| 30 |
-
packaging==25.0
|
| 31 |
-
draccus==0.8.0
|
| 32 |
-
typing-inspection==0.4.1
|
| 33 |
-
Markdown==3.8.2
|
| 34 |
-
wandb==0.21.1
|
| 35 |
-
trimesh==4.7.1
|
| 36 |
-
Pygments==2.19.2
|
| 37 |
-
pillow==11.3.0
|
| 38 |
-
libclang==18.1.1
|
| 39 |
-
typing-inspect==0.9.0
|
| 40 |
-
attrs==25.3.0
|
| 41 |
-
scipy==1.15.3
|
| 42 |
-
scipy==1.11.2
|
| 43 |
-
wrapt==1.14.1
|
| 44 |
-
safetensors==0.6.2
|
| 45 |
-
nvidia-curand-cu12==10.3.2.106
|
| 46 |
-
etils==1.13.0
|
| 47 |
-
OpenEXR==3.3.5
|
| 48 |
-
smmap==5.0.2
|
| 49 |
-
sentencepiece==0.1.99
|
| 50 |
-
pyparsing==3.2.3
|
| 51 |
-
astunparse==1.6.3
|
| 52 |
-
opt_einsum==3.4.0
|
| 53 |
-
tensorflow-graphics==2021.12.3
|
| 54 |
-
fsspec==2025.7.0
|
| 55 |
-
sympy==1.13.3
|
| 56 |
-
timm==0.9.10
|
| 57 |
-
pydantic==2.11.7
|
| 58 |
-
tensorboard==2.15.2
|
| 59 |
-
brotlicffi==1.0.9.2
|
| 60 |
-
torch==2.2.0
|
| 61 |
-
flatbuffers==25.2.10
|
| 62 |
-
filelock==3.17.0
|
| 63 |
-
click==8.2.1
|
| 64 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 65 |
-
ninja==1.11.1.4
|
| 66 |
-
typeguard==2.13.3
|
| 67 |
-
nvidia-nccl-cu12==2.19.3
|
| 68 |
-
openvla==0.0.3
|
| 69 |
-
MarkupSafe==3.0.2
|
| 70 |
-
rich==14.1.0
|
| 71 |
-
nvidia-nvjitlink-cu12==12.9.86
|
| 72 |
-
tensorflow-datasets==4.9.3
|
| 73 |
-
tensorflow-io-gcs-filesystem==0.37.1
|
| 74 |
-
networkx==3.4.2
|
| 75 |
-
huggingface-hub==0.34.4
|
| 76 |
-
absl-py==2.3.1
|
| 77 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 78 |
-
torchaudio==2.2.0
|
| 79 |
-
gmpy2==2.2.1
|
| 80 |
-
array_record==0.7.2
|
| 81 |
-
tensorflow-addons==0.23.0
|
| 82 |
-
oauthlib==3.3.1
|
| 83 |
-
PyYAML==6.0.2
|
| 84 |
-
regex==2025.7.34
|
| 85 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 86 |
-
setuptools==78.1.1
|
| 87 |
-
toml==0.10.2
|
| 88 |
-
google-auth==2.40.3
|
| 89 |
-
certifi==2025.8.3
|
| 90 |
-
keras==2.15.0
|
| 91 |
-
torchvision==0.17.0
|
| 92 |
-
grpcio==1.74.0
|
| 93 |
-
fonttools==4.59.0
|
| 94 |
-
transformers==4.40.1
|
| 95 |
-
annotated-types==0.7.0
|
| 96 |
-
charset-normalizer==3.3.2
|
| 97 |
-
promise==2.3
|
| 98 |
-
mergedeep==1.3.4
|
| 99 |
-
gast==0.6.0
|
| 100 |
-
cachetools==5.5.2
|
| 101 |
-
termcolor==3.1.0
|
| 102 |
-
pyyaml-include==1.4.1
|
| 103 |
-
importlib_resources==6.5.2
|
| 104 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 105 |
-
h5py==3.14.0
|
| 106 |
-
python-dateutil==2.9.0.post0
|
| 107 |
-
peft==0.11.1
|
| 108 |
-
urllib3==2.5.0
|
| 109 |
-
einops==0.8.1
|
| 110 |
-
tensorflow-estimator==2.15.0
|
| 111 |
-
requests==2.32.4
|
| 112 |
-
psutil==7.0.0
|
| 113 |
-
requests-oauthlib==2.0.0
|
| 114 |
-
pip==25.1
|
| 115 |
-
markdown-it-py==3.0.0
|
| 116 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 117 |
-
idna==3.7
|
| 118 |
-
tqdm==4.67.1
|
| 119 |
-
dm-tree==0.1.9
|
| 120 |
-
gitdb==4.0.12
|
| 121 |
-
typing_extensions==4.12.2
|
| 122 |
-
matplotlib==3.10.5
|
| 123 |
-
accelerate==1.10.0
|
| 124 |
-
tensorflow-metadata==1.17.2
|
| 125 |
-
sentry-sdk==2.34.1
|
| 126 |
-
jsonlines==4.0.0
|
| 127 |
-
protobuf==4.21.12
|
| 128 |
-
pyasn1==0.6.1
|
| 129 |
-
google-pasta==0.2.0
|
| 130 |
-
mpmath==1.3.0
|
| 131 |
-
Jinja2==3.1.6
|
| 132 |
-
tensorboard-data-server==0.7.2
|
| 133 |
-
pydantic_core==2.33.2
|
| 134 |
-
google-auth-oauthlib==1.2.2
|
| 135 |
-
cycler==0.12.1
|
| 136 |
-
platformdirs==4.3.8
|
| 137 |
-
GitPython==3.1.45
|
| 138 |
-
wheel==0.45.1
|
| 139 |
-
backports.tarfile==1.2.0
|
| 140 |
-
jaraco.collections==5.1.0
|
| 141 |
-
autocommand==2.2.2
|
| 142 |
-
typeguard==4.3.0
|
| 143 |
-
tomli==2.0.1
|
| 144 |
-
importlib_metadata==8.0.0
|
| 145 |
-
platformdirs==4.2.2
|
| 146 |
-
wheel==0.45.1
|
| 147 |
-
more-itertools==10.3.0
|
| 148 |
-
inflect==7.3.1
|
| 149 |
-
jaraco.context==5.3.0
|
| 150 |
-
typing_extensions==4.12.2
|
| 151 |
-
jaraco.functools==4.0.1
|
| 152 |
-
packaging==24.2
|
| 153 |
-
zipp==3.19.2
|
| 154 |
-
jaraco.text==3.12.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/files/wandb-metadata.json
DELETED
|
@@ -1,149 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"os": "Linux-6.8.0-1028-aws-x86_64-with-glibc2.35",
|
| 3 |
-
"python": "CPython 3.10.18",
|
| 4 |
-
"startedAt": "2025-08-09T00:09:14.729065Z",
|
| 5 |
-
"args": [
|
| 6 |
-
"--model.type",
|
| 7 |
-
"prism-qwen25-extra-dinosiglip-224px+3b",
|
| 8 |
-
"--model.finetune_per_device_batch_size",
|
| 9 |
-
"8"
|
| 10 |
-
],
|
| 11 |
-
"program": "/fsx/byungjun/openvla-mini/scripts/pretrain.py",
|
| 12 |
-
"codePath": "scripts/pretrain.py",
|
| 13 |
-
"codePathLocal": "scripts/pretrain.py",
|
| 14 |
-
"git": {
|
| 15 |
-
"remote": "https://github.com/happyhappy-jun/openvla-mini",
|
| 16 |
-
"commit": "0822b36227b5a771be4eb2680e34c559734c8fdc"
|
| 17 |
-
},
|
| 18 |
-
"email": "bjyoon513@gmail.com",
|
| 19 |
-
"root": "runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7",
|
| 20 |
-
"host": "compute-st-kait-gpu-2",
|
| 21 |
-
"executable": "/fsx/byungjun/miniconda3/envs/minivla/bin/python3.10",
|
| 22 |
-
"cpu_count": 96,
|
| 23 |
-
"cpu_count_logical": 192,
|
| 24 |
-
"gpu": "NVIDIA H200",
|
| 25 |
-
"gpu_count": 8,
|
| 26 |
-
"disk": {
|
| 27 |
-
"/": {
|
| 28 |
-
"total": "520120602624",
|
| 29 |
-
"used": "64347447296"
|
| 30 |
-
}
|
| 31 |
-
},
|
| 32 |
-
"memory": {
|
| 33 |
-
"total": "2147425312768"
|
| 34 |
-
},
|
| 35 |
-
"gpu_nvidia": [
|
| 36 |
-
{
|
| 37 |
-
"name": "NVIDIA H200",
|
| 38 |
-
"memoryTotal": "150754820096",
|
| 39 |
-
"cudaCores": 16896,
|
| 40 |
-
"architecture": "Hopper",
|
| 41 |
-
"uuid": "GPU-95044091-c6a6-4e9d-26a3-0249feeaf796"
|
| 42 |
-
},
|
| 43 |
-
{
|
| 44 |
-
"name": "NVIDIA H200",
|
| 45 |
-
"memoryTotal": "150754820096",
|
| 46 |
-
"cudaCores": 16896,
|
| 47 |
-
"architecture": "Hopper",
|
| 48 |
-
"uuid": "GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "NVIDIA H200",
|
| 52 |
-
"memoryTotal": "150754820096",
|
| 53 |
-
"cudaCores": 16896,
|
| 54 |
-
"architecture": "Hopper",
|
| 55 |
-
"uuid": "GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645"
|
| 56 |
-
},
|
| 57 |
-
{
|
| 58 |
-
"name": "NVIDIA H200",
|
| 59 |
-
"memoryTotal": "150754820096",
|
| 60 |
-
"cudaCores": 16896,
|
| 61 |
-
"architecture": "Hopper",
|
| 62 |
-
"uuid": "GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da"
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"name": "NVIDIA H200",
|
| 66 |
-
"memoryTotal": "150754820096",
|
| 67 |
-
"cudaCores": 16896,
|
| 68 |
-
"architecture": "Hopper",
|
| 69 |
-
"uuid": "GPU-0245a021-19ca-991a-61b0-94cbc116d182"
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"name": "NVIDIA H200",
|
| 73 |
-
"memoryTotal": "150754820096",
|
| 74 |
-
"cudaCores": 16896,
|
| 75 |
-
"architecture": "Hopper",
|
| 76 |
-
"uuid": "GPU-4213a83d-27d3-97d3-0cec-f9700637d48c"
|
| 77 |
-
},
|
| 78 |
-
{
|
| 79 |
-
"name": "NVIDIA H200",
|
| 80 |
-
"memoryTotal": "150754820096",
|
| 81 |
-
"cudaCores": 16896,
|
| 82 |
-
"architecture": "Hopper",
|
| 83 |
-
"uuid": "GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f"
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"name": "NVIDIA H200",
|
| 87 |
-
"memoryTotal": "150754820096",
|
| 88 |
-
"cudaCores": 16896,
|
| 89 |
-
"architecture": "Hopper",
|
| 90 |
-
"uuid": "GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7"
|
| 91 |
-
}
|
| 92 |
-
],
|
| 93 |
-
"cudaVersion": "12.8",
|
| 94 |
-
"slurm": {
|
| 95 |
-
"cluster_name": "kait-gpu-06-parallelcluster",
|
| 96 |
-
"conf": "/opt/slurm/etc/slurm.conf",
|
| 97 |
-
"cpu_bind": "quiet,mask_cpu:0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000",
|
| 98 |
-
"cpu_bind_list": "0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000",
|
| 99 |
-
"cpu_bind_type": "mask_cpu:",
|
| 100 |
-
"cpu_bind_verbose": "quiet",
|
| 101 |
-
"cpus_on_node": "128",
|
| 102 |
-
"gpus": "8",
|
| 103 |
-
"gpus_on_node": "8",
|
| 104 |
-
"gtids": "0",
|
| 105 |
-
"job_cpus_per_node": "128",
|
| 106 |
-
"job_end_time": "1786233065",
|
| 107 |
-
"job_gid": "1004",
|
| 108 |
-
"job_group": "byungjun",
|
| 109 |
-
"job_id": "527",
|
| 110 |
-
"job_name": "bash",
|
| 111 |
-
"job_nodelist": "compute-st-kait-gpu-2",
|
| 112 |
-
"job_num_nodes": "1",
|
| 113 |
-
"job_partition": "batch2",
|
| 114 |
-
"job_start_time": "1754697065",
|
| 115 |
-
"job_uid": "1004",
|
| 116 |
-
"job_user": "byungjun",
|
| 117 |
-
"jobid": "527",
|
| 118 |
-
"launch_node_ipaddr": "10.10.47.245",
|
| 119 |
-
"localid": "0",
|
| 120 |
-
"nnodes": "1",
|
| 121 |
-
"nodeid": "0",
|
| 122 |
-
"nodelist": "compute-st-kait-gpu-2",
|
| 123 |
-
"nprocs": "1",
|
| 124 |
-
"ntasks": "1",
|
| 125 |
-
"prio_process": "0",
|
| 126 |
-
"procid": "0",
|
| 127 |
-
"pty_port": "36537",
|
| 128 |
-
"pty_win_col": "362",
|
| 129 |
-
"pty_win_row": "84",
|
| 130 |
-
"srun_comm_host": "10.10.47.245",
|
| 131 |
-
"srun_comm_port": "45601",
|
| 132 |
-
"step_gpus": "0,1,2,3,4,5,6,7",
|
| 133 |
-
"step_id": "0",
|
| 134 |
-
"step_launcher_port": "45601",
|
| 135 |
-
"step_nodelist": "compute-st-kait-gpu-2",
|
| 136 |
-
"step_num_nodes": "1",
|
| 137 |
-
"step_num_tasks": "1",
|
| 138 |
-
"step_tasks_per_node": "1",
|
| 139 |
-
"stepid": "0",
|
| 140 |
-
"submit_dir": "/fsx/byungjun/openvla-mini",
|
| 141 |
-
"submit_host": "ip-10-10-47-245",
|
| 142 |
-
"task_pid": "299864",
|
| 143 |
-
"tasks_per_node": "1",
|
| 144 |
-
"topology_addr": "compute-st-kait-gpu-2",
|
| 145 |
-
"topology_addr_pattern": "node",
|
| 146 |
-
"umask": "0002"
|
| 147 |
-
},
|
| 148 |
-
"writerId": "3edib02cci947rvwid7aggk7s2h7y6r3"
|
| 149 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/files/wandb-summary.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"_wandb":{"runtime":236},"_runtime":236.322617231,"Finetune/Step":167,"Finetune/Loss":1.3435916900634766,"Finetune/Learning Rate":1.0739549839228296e-05,"_step":167,"Finetune/Step Time":1.3200225681066513,"Finetune/Loss (Raw)":1.0089986324310303,"_timestamp":1.7546983897202375e+09}
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/logs/debug-core.log
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:09:14.998998464Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmppz5qe2qs/port-310696.txt","pid":310696,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
-
{"time":"2025-08-09T00:09:15.000519302Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":310696}
|
| 3 |
-
{"time":"2025-08-09T00:09:15.000521507Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-310696-312789-804370854/socket","Net":"unix"}}
|
| 4 |
-
{"time":"2025-08-09T00:09:15.058183028Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
-
{"time":"2025-08-09T00:09:15.071139886Z","level":"INFO","msg":"handleInformInit: received","streamId":"ttc1ybny","id":"1(@)"}
|
| 6 |
-
{"time":"2025-08-09T00:09:15.519038972Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ttc1ybny","id":"1(@)"}
|
| 7 |
-
{"time":"2025-08-09T00:13:12.129122821Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
-
{"time":"2025-08-09T00:13:12.130510275Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
-
{"time":"2025-08-09T00:13:12.130613294Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-310696-312789-804370854/socket","Net":"unix"}}
|
| 10 |
-
{"time":"2025-08-09T00:13:12.130494733Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 11 |
-
{"time":"2025-08-09T00:13:12.130698195Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 12 |
-
{"time":"2025-08-09T00:13:12.944751942Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
-
{"time":"2025-08-09T00:13:12.944772087Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
-
{"time":"2025-08-09T00:13:12.944783918Z","level":"INFO","msg":"server is closed"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/logs/debug-internal.log
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:09:15.073794487Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
|
| 2 |
-
{"time":"2025-08-09T00:09:15.518973292Z","level":"INFO","msg":"stream: created new stream","id":"ttc1ybny"}
|
| 3 |
-
{"time":"2025-08-09T00:09:15.519031018Z","level":"INFO","msg":"stream: started","id":"ttc1ybny"}
|
| 4 |
-
{"time":"2025-08-09T00:09:15.519059671Z","level":"INFO","msg":"writer: started","stream_id":"ttc1ybny"}
|
| 5 |
-
{"time":"2025-08-09T00:09:15.519071384Z","level":"INFO","msg":"handler: started","stream_id":"ttc1ybny"}
|
| 6 |
-
{"time":"2025-08-09T00:09:15.519090603Z","level":"INFO","msg":"sender: started","stream_id":"ttc1ybny"}
|
| 7 |
-
{"time":"2025-08-09T00:13:12.130505394Z","level":"INFO","msg":"stream: closing","id":"ttc1ybny"}
|
| 8 |
-
{"time":"2025-08-09T00:13:12.667114203Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
-
{"time":"2025-08-09T00:13:12.930183192Z","level":"INFO","msg":"handler: closed","stream_id":"ttc1ybny"}
|
| 10 |
-
{"time":"2025-08-09T00:13:12.931064677Z","level":"INFO","msg":"sender: closed","stream_id":"ttc1ybny"}
|
| 11 |
-
{"time":"2025-08-09T00:13:12.931082577Z","level":"INFO","msg":"stream: closed","id":"ttc1ybny"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/logs/debug.log
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
2025-08-09 00:09:14,807 INFO MainThread:310696 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1
|
| 2 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_setup.py:_flush():80] Configure stats pid to 310696
|
| 3 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/.config/wandb/settings
|
| 4 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/openvla-mini/wandb/settings
|
| 5 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_init.py:setup_run_log_directory():703] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7/wandb/run-20250809_000914-ttc1ybny/logs/debug.log
|
| 7 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7/wandb/run-20250809_000914-ttc1ybny/logs/debug-internal.log
|
| 8 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_init.py:init():830] calling init triggers
|
| 9 |
-
2025-08-09 00:09:14,808 INFO MainThread:310696 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
|
| 10 |
-
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+3b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+3b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-3b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'image_sequence_len': 1, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 8, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': 'data2'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None, '_wandb': {}}
|
| 11 |
-
2025-08-09 00:09:14,809 INFO MainThread:310696 [wandb_init.py:init():871] starting backend
|
| 12 |
-
2025-08-09 00:09:15,058 INFO MainThread:310696 [wandb_init.py:init():874] sending inform_init request
|
| 13 |
-
2025-08-09 00:09:15,069 INFO MainThread:310696 [wandb_init.py:init():882] backend started and connected
|
| 14 |
-
2025-08-09 00:09:15,073 INFO MainThread:310696 [wandb_init.py:init():953] updated telemetry
|
| 15 |
-
2025-08-09 00:09:15,109 INFO MainThread:310696 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
|
| 16 |
-
2025-08-09 00:09:15,806 INFO MainThread:310696 [wandb_init.py:init():1029] starting run threads in backend
|
| 17 |
-
2025-08-09 00:09:16,420 INFO MainThread:310696 [wandb_run.py:_console_start():2494] atexit reg
|
| 18 |
-
2025-08-09 00:09:16,420 INFO MainThread:310696 [wandb_run.py:_redirect():2342] redirect: wrap_raw
|
| 19 |
-
2025-08-09 00:09:16,420 INFO MainThread:310696 [wandb_run.py:_redirect():2411] Wrapping output streams.
|
| 20 |
-
2025-08-09 00:09:16,420 INFO MainThread:310696 [wandb_run.py:_redirect():2434] Redirects installed.
|
| 21 |
-
2025-08-09 00:09:16,435 INFO MainThread:310696 [wandb_init.py:init():1075] run started, returning control to user process
|
| 22 |
-
2025-08-09 00:13:12,126 INFO MsgRouterThr:310696 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_000914-ttc1ybny/run-ttc1ybny.wandb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d9f44546feb9e553b748a1f2a2066c2d29d824020c23458a2542027b9fbd9335
|
| 3 |
-
size 317821
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/files/config.yaml
DELETED
|
@@ -1,225 +0,0 @@
|
|
| 1 |
-
_wandb:
|
| 2 |
-
value:
|
| 3 |
-
cli_version: 0.21.1
|
| 4 |
-
e:
|
| 5 |
-
vllu22bcqlllmyuwxuzw6uvs6d6got8z:
|
| 6 |
-
args:
|
| 7 |
-
- --model.type
|
| 8 |
-
- prism-qwen25-extra-dinosiglip-224px+7b
|
| 9 |
-
- --model.finetune_per_device_batch_size
|
| 10 |
-
- "8"
|
| 11 |
-
codePath: scripts/pretrain.py
|
| 12 |
-
codePathLocal: scripts/pretrain.py
|
| 13 |
-
cpu_count: 96
|
| 14 |
-
cpu_count_logical: 192
|
| 15 |
-
cudaVersion: "12.8"
|
| 16 |
-
disk:
|
| 17 |
-
/:
|
| 18 |
-
total: "520120602624"
|
| 19 |
-
used: "64347758592"
|
| 20 |
-
email: bjyoon513@gmail.com
|
| 21 |
-
executable: /fsx/byungjun/miniconda3/envs/minivla/bin/python3.10
|
| 22 |
-
git:
|
| 23 |
-
commit: 1441372f4af4f91d0e99c9a104d10536d8ad566d
|
| 24 |
-
remote: https://github.com/happyhappy-jun/openvla-mini
|
| 25 |
-
gpu: NVIDIA H200
|
| 26 |
-
gpu_count: 8
|
| 27 |
-
gpu_nvidia:
|
| 28 |
-
- architecture: Hopper
|
| 29 |
-
cudaCores: 16896
|
| 30 |
-
memoryTotal: "150754820096"
|
| 31 |
-
name: NVIDIA H200
|
| 32 |
-
uuid: GPU-95044091-c6a6-4e9d-26a3-0249feeaf796
|
| 33 |
-
- architecture: Hopper
|
| 34 |
-
cudaCores: 16896
|
| 35 |
-
memoryTotal: "150754820096"
|
| 36 |
-
name: NVIDIA H200
|
| 37 |
-
uuid: GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec
|
| 38 |
-
- architecture: Hopper
|
| 39 |
-
cudaCores: 16896
|
| 40 |
-
memoryTotal: "150754820096"
|
| 41 |
-
name: NVIDIA H200
|
| 42 |
-
uuid: GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645
|
| 43 |
-
- architecture: Hopper
|
| 44 |
-
cudaCores: 16896
|
| 45 |
-
memoryTotal: "150754820096"
|
| 46 |
-
name: NVIDIA H200
|
| 47 |
-
uuid: GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da
|
| 48 |
-
- architecture: Hopper
|
| 49 |
-
cudaCores: 16896
|
| 50 |
-
memoryTotal: "150754820096"
|
| 51 |
-
name: NVIDIA H200
|
| 52 |
-
uuid: GPU-0245a021-19ca-991a-61b0-94cbc116d182
|
| 53 |
-
- architecture: Hopper
|
| 54 |
-
cudaCores: 16896
|
| 55 |
-
memoryTotal: "150754820096"
|
| 56 |
-
name: NVIDIA H200
|
| 57 |
-
uuid: GPU-4213a83d-27d3-97d3-0cec-f9700637d48c
|
| 58 |
-
- architecture: Hopper
|
| 59 |
-
cudaCores: 16896
|
| 60 |
-
memoryTotal: "150754820096"
|
| 61 |
-
name: NVIDIA H200
|
| 62 |
-
uuid: GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f
|
| 63 |
-
- architecture: Hopper
|
| 64 |
-
cudaCores: 16896
|
| 65 |
-
memoryTotal: "150754820096"
|
| 66 |
-
name: NVIDIA H200
|
| 67 |
-
uuid: GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7
|
| 68 |
-
host: compute-st-kait-gpu-2
|
| 69 |
-
memory:
|
| 70 |
-
total: "2147425312768"
|
| 71 |
-
os: Linux-6.8.0-1028-aws-x86_64-with-glibc2.35
|
| 72 |
-
program: /fsx/byungjun/openvla-mini/scripts/pretrain.py
|
| 73 |
-
python: CPython 3.10.18
|
| 74 |
-
root: runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7
|
| 75 |
-
slurm:
|
| 76 |
-
cluster_name: kait-gpu-06-parallelcluster
|
| 77 |
-
conf: /opt/slurm/etc/slurm.conf
|
| 78 |
-
cpu_bind: quiet,mask_cpu:0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000
|
| 79 |
-
cpu_bind_list: 0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000
|
| 80 |
-
cpu_bind_type: 'mask_cpu:'
|
| 81 |
-
cpu_bind_verbose: quiet
|
| 82 |
-
cpus_on_node: "128"
|
| 83 |
-
gpus: "8"
|
| 84 |
-
gpus_on_node: "8"
|
| 85 |
-
gtids: "0"
|
| 86 |
-
job_cpus_per_node: "128"
|
| 87 |
-
job_end_time: "1786233065"
|
| 88 |
-
job_gid: "1004"
|
| 89 |
-
job_group: byungjun
|
| 90 |
-
job_id: "527"
|
| 91 |
-
job_name: bash
|
| 92 |
-
job_nodelist: compute-st-kait-gpu-2
|
| 93 |
-
job_num_nodes: "1"
|
| 94 |
-
job_partition: batch2
|
| 95 |
-
job_start_time: "1754697065"
|
| 96 |
-
job_uid: "1004"
|
| 97 |
-
job_user: byungjun
|
| 98 |
-
jobid: "527"
|
| 99 |
-
launch_node_ipaddr: 10.10.47.245
|
| 100 |
-
localid: "0"
|
| 101 |
-
nnodes: "1"
|
| 102 |
-
nodeid: "0"
|
| 103 |
-
nodelist: compute-st-kait-gpu-2
|
| 104 |
-
nprocs: "1"
|
| 105 |
-
ntasks: "1"
|
| 106 |
-
prio_process: "0"
|
| 107 |
-
procid: "0"
|
| 108 |
-
pty_port: "36537"
|
| 109 |
-
pty_win_col: "362"
|
| 110 |
-
pty_win_row: "84"
|
| 111 |
-
srun_comm_host: 10.10.47.245
|
| 112 |
-
srun_comm_port: "45601"
|
| 113 |
-
step_gpus: 0,1,2,3,4,5,6,7
|
| 114 |
-
step_id: "0"
|
| 115 |
-
step_launcher_port: "45601"
|
| 116 |
-
step_nodelist: compute-st-kait-gpu-2
|
| 117 |
-
step_num_nodes: "1"
|
| 118 |
-
step_num_tasks: "1"
|
| 119 |
-
step_tasks_per_node: "1"
|
| 120 |
-
stepid: "0"
|
| 121 |
-
submit_dir: /fsx/byungjun/openvla-mini
|
| 122 |
-
submit_host: ip-10-10-47-245
|
| 123 |
-
task_pid: "299864"
|
| 124 |
-
tasks_per_node: "1"
|
| 125 |
-
topology_addr: compute-st-kait-gpu-2
|
| 126 |
-
topology_addr_pattern: node
|
| 127 |
-
umask: "0002"
|
| 128 |
-
startedAt: "2025-08-09T00:28:17.772013Z"
|
| 129 |
-
writerId: vllu22bcqlllmyuwxuzw6uvs6d6got8z
|
| 130 |
-
m: []
|
| 131 |
-
python_version: 3.10.18
|
| 132 |
-
t:
|
| 133 |
-
"1":
|
| 134 |
-
- 1
|
| 135 |
-
- 2
|
| 136 |
-
- 3
|
| 137 |
-
- 11
|
| 138 |
-
- 41
|
| 139 |
-
- 49
|
| 140 |
-
- 63
|
| 141 |
-
- 71
|
| 142 |
-
"2":
|
| 143 |
-
- 1
|
| 144 |
-
- 2
|
| 145 |
-
- 3
|
| 146 |
-
- 11
|
| 147 |
-
- 41
|
| 148 |
-
- 49
|
| 149 |
-
- 63
|
| 150 |
-
- 71
|
| 151 |
-
"3":
|
| 152 |
-
- 13
|
| 153 |
-
- 16
|
| 154 |
-
- 61
|
| 155 |
-
"4": 3.10.18
|
| 156 |
-
"5": 0.21.1
|
| 157 |
-
"6": 4.40.1
|
| 158 |
-
"12": 0.21.1
|
| 159 |
-
"13": linux-x86_64
|
| 160 |
-
dataset:
|
| 161 |
-
value:
|
| 162 |
-
align_stage_components:
|
| 163 |
-
- download/llava-laion-cc-sbu-558k/chat.json
|
| 164 |
-
- download/llava-laion-cc-sbu-558k
|
| 165 |
-
dataset_id: llava-v15
|
| 166 |
-
dataset_root_dir: data2
|
| 167 |
-
finetune_stage_components:
|
| 168 |
-
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
| 169 |
-
- download/llava-v1.5-instruct
|
| 170 |
-
type: llava-v15
|
| 171 |
-
hf_token:
|
| 172 |
-
value: .hf_token
|
| 173 |
-
model:
|
| 174 |
-
value:
|
| 175 |
-
align_epochs: 1
|
| 176 |
-
align_global_batch_size: 96
|
| 177 |
-
align_learning_rate: 0.001
|
| 178 |
-
align_lr_scheduler_type: linear-warmup+cosine-decay
|
| 179 |
-
align_max_grad_norm: 1
|
| 180 |
-
align_max_steps: null
|
| 181 |
-
align_per_device_batch_size: 16
|
| 182 |
-
align_save_every_n_steps: 10000
|
| 183 |
-
align_train_strategy: fsdp-shard-grad-op
|
| 184 |
-
align_warmup_ratio: 0.03
|
| 185 |
-
align_weight_decay: 0
|
| 186 |
-
arch_specifier: no-align+fused-gelu-mlp
|
| 187 |
-
enable_gradient_checkpointing: true
|
| 188 |
-
enable_mixed_precision_training: true
|
| 189 |
-
finetune_epochs: 2
|
| 190 |
-
finetune_global_batch_size: 128
|
| 191 |
-
finetune_learning_rate: 2e-05
|
| 192 |
-
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
| 193 |
-
finetune_max_grad_norm: 1
|
| 194 |
-
finetune_max_steps: null
|
| 195 |
-
finetune_per_device_batch_size: 8
|
| 196 |
-
finetune_save_every_n_steps: 10000
|
| 197 |
-
finetune_train_strategy: fsdp-full-shard
|
| 198 |
-
finetune_warmup_ratio: 0.03
|
| 199 |
-
finetune_weight_decay: 0.1
|
| 200 |
-
image_resize_strategy: resize-naive
|
| 201 |
-
image_sequence_len: 1
|
| 202 |
-
llm_backbone_id: qwen25-7b-extra
|
| 203 |
-
llm_max_length: 32768
|
| 204 |
-
model_id: prism-qwen25-extra-dinosiglip-224px+7b
|
| 205 |
-
reduce_in_full_precision: false
|
| 206 |
-
type: prism-qwen25-extra-dinosiglip-224px+7b
|
| 207 |
-
vision_backbone_id: dinosiglip-vit-so-224px
|
| 208 |
-
pretrained_checkpoint:
|
| 209 |
-
value: null
|
| 210 |
-
run_id:
|
| 211 |
-
value: prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7
|
| 212 |
-
run_root_dir:
|
| 213 |
-
value: runs
|
| 214 |
-
seed:
|
| 215 |
-
value: 7
|
| 216 |
-
stage:
|
| 217 |
-
value: finetune
|
| 218 |
-
trackers:
|
| 219 |
-
value:
|
| 220 |
-
- jsonl
|
| 221 |
-
- wandb
|
| 222 |
-
wandb_entity:
|
| 223 |
-
value: null
|
| 224 |
-
wandb_project:
|
| 225 |
-
value: prismatic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/files/output.log
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
[2;36m08/09 [00:28:19][0m[2;36m [0m[34mINFO [0m | >> [1m[[0m*[1m][0m Starting Training Loop ]8;id=176211;file:///fsx/byungjun/openvla-mini/scripts/pretrain.py\[2mpretrain.py[0m]8;;\[2m:[0m]8;id=640595;file:///fsx/byungjun/openvla-mini/scripts/pretrain.py#231\[2m231[0m]8;;\
|
| 2 |
-
Traceback (most recent call last):
|
| 3 |
-
File "/fsx/byungjun/openvla-mini/scripts/pretrain.py", line 245, in <module>
|
| 4 |
-
pretrain()
|
| 5 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/draccus/argparsing.py", line 203, in wrapper_inner
|
| 6 |
-
response = fn(cfg, *args, **kwargs)
|
| 7 |
-
File "/fsx/byungjun/openvla-mini/scripts/pretrain.py", line 232, in pretrain
|
| 8 |
-
train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
|
| 9 |
-
File "/fsx/byungjun/openvla-mini/prismatic/training/strategies/base_strategy.py", line 215, in run_training
|
| 10 |
-
normalized_loss.backward()
|
| 11 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/_tensor.py", line 522, in backward
|
| 12 |
-
torch.autograd.backward(
|
| 13 |
-
File "/fsx/byungjun/miniconda3/envs/minivla/lib/python3.10/site-packages/torch/autograd/__init__.py", line 266, in backward
|
| 14 |
-
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 15 |
-
KeyboardInterrupt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/files/requirements.txt
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
nvidia-nvtx-cu12==12.1.105
|
| 2 |
-
kiwisolver==1.4.8
|
| 3 |
-
contourpy==1.3.2
|
| 4 |
-
nvidia-cudnn-cu12==8.9.2.26
|
| 5 |
-
tokenizers==0.19.1
|
| 6 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 7 |
-
triton==2.2.0
|
| 8 |
-
hf-xet==1.1.7
|
| 9 |
-
mkl-service==2.4.0
|
| 10 |
-
mkl_random==1.2.8
|
| 11 |
-
pycparser==2.21
|
| 12 |
-
ml-dtypes==0.2.0
|
| 13 |
-
tensorflow==2.15.0
|
| 14 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 15 |
-
pyasn1_modules==0.4.2
|
| 16 |
-
numpy==1.26.4
|
| 17 |
-
numpy==2.0.1
|
| 18 |
-
mypy_extensions==1.1.0
|
| 19 |
-
mkl_fft==1.3.11
|
| 20 |
-
mdurl==0.1.2
|
| 21 |
-
flash-attn==2.5.5
|
| 22 |
-
six==1.17.0
|
| 23 |
-
zipp==3.23.0
|
| 24 |
-
dlimp==0.0.1
|
| 25 |
-
json-numpy==2.1.1
|
| 26 |
-
PySocks==1.7.1
|
| 27 |
-
cffi==1.17.1
|
| 28 |
-
Werkzeug==3.1.3
|
| 29 |
-
rsa==4.9.1
|
| 30 |
-
packaging==25.0
|
| 31 |
-
draccus==0.8.0
|
| 32 |
-
typing-inspection==0.4.1
|
| 33 |
-
Markdown==3.8.2
|
| 34 |
-
wandb==0.21.1
|
| 35 |
-
trimesh==4.7.1
|
| 36 |
-
Pygments==2.19.2
|
| 37 |
-
pillow==11.3.0
|
| 38 |
-
libclang==18.1.1
|
| 39 |
-
typing-inspect==0.9.0
|
| 40 |
-
attrs==25.3.0
|
| 41 |
-
scipy==1.15.3
|
| 42 |
-
scipy==1.11.2
|
| 43 |
-
wrapt==1.14.1
|
| 44 |
-
safetensors==0.6.2
|
| 45 |
-
nvidia-curand-cu12==10.3.2.106
|
| 46 |
-
etils==1.13.0
|
| 47 |
-
OpenEXR==3.3.5
|
| 48 |
-
smmap==5.0.2
|
| 49 |
-
sentencepiece==0.1.99
|
| 50 |
-
pyparsing==3.2.3
|
| 51 |
-
astunparse==1.6.3
|
| 52 |
-
opt_einsum==3.4.0
|
| 53 |
-
tensorflow-graphics==2021.12.3
|
| 54 |
-
fsspec==2025.7.0
|
| 55 |
-
sympy==1.13.3
|
| 56 |
-
timm==0.9.10
|
| 57 |
-
pydantic==2.11.7
|
| 58 |
-
tensorboard==2.15.2
|
| 59 |
-
brotlicffi==1.0.9.2
|
| 60 |
-
torch==2.2.0
|
| 61 |
-
flatbuffers==25.2.10
|
| 62 |
-
filelock==3.17.0
|
| 63 |
-
click==8.2.1
|
| 64 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 65 |
-
ninja==1.11.1.4
|
| 66 |
-
typeguard==2.13.3
|
| 67 |
-
nvidia-nccl-cu12==2.19.3
|
| 68 |
-
openvla==0.0.3
|
| 69 |
-
MarkupSafe==3.0.2
|
| 70 |
-
rich==14.1.0
|
| 71 |
-
nvidia-nvjitlink-cu12==12.9.86
|
| 72 |
-
tensorflow-datasets==4.9.3
|
| 73 |
-
tensorflow-io-gcs-filesystem==0.37.1
|
| 74 |
-
networkx==3.4.2
|
| 75 |
-
huggingface-hub==0.34.4
|
| 76 |
-
absl-py==2.3.1
|
| 77 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 78 |
-
torchaudio==2.2.0
|
| 79 |
-
gmpy2==2.2.1
|
| 80 |
-
array_record==0.7.2
|
| 81 |
-
tensorflow-addons==0.23.0
|
| 82 |
-
oauthlib==3.3.1
|
| 83 |
-
PyYAML==6.0.2
|
| 84 |
-
regex==2025.7.34
|
| 85 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 86 |
-
setuptools==78.1.1
|
| 87 |
-
toml==0.10.2
|
| 88 |
-
google-auth==2.40.3
|
| 89 |
-
certifi==2025.8.3
|
| 90 |
-
keras==2.15.0
|
| 91 |
-
torchvision==0.17.0
|
| 92 |
-
grpcio==1.74.0
|
| 93 |
-
fonttools==4.59.0
|
| 94 |
-
transformers==4.40.1
|
| 95 |
-
annotated-types==0.7.0
|
| 96 |
-
charset-normalizer==3.3.2
|
| 97 |
-
promise==2.3
|
| 98 |
-
mergedeep==1.3.4
|
| 99 |
-
gast==0.6.0
|
| 100 |
-
cachetools==5.5.2
|
| 101 |
-
termcolor==3.1.0
|
| 102 |
-
pyyaml-include==1.4.1
|
| 103 |
-
importlib_resources==6.5.2
|
| 104 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 105 |
-
h5py==3.14.0
|
| 106 |
-
python-dateutil==2.9.0.post0
|
| 107 |
-
peft==0.11.1
|
| 108 |
-
urllib3==2.5.0
|
| 109 |
-
einops==0.8.1
|
| 110 |
-
tensorflow-estimator==2.15.0
|
| 111 |
-
requests==2.32.4
|
| 112 |
-
psutil==7.0.0
|
| 113 |
-
requests-oauthlib==2.0.0
|
| 114 |
-
pip==25.1
|
| 115 |
-
markdown-it-py==3.0.0
|
| 116 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 117 |
-
idna==3.7
|
| 118 |
-
tqdm==4.67.1
|
| 119 |
-
dm-tree==0.1.9
|
| 120 |
-
gitdb==4.0.12
|
| 121 |
-
typing_extensions==4.12.2
|
| 122 |
-
matplotlib==3.10.5
|
| 123 |
-
accelerate==1.10.0
|
| 124 |
-
tensorflow-metadata==1.17.2
|
| 125 |
-
sentry-sdk==2.34.1
|
| 126 |
-
jsonlines==4.0.0
|
| 127 |
-
protobuf==4.21.12
|
| 128 |
-
pyasn1==0.6.1
|
| 129 |
-
google-pasta==0.2.0
|
| 130 |
-
mpmath==1.3.0
|
| 131 |
-
Jinja2==3.1.6
|
| 132 |
-
tensorboard-data-server==0.7.2
|
| 133 |
-
pydantic_core==2.33.2
|
| 134 |
-
google-auth-oauthlib==1.2.2
|
| 135 |
-
cycler==0.12.1
|
| 136 |
-
platformdirs==4.3.8
|
| 137 |
-
GitPython==3.1.45
|
| 138 |
-
wheel==0.45.1
|
| 139 |
-
backports.tarfile==1.2.0
|
| 140 |
-
jaraco.collections==5.1.0
|
| 141 |
-
autocommand==2.2.2
|
| 142 |
-
typeguard==4.3.0
|
| 143 |
-
tomli==2.0.1
|
| 144 |
-
importlib_metadata==8.0.0
|
| 145 |
-
platformdirs==4.2.2
|
| 146 |
-
wheel==0.45.1
|
| 147 |
-
more-itertools==10.3.0
|
| 148 |
-
inflect==7.3.1
|
| 149 |
-
jaraco.context==5.3.0
|
| 150 |
-
typing_extensions==4.12.2
|
| 151 |
-
jaraco.functools==4.0.1
|
| 152 |
-
packaging==24.2
|
| 153 |
-
zipp==3.19.2
|
| 154 |
-
jaraco.text==3.12.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/files/wandb-metadata.json
DELETED
|
@@ -1,149 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"os": "Linux-6.8.0-1028-aws-x86_64-with-glibc2.35",
|
| 3 |
-
"python": "CPython 3.10.18",
|
| 4 |
-
"startedAt": "2025-08-09T00:28:17.772013Z",
|
| 5 |
-
"args": [
|
| 6 |
-
"--model.type",
|
| 7 |
-
"prism-qwen25-extra-dinosiglip-224px+7b",
|
| 8 |
-
"--model.finetune_per_device_batch_size",
|
| 9 |
-
"8"
|
| 10 |
-
],
|
| 11 |
-
"program": "/fsx/byungjun/openvla-mini/scripts/pretrain.py",
|
| 12 |
-
"codePath": "scripts/pretrain.py",
|
| 13 |
-
"codePathLocal": "scripts/pretrain.py",
|
| 14 |
-
"git": {
|
| 15 |
-
"remote": "https://github.com/happyhappy-jun/openvla-mini",
|
| 16 |
-
"commit": "1441372f4af4f91d0e99c9a104d10536d8ad566d"
|
| 17 |
-
},
|
| 18 |
-
"email": "bjyoon513@gmail.com",
|
| 19 |
-
"root": "runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7",
|
| 20 |
-
"host": "compute-st-kait-gpu-2",
|
| 21 |
-
"executable": "/fsx/byungjun/miniconda3/envs/minivla/bin/python3.10",
|
| 22 |
-
"cpu_count": 96,
|
| 23 |
-
"cpu_count_logical": 192,
|
| 24 |
-
"gpu": "NVIDIA H200",
|
| 25 |
-
"gpu_count": 8,
|
| 26 |
-
"disk": {
|
| 27 |
-
"/": {
|
| 28 |
-
"total": "520120602624",
|
| 29 |
-
"used": "64347758592"
|
| 30 |
-
}
|
| 31 |
-
},
|
| 32 |
-
"memory": {
|
| 33 |
-
"total": "2147425312768"
|
| 34 |
-
},
|
| 35 |
-
"gpu_nvidia": [
|
| 36 |
-
{
|
| 37 |
-
"name": "NVIDIA H200",
|
| 38 |
-
"memoryTotal": "150754820096",
|
| 39 |
-
"cudaCores": 16896,
|
| 40 |
-
"architecture": "Hopper",
|
| 41 |
-
"uuid": "GPU-95044091-c6a6-4e9d-26a3-0249feeaf796"
|
| 42 |
-
},
|
| 43 |
-
{
|
| 44 |
-
"name": "NVIDIA H200",
|
| 45 |
-
"memoryTotal": "150754820096",
|
| 46 |
-
"cudaCores": 16896,
|
| 47 |
-
"architecture": "Hopper",
|
| 48 |
-
"uuid": "GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "NVIDIA H200",
|
| 52 |
-
"memoryTotal": "150754820096",
|
| 53 |
-
"cudaCores": 16896,
|
| 54 |
-
"architecture": "Hopper",
|
| 55 |
-
"uuid": "GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645"
|
| 56 |
-
},
|
| 57 |
-
{
|
| 58 |
-
"name": "NVIDIA H200",
|
| 59 |
-
"memoryTotal": "150754820096",
|
| 60 |
-
"cudaCores": 16896,
|
| 61 |
-
"architecture": "Hopper",
|
| 62 |
-
"uuid": "GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da"
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"name": "NVIDIA H200",
|
| 66 |
-
"memoryTotal": "150754820096",
|
| 67 |
-
"cudaCores": 16896,
|
| 68 |
-
"architecture": "Hopper",
|
| 69 |
-
"uuid": "GPU-0245a021-19ca-991a-61b0-94cbc116d182"
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"name": "NVIDIA H200",
|
| 73 |
-
"memoryTotal": "150754820096",
|
| 74 |
-
"cudaCores": 16896,
|
| 75 |
-
"architecture": "Hopper",
|
| 76 |
-
"uuid": "GPU-4213a83d-27d3-97d3-0cec-f9700637d48c"
|
| 77 |
-
},
|
| 78 |
-
{
|
| 79 |
-
"name": "NVIDIA H200",
|
| 80 |
-
"memoryTotal": "150754820096",
|
| 81 |
-
"cudaCores": 16896,
|
| 82 |
-
"architecture": "Hopper",
|
| 83 |
-
"uuid": "GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f"
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"name": "NVIDIA H200",
|
| 87 |
-
"memoryTotal": "150754820096",
|
| 88 |
-
"cudaCores": 16896,
|
| 89 |
-
"architecture": "Hopper",
|
| 90 |
-
"uuid": "GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7"
|
| 91 |
-
}
|
| 92 |
-
],
|
| 93 |
-
"cudaVersion": "12.8",
|
| 94 |
-
"slurm": {
|
| 95 |
-
"cluster_name": "kait-gpu-06-parallelcluster",
|
| 96 |
-
"conf": "/opt/slurm/etc/slurm.conf",
|
| 97 |
-
"cpu_bind": "quiet,mask_cpu:0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000",
|
| 98 |
-
"cpu_bind_list": "0xFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF00000000",
|
| 99 |
-
"cpu_bind_type": "mask_cpu:",
|
| 100 |
-
"cpu_bind_verbose": "quiet",
|
| 101 |
-
"cpus_on_node": "128",
|
| 102 |
-
"gpus": "8",
|
| 103 |
-
"gpus_on_node": "8",
|
| 104 |
-
"gtids": "0",
|
| 105 |
-
"job_cpus_per_node": "128",
|
| 106 |
-
"job_end_time": "1786233065",
|
| 107 |
-
"job_gid": "1004",
|
| 108 |
-
"job_group": "byungjun",
|
| 109 |
-
"job_id": "527",
|
| 110 |
-
"job_name": "bash",
|
| 111 |
-
"job_nodelist": "compute-st-kait-gpu-2",
|
| 112 |
-
"job_num_nodes": "1",
|
| 113 |
-
"job_partition": "batch2",
|
| 114 |
-
"job_start_time": "1754697065",
|
| 115 |
-
"job_uid": "1004",
|
| 116 |
-
"job_user": "byungjun",
|
| 117 |
-
"jobid": "527",
|
| 118 |
-
"launch_node_ipaddr": "10.10.47.245",
|
| 119 |
-
"localid": "0",
|
| 120 |
-
"nnodes": "1",
|
| 121 |
-
"nodeid": "0",
|
| 122 |
-
"nodelist": "compute-st-kait-gpu-2",
|
| 123 |
-
"nprocs": "1",
|
| 124 |
-
"ntasks": "1",
|
| 125 |
-
"prio_process": "0",
|
| 126 |
-
"procid": "0",
|
| 127 |
-
"pty_port": "36537",
|
| 128 |
-
"pty_win_col": "362",
|
| 129 |
-
"pty_win_row": "84",
|
| 130 |
-
"srun_comm_host": "10.10.47.245",
|
| 131 |
-
"srun_comm_port": "45601",
|
| 132 |
-
"step_gpus": "0,1,2,3,4,5,6,7",
|
| 133 |
-
"step_id": "0",
|
| 134 |
-
"step_launcher_port": "45601",
|
| 135 |
-
"step_nodelist": "compute-st-kait-gpu-2",
|
| 136 |
-
"step_num_nodes": "1",
|
| 137 |
-
"step_num_tasks": "1",
|
| 138 |
-
"step_tasks_per_node": "1",
|
| 139 |
-
"stepid": "0",
|
| 140 |
-
"submit_dir": "/fsx/byungjun/openvla-mini",
|
| 141 |
-
"submit_host": "ip-10-10-47-245",
|
| 142 |
-
"task_pid": "299864",
|
| 143 |
-
"tasks_per_node": "1",
|
| 144 |
-
"topology_addr": "compute-st-kait-gpu-2",
|
| 145 |
-
"topology_addr_pattern": "node",
|
| 146 |
-
"umask": "0002"
|
| 147 |
-
},
|
| 148 |
-
"writerId": "vllu22bcqlllmyuwxuzw6uvs6d6got8z"
|
| 149 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/files/wandb-summary.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"Finetune/Step Time":1.899375669658184,"_timestamp":1.7546996588573313e+09,"Finetune/Step":179,"_runtime":361.177352297,"_step":179,"Finetune/Loss":1.1296229362487793,"Finetune/Loss (Raw)":0.8316053152084351,"_wandb":{"runtime":361},"Finetune/Learning Rate":1.1511254019292605e-05}
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/logs/debug-core.log
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:28:18.119666003Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpyqnvs5s6/port-319874.txt","pid":319874,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
-
{"time":"2025-08-09T00:28:18.120962189Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":319874}
|
| 3 |
-
{"time":"2025-08-09T00:28:18.120959456Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-319874-326372-1479802229/socket","Net":"unix"}}
|
| 4 |
-
{"time":"2025-08-09T00:28:18.180753532Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
-
{"time":"2025-08-09T00:28:18.210476243Z","level":"INFO","msg":"handleInformInit: received","streamId":"g4nrjez0","id":"1(@)"}
|
| 6 |
-
{"time":"2025-08-09T00:28:18.565791506Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"g4nrjez0","id":"1(@)"}
|
| 7 |
-
{"time":"2025-08-09T00:34:20.084549801Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
-
{"time":"2025-08-09T00:34:20.086127609Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
-
{"time":"2025-08-09T00:34:20.08614172Z","level":"INFO","msg":"server is shutting down"}
|
| 10 |
-
{"time":"2025-08-09T00:34:20.086227672Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
-
{"time":"2025-08-09T00:34:20.08628093Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-319874-326372-1479802229/socket","Net":"unix"}}
|
| 12 |
-
{"time":"2025-08-09T00:34:20.962961869Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
-
{"time":"2025-08-09T00:34:20.962994307Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
-
{"time":"2025-08-09T00:34:20.96300497Z","level":"INFO","msg":"server is closed"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/logs/debug-internal.log
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:28:18.213209216Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
|
| 2 |
-
{"time":"2025-08-09T00:28:18.565722905Z","level":"INFO","msg":"stream: created new stream","id":"g4nrjez0"}
|
| 3 |
-
{"time":"2025-08-09T00:28:18.565783517Z","level":"INFO","msg":"stream: started","id":"g4nrjez0"}
|
| 4 |
-
{"time":"2025-08-09T00:28:18.565804516Z","level":"INFO","msg":"writer: started","stream_id":"g4nrjez0"}
|
| 5 |
-
{"time":"2025-08-09T00:28:18.56583991Z","level":"INFO","msg":"sender: started","stream_id":"g4nrjez0"}
|
| 6 |
-
{"time":"2025-08-09T00:28:18.565819921Z","level":"INFO","msg":"handler: started","stream_id":"g4nrjez0"}
|
| 7 |
-
{"time":"2025-08-09T00:34:20.086137698Z","level":"INFO","msg":"stream: closing","id":"g4nrjez0"}
|
| 8 |
-
{"time":"2025-08-09T00:34:20.688455806Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
-
{"time":"2025-08-09T00:34:20.949879505Z","level":"INFO","msg":"handler: closed","stream_id":"g4nrjez0"}
|
| 10 |
-
{"time":"2025-08-09T00:34:20.950937516Z","level":"INFO","msg":"sender: closed","stream_id":"g4nrjez0"}
|
| 11 |
-
{"time":"2025-08-09T00:34:20.950965207Z","level":"INFO","msg":"stream: closed","id":"g4nrjez0"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/logs/debug.log
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
2025-08-09 00:28:17,895 INFO MainThread:319874 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1
|
| 2 |
-
2025-08-09 00:28:17,897 INFO MainThread:319874 [wandb_setup.py:_flush():80] Configure stats pid to 319874
|
| 3 |
-
2025-08-09 00:28:17,897 INFO MainThread:319874 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/.config/wandb/settings
|
| 4 |
-
2025-08-09 00:28:17,897 INFO MainThread:319874 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/openvla-mini/wandb/settings
|
| 5 |
-
2025-08-09 00:28:17,897 INFO MainThread:319874 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
-
2025-08-09 00:28:17,897 INFO MainThread:319874 [wandb_init.py:setup_run_log_directory():703] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7/wandb/run-20250809_002817-g4nrjez0/logs/debug.log
|
| 7 |
-
2025-08-09 00:28:17,898 INFO MainThread:319874 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7/wandb/run-20250809_002817-g4nrjez0/logs/debug-internal.log
|
| 8 |
-
2025-08-09 00:28:17,898 INFO MainThread:319874 [wandb_init.py:init():830] calling init triggers
|
| 9 |
-
2025-08-09 00:28:17,898 INFO MainThread:319874 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
|
| 10 |
-
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+7b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+7b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-7b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'image_sequence_len': 1, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 8, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': 'data2'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None, '_wandb': {}}
|
| 11 |
-
2025-08-09 00:28:17,898 INFO MainThread:319874 [wandb_init.py:init():871] starting backend
|
| 12 |
-
2025-08-09 00:28:18,180 INFO MainThread:319874 [wandb_init.py:init():874] sending inform_init request
|
| 13 |
-
2025-08-09 00:28:18,209 INFO MainThread:319874 [wandb_init.py:init():882] backend started and connected
|
| 14 |
-
2025-08-09 00:28:18,211 INFO MainThread:319874 [wandb_init.py:init():953] updated telemetry
|
| 15 |
-
2025-08-09 00:28:18,280 INFO MainThread:319874 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
|
| 16 |
-
2025-08-09 00:28:18,900 INFO MainThread:319874 [wandb_init.py:init():1029] starting run threads in backend
|
| 17 |
-
2025-08-09 00:28:19,735 INFO MainThread:319874 [wandb_run.py:_console_start():2494] atexit reg
|
| 18 |
-
2025-08-09 00:28:19,735 INFO MainThread:319874 [wandb_run.py:_redirect():2342] redirect: wrap_raw
|
| 19 |
-
2025-08-09 00:28:19,735 INFO MainThread:319874 [wandb_run.py:_redirect():2411] Wrapping output streams.
|
| 20 |
-
2025-08-09 00:28:19,735 INFO MainThread:319874 [wandb_run.py:_redirect():2434] Redirects installed.
|
| 21 |
-
2025-08-09 00:28:19,755 INFO MainThread:319874 [wandb_init.py:init():1075] run started, returning control to user process
|
| 22 |
-
2025-08-09 00:34:20,082 INFO MsgRouterThr:319874 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_002817-g4nrjez0/run-g4nrjez0.wandb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ba6a8ec227343c714732e0f716eccc6ab27ad1d25df82332a01b36eded991434
|
| 3 |
-
size 359681
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/files/config.yaml
DELETED
|
@@ -1,205 +0,0 @@
|
|
| 1 |
-
_wandb:
|
| 2 |
-
value:
|
| 3 |
-
cli_version: 0.21.1
|
| 4 |
-
e:
|
| 5 |
-
kmjhvvefrqpn82qctd036hqj5qg0u497:
|
| 6 |
-
args:
|
| 7 |
-
- --model.type
|
| 8 |
-
- prism-qwen25-extra-dinosiglip-224px+7b
|
| 9 |
-
- --model.finetune_per_device_batch_size
|
| 10 |
-
- "4"
|
| 11 |
-
codePath: scripts/pretrain.py
|
| 12 |
-
codePathLocal: scripts/pretrain.py
|
| 13 |
-
cpu_count: 96
|
| 14 |
-
cpu_count_logical: 192
|
| 15 |
-
cudaVersion: "12.8"
|
| 16 |
-
disk:
|
| 17 |
-
/:
|
| 18 |
-
total: "520120602624"
|
| 19 |
-
used: "64347889664"
|
| 20 |
-
email: bjyoon513@gmail.com
|
| 21 |
-
executable: /fsx/byungjun/miniconda3/envs/minivla/bin/python3.10
|
| 22 |
-
git:
|
| 23 |
-
commit: a72e9ce42035282eb6c950204e50ef3c4fbb363d
|
| 24 |
-
remote: https://github.com/happyhappy-jun/openvla-mini
|
| 25 |
-
gpu: NVIDIA H200
|
| 26 |
-
gpu_count: 8
|
| 27 |
-
gpu_nvidia:
|
| 28 |
-
- architecture: Hopper
|
| 29 |
-
cudaCores: 16896
|
| 30 |
-
memoryTotal: "150754820096"
|
| 31 |
-
name: NVIDIA H200
|
| 32 |
-
uuid: GPU-95044091-c6a6-4e9d-26a3-0249feeaf796
|
| 33 |
-
- architecture: Hopper
|
| 34 |
-
cudaCores: 16896
|
| 35 |
-
memoryTotal: "150754820096"
|
| 36 |
-
name: NVIDIA H200
|
| 37 |
-
uuid: GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec
|
| 38 |
-
- architecture: Hopper
|
| 39 |
-
cudaCores: 16896
|
| 40 |
-
memoryTotal: "150754820096"
|
| 41 |
-
name: NVIDIA H200
|
| 42 |
-
uuid: GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645
|
| 43 |
-
- architecture: Hopper
|
| 44 |
-
cudaCores: 16896
|
| 45 |
-
memoryTotal: "150754820096"
|
| 46 |
-
name: NVIDIA H200
|
| 47 |
-
uuid: GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da
|
| 48 |
-
- architecture: Hopper
|
| 49 |
-
cudaCores: 16896
|
| 50 |
-
memoryTotal: "150754820096"
|
| 51 |
-
name: NVIDIA H200
|
| 52 |
-
uuid: GPU-0245a021-19ca-991a-61b0-94cbc116d182
|
| 53 |
-
- architecture: Hopper
|
| 54 |
-
cudaCores: 16896
|
| 55 |
-
memoryTotal: "150754820096"
|
| 56 |
-
name: NVIDIA H200
|
| 57 |
-
uuid: GPU-4213a83d-27d3-97d3-0cec-f9700637d48c
|
| 58 |
-
- architecture: Hopper
|
| 59 |
-
cudaCores: 16896
|
| 60 |
-
memoryTotal: "150754820096"
|
| 61 |
-
name: NVIDIA H200
|
| 62 |
-
uuid: GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f
|
| 63 |
-
- architecture: Hopper
|
| 64 |
-
cudaCores: 16896
|
| 65 |
-
memoryTotal: "150754820096"
|
| 66 |
-
name: NVIDIA H200
|
| 67 |
-
uuid: GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7
|
| 68 |
-
host: compute-st-kait-gpu-2
|
| 69 |
-
memory:
|
| 70 |
-
total: "2147425312768"
|
| 71 |
-
os: Linux-6.8.0-1028-aws-x86_64-with-glibc2.35
|
| 72 |
-
program: /fsx/byungjun/openvla-mini/scripts/pretrain.py
|
| 73 |
-
python: CPython 3.10.18
|
| 74 |
-
root: runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7
|
| 75 |
-
slurm:
|
| 76 |
-
cluster_name: kait-gpu-06-parallelcluster
|
| 77 |
-
conf: /opt/slurm/etc/slurm.conf
|
| 78 |
-
cpus_on_node: "128"
|
| 79 |
-
gpus: "8"
|
| 80 |
-
gpus_on_node: "8"
|
| 81 |
-
gtids: "0"
|
| 82 |
-
job_cpus_per_node: "128"
|
| 83 |
-
job_end_time: "1754959041"
|
| 84 |
-
job_gid: "1004"
|
| 85 |
-
job_gpus: 0,1,2,3,4,5,6,7
|
| 86 |
-
job_id: "530"
|
| 87 |
-
job_name: qwen_3b_vlm_finetune
|
| 88 |
-
job_nodelist: compute-st-kait-gpu-2
|
| 89 |
-
job_num_nodes: "1"
|
| 90 |
-
job_partition: batch2
|
| 91 |
-
job_start_time: "1754699841"
|
| 92 |
-
job_uid: "1004"
|
| 93 |
-
job_user: byungjun
|
| 94 |
-
jobid: "530"
|
| 95 |
-
localid: "0"
|
| 96 |
-
nnodes: "1"
|
| 97 |
-
nodeid: "0"
|
| 98 |
-
nodelist: compute-st-kait-gpu-2
|
| 99 |
-
prio_process: "0"
|
| 100 |
-
procid: "0"
|
| 101 |
-
submit_dir: /fsx/byungjun/openvla-mini
|
| 102 |
-
submit_host: ip-10-10-47-245
|
| 103 |
-
task_pid: "332240"
|
| 104 |
-
tasks_per_node: "128"
|
| 105 |
-
topology_addr: compute-st-kait-gpu-2
|
| 106 |
-
topology_addr_pattern: node
|
| 107 |
-
startedAt: "2025-08-09T00:43:53.494358Z"
|
| 108 |
-
writerId: kmjhvvefrqpn82qctd036hqj5qg0u497
|
| 109 |
-
m: []
|
| 110 |
-
python_version: 3.10.18
|
| 111 |
-
t:
|
| 112 |
-
"1":
|
| 113 |
-
- 1
|
| 114 |
-
- 2
|
| 115 |
-
- 3
|
| 116 |
-
- 11
|
| 117 |
-
- 41
|
| 118 |
-
- 49
|
| 119 |
-
- 63
|
| 120 |
-
- 71
|
| 121 |
-
"2":
|
| 122 |
-
- 1
|
| 123 |
-
- 2
|
| 124 |
-
- 3
|
| 125 |
-
- 11
|
| 126 |
-
- 41
|
| 127 |
-
- 49
|
| 128 |
-
- 63
|
| 129 |
-
- 71
|
| 130 |
-
"3":
|
| 131 |
-
- 2
|
| 132 |
-
- 13
|
| 133 |
-
- 16
|
| 134 |
-
- 61
|
| 135 |
-
"4": 3.10.18
|
| 136 |
-
"5": 0.21.1
|
| 137 |
-
"6": 4.40.1
|
| 138 |
-
"12": 0.21.1
|
| 139 |
-
"13": linux-x86_64
|
| 140 |
-
dataset:
|
| 141 |
-
value:
|
| 142 |
-
align_stage_components:
|
| 143 |
-
- download/llava-laion-cc-sbu-558k/chat.json
|
| 144 |
-
- download/llava-laion-cc-sbu-558k
|
| 145 |
-
dataset_id: llava-v15
|
| 146 |
-
dataset_root_dir: data2
|
| 147 |
-
finetune_stage_components:
|
| 148 |
-
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
| 149 |
-
- download/llava-v1.5-instruct
|
| 150 |
-
type: llava-v15
|
| 151 |
-
hf_token:
|
| 152 |
-
value: .hf_token
|
| 153 |
-
model:
|
| 154 |
-
value:
|
| 155 |
-
align_epochs: 1
|
| 156 |
-
align_global_batch_size: 96
|
| 157 |
-
align_learning_rate: 0.001
|
| 158 |
-
align_lr_scheduler_type: linear-warmup+cosine-decay
|
| 159 |
-
align_max_grad_norm: 1
|
| 160 |
-
align_max_steps: null
|
| 161 |
-
align_per_device_batch_size: 16
|
| 162 |
-
align_save_every_n_steps: 10000
|
| 163 |
-
align_train_strategy: fsdp-shard-grad-op
|
| 164 |
-
align_warmup_ratio: 0.03
|
| 165 |
-
align_weight_decay: 0
|
| 166 |
-
arch_specifier: no-align+fused-gelu-mlp
|
| 167 |
-
enable_gradient_checkpointing: true
|
| 168 |
-
enable_mixed_precision_training: true
|
| 169 |
-
finetune_epochs: 2
|
| 170 |
-
finetune_global_batch_size: 128
|
| 171 |
-
finetune_learning_rate: 2e-05
|
| 172 |
-
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
| 173 |
-
finetune_max_grad_norm: 1
|
| 174 |
-
finetune_max_steps: null
|
| 175 |
-
finetune_per_device_batch_size: 4
|
| 176 |
-
finetune_save_every_n_steps: 10000
|
| 177 |
-
finetune_train_strategy: fsdp-full-shard
|
| 178 |
-
finetune_warmup_ratio: 0.03
|
| 179 |
-
finetune_weight_decay: 0.1
|
| 180 |
-
image_resize_strategy: resize-naive
|
| 181 |
-
image_sequence_len: 1
|
| 182 |
-
llm_backbone_id: qwen25-7b-extra
|
| 183 |
-
llm_max_length: 32768
|
| 184 |
-
model_id: prism-qwen25-extra-dinosiglip-224px+7b
|
| 185 |
-
reduce_in_full_precision: false
|
| 186 |
-
type: prism-qwen25-extra-dinosiglip-224px+7b
|
| 187 |
-
vision_backbone_id: dinosiglip-vit-so-224px
|
| 188 |
-
pretrained_checkpoint:
|
| 189 |
-
value: null
|
| 190 |
-
run_id:
|
| 191 |
-
value: prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7
|
| 192 |
-
run_root_dir:
|
| 193 |
-
value: runs
|
| 194 |
-
seed:
|
| 195 |
-
value: 7
|
| 196 |
-
stage:
|
| 197 |
-
value: finetune
|
| 198 |
-
trackers:
|
| 199 |
-
value:
|
| 200 |
-
- jsonl
|
| 201 |
-
- wandb
|
| 202 |
-
wandb_entity:
|
| 203 |
-
value: null
|
| 204 |
-
wandb_project:
|
| 205 |
-
value: prismatic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/files/output.log
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
08/09 [00:43:55] INFO | >> [*] Starting Training Loop pretrain.py:231
|
| 2 |
-
|
| 3 |
-
08/09 [07:37:35] INFO | >> [*] Done with Training =>> pretrain.py:235
|
| 4 |
-
Finalizing Metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/files/requirements.txt
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
nvidia-nvtx-cu12==12.1.105
|
| 2 |
-
kiwisolver==1.4.8
|
| 3 |
-
contourpy==1.3.2
|
| 4 |
-
nvidia-cudnn-cu12==8.9.2.26
|
| 5 |
-
tokenizers==0.19.1
|
| 6 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 7 |
-
triton==2.2.0
|
| 8 |
-
hf-xet==1.1.7
|
| 9 |
-
mkl-service==2.4.0
|
| 10 |
-
mkl_random==1.2.8
|
| 11 |
-
pycparser==2.21
|
| 12 |
-
ml-dtypes==0.2.0
|
| 13 |
-
tensorflow==2.15.0
|
| 14 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 15 |
-
pyasn1_modules==0.4.2
|
| 16 |
-
numpy==1.26.4
|
| 17 |
-
numpy==2.0.1
|
| 18 |
-
mypy_extensions==1.1.0
|
| 19 |
-
mkl_fft==1.3.11
|
| 20 |
-
mdurl==0.1.2
|
| 21 |
-
flash-attn==2.5.5
|
| 22 |
-
six==1.17.0
|
| 23 |
-
zipp==3.23.0
|
| 24 |
-
dlimp==0.0.1
|
| 25 |
-
json-numpy==2.1.1
|
| 26 |
-
PySocks==1.7.1
|
| 27 |
-
cffi==1.17.1
|
| 28 |
-
Werkzeug==3.1.3
|
| 29 |
-
rsa==4.9.1
|
| 30 |
-
packaging==25.0
|
| 31 |
-
draccus==0.8.0
|
| 32 |
-
typing-inspection==0.4.1
|
| 33 |
-
Markdown==3.8.2
|
| 34 |
-
wandb==0.21.1
|
| 35 |
-
trimesh==4.7.1
|
| 36 |
-
Pygments==2.19.2
|
| 37 |
-
pillow==11.3.0
|
| 38 |
-
libclang==18.1.1
|
| 39 |
-
typing-inspect==0.9.0
|
| 40 |
-
attrs==25.3.0
|
| 41 |
-
scipy==1.15.3
|
| 42 |
-
scipy==1.11.2
|
| 43 |
-
wrapt==1.14.1
|
| 44 |
-
safetensors==0.6.2
|
| 45 |
-
nvidia-curand-cu12==10.3.2.106
|
| 46 |
-
etils==1.13.0
|
| 47 |
-
OpenEXR==3.3.5
|
| 48 |
-
smmap==5.0.2
|
| 49 |
-
sentencepiece==0.1.99
|
| 50 |
-
pyparsing==3.2.3
|
| 51 |
-
astunparse==1.6.3
|
| 52 |
-
opt_einsum==3.4.0
|
| 53 |
-
tensorflow-graphics==2021.12.3
|
| 54 |
-
fsspec==2025.7.0
|
| 55 |
-
sympy==1.13.3
|
| 56 |
-
timm==0.9.10
|
| 57 |
-
pydantic==2.11.7
|
| 58 |
-
tensorboard==2.15.2
|
| 59 |
-
brotlicffi==1.0.9.2
|
| 60 |
-
torch==2.2.0
|
| 61 |
-
flatbuffers==25.2.10
|
| 62 |
-
filelock==3.17.0
|
| 63 |
-
click==8.2.1
|
| 64 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 65 |
-
ninja==1.11.1.4
|
| 66 |
-
typeguard==2.13.3
|
| 67 |
-
nvidia-nccl-cu12==2.19.3
|
| 68 |
-
openvla==0.0.3
|
| 69 |
-
MarkupSafe==3.0.2
|
| 70 |
-
rich==14.1.0
|
| 71 |
-
nvidia-nvjitlink-cu12==12.9.86
|
| 72 |
-
tensorflow-datasets==4.9.3
|
| 73 |
-
tensorflow-io-gcs-filesystem==0.37.1
|
| 74 |
-
networkx==3.4.2
|
| 75 |
-
huggingface-hub==0.34.4
|
| 76 |
-
absl-py==2.3.1
|
| 77 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 78 |
-
torchaudio==2.2.0
|
| 79 |
-
gmpy2==2.2.1
|
| 80 |
-
array_record==0.7.2
|
| 81 |
-
tensorflow-addons==0.23.0
|
| 82 |
-
oauthlib==3.3.1
|
| 83 |
-
PyYAML==6.0.2
|
| 84 |
-
regex==2025.7.34
|
| 85 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 86 |
-
setuptools==78.1.1
|
| 87 |
-
toml==0.10.2
|
| 88 |
-
google-auth==2.40.3
|
| 89 |
-
certifi==2025.8.3
|
| 90 |
-
keras==2.15.0
|
| 91 |
-
torchvision==0.17.0
|
| 92 |
-
grpcio==1.74.0
|
| 93 |
-
fonttools==4.59.0
|
| 94 |
-
transformers==4.40.1
|
| 95 |
-
annotated-types==0.7.0
|
| 96 |
-
charset-normalizer==3.3.2
|
| 97 |
-
promise==2.3
|
| 98 |
-
mergedeep==1.3.4
|
| 99 |
-
gast==0.6.0
|
| 100 |
-
cachetools==5.5.2
|
| 101 |
-
termcolor==3.1.0
|
| 102 |
-
pyyaml-include==1.4.1
|
| 103 |
-
importlib_resources==6.5.2
|
| 104 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 105 |
-
h5py==3.14.0
|
| 106 |
-
python-dateutil==2.9.0.post0
|
| 107 |
-
peft==0.11.1
|
| 108 |
-
urllib3==2.5.0
|
| 109 |
-
einops==0.8.1
|
| 110 |
-
tensorflow-estimator==2.15.0
|
| 111 |
-
requests==2.32.4
|
| 112 |
-
psutil==7.0.0
|
| 113 |
-
requests-oauthlib==2.0.0
|
| 114 |
-
pip==25.1
|
| 115 |
-
markdown-it-py==3.0.0
|
| 116 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 117 |
-
idna==3.7
|
| 118 |
-
tqdm==4.67.1
|
| 119 |
-
dm-tree==0.1.9
|
| 120 |
-
gitdb==4.0.12
|
| 121 |
-
typing_extensions==4.12.2
|
| 122 |
-
matplotlib==3.10.5
|
| 123 |
-
accelerate==1.10.0
|
| 124 |
-
tensorflow-metadata==1.17.2
|
| 125 |
-
sentry-sdk==2.34.1
|
| 126 |
-
jsonlines==4.0.0
|
| 127 |
-
protobuf==4.21.12
|
| 128 |
-
pyasn1==0.6.1
|
| 129 |
-
google-pasta==0.2.0
|
| 130 |
-
mpmath==1.3.0
|
| 131 |
-
Jinja2==3.1.6
|
| 132 |
-
tensorboard-data-server==0.7.2
|
| 133 |
-
pydantic_core==2.33.2
|
| 134 |
-
google-auth-oauthlib==1.2.2
|
| 135 |
-
cycler==0.12.1
|
| 136 |
-
platformdirs==4.3.8
|
| 137 |
-
GitPython==3.1.45
|
| 138 |
-
wheel==0.45.1
|
| 139 |
-
backports.tarfile==1.2.0
|
| 140 |
-
jaraco.collections==5.1.0
|
| 141 |
-
autocommand==2.2.2
|
| 142 |
-
typeguard==4.3.0
|
| 143 |
-
tomli==2.0.1
|
| 144 |
-
importlib_metadata==8.0.0
|
| 145 |
-
platformdirs==4.2.2
|
| 146 |
-
wheel==0.45.1
|
| 147 |
-
more-itertools==10.3.0
|
| 148 |
-
inflect==7.3.1
|
| 149 |
-
jaraco.context==5.3.0
|
| 150 |
-
typing_extensions==4.12.2
|
| 151 |
-
jaraco.functools==4.0.1
|
| 152 |
-
packaging==24.2
|
| 153 |
-
zipp==3.19.2
|
| 154 |
-
jaraco.text==3.12.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/files/wandb-metadata.json
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"os": "Linux-6.8.0-1028-aws-x86_64-with-glibc2.35",
|
| 3 |
-
"python": "CPython 3.10.18",
|
| 4 |
-
"startedAt": "2025-08-09T00:43:53.494358Z",
|
| 5 |
-
"args": [
|
| 6 |
-
"--model.type",
|
| 7 |
-
"prism-qwen25-extra-dinosiglip-224px+7b",
|
| 8 |
-
"--model.finetune_per_device_batch_size",
|
| 9 |
-
"4"
|
| 10 |
-
],
|
| 11 |
-
"program": "/fsx/byungjun/openvla-mini/scripts/pretrain.py",
|
| 12 |
-
"codePath": "scripts/pretrain.py",
|
| 13 |
-
"codePathLocal": "scripts/pretrain.py",
|
| 14 |
-
"git": {
|
| 15 |
-
"remote": "https://github.com/happyhappy-jun/openvla-mini",
|
| 16 |
-
"commit": "a72e9ce42035282eb6c950204e50ef3c4fbb363d"
|
| 17 |
-
},
|
| 18 |
-
"email": "bjyoon513@gmail.com",
|
| 19 |
-
"root": "runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7",
|
| 20 |
-
"host": "compute-st-kait-gpu-2",
|
| 21 |
-
"executable": "/fsx/byungjun/miniconda3/envs/minivla/bin/python3.10",
|
| 22 |
-
"cpu_count": 96,
|
| 23 |
-
"cpu_count_logical": 192,
|
| 24 |
-
"gpu": "NVIDIA H200",
|
| 25 |
-
"gpu_count": 8,
|
| 26 |
-
"disk": {
|
| 27 |
-
"/": {
|
| 28 |
-
"total": "520120602624",
|
| 29 |
-
"used": "64347889664"
|
| 30 |
-
}
|
| 31 |
-
},
|
| 32 |
-
"memory": {
|
| 33 |
-
"total": "2147425312768"
|
| 34 |
-
},
|
| 35 |
-
"gpu_nvidia": [
|
| 36 |
-
{
|
| 37 |
-
"name": "NVIDIA H200",
|
| 38 |
-
"memoryTotal": "150754820096",
|
| 39 |
-
"cudaCores": 16896,
|
| 40 |
-
"architecture": "Hopper",
|
| 41 |
-
"uuid": "GPU-95044091-c6a6-4e9d-26a3-0249feeaf796"
|
| 42 |
-
},
|
| 43 |
-
{
|
| 44 |
-
"name": "NVIDIA H200",
|
| 45 |
-
"memoryTotal": "150754820096",
|
| 46 |
-
"cudaCores": 16896,
|
| 47 |
-
"architecture": "Hopper",
|
| 48 |
-
"uuid": "GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "NVIDIA H200",
|
| 52 |
-
"memoryTotal": "150754820096",
|
| 53 |
-
"cudaCores": 16896,
|
| 54 |
-
"architecture": "Hopper",
|
| 55 |
-
"uuid": "GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645"
|
| 56 |
-
},
|
| 57 |
-
{
|
| 58 |
-
"name": "NVIDIA H200",
|
| 59 |
-
"memoryTotal": "150754820096",
|
| 60 |
-
"cudaCores": 16896,
|
| 61 |
-
"architecture": "Hopper",
|
| 62 |
-
"uuid": "GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da"
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"name": "NVIDIA H200",
|
| 66 |
-
"memoryTotal": "150754820096",
|
| 67 |
-
"cudaCores": 16896,
|
| 68 |
-
"architecture": "Hopper",
|
| 69 |
-
"uuid": "GPU-0245a021-19ca-991a-61b0-94cbc116d182"
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"name": "NVIDIA H200",
|
| 73 |
-
"memoryTotal": "150754820096",
|
| 74 |
-
"cudaCores": 16896,
|
| 75 |
-
"architecture": "Hopper",
|
| 76 |
-
"uuid": "GPU-4213a83d-27d3-97d3-0cec-f9700637d48c"
|
| 77 |
-
},
|
| 78 |
-
{
|
| 79 |
-
"name": "NVIDIA H200",
|
| 80 |
-
"memoryTotal": "150754820096",
|
| 81 |
-
"cudaCores": 16896,
|
| 82 |
-
"architecture": "Hopper",
|
| 83 |
-
"uuid": "GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f"
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"name": "NVIDIA H200",
|
| 87 |
-
"memoryTotal": "150754820096",
|
| 88 |
-
"cudaCores": 16896,
|
| 89 |
-
"architecture": "Hopper",
|
| 90 |
-
"uuid": "GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7"
|
| 91 |
-
}
|
| 92 |
-
],
|
| 93 |
-
"cudaVersion": "12.8",
|
| 94 |
-
"slurm": {
|
| 95 |
-
"cluster_name": "kait-gpu-06-parallelcluster",
|
| 96 |
-
"conf": "/opt/slurm/etc/slurm.conf",
|
| 97 |
-
"cpus_on_node": "128",
|
| 98 |
-
"gpus": "8",
|
| 99 |
-
"gpus_on_node": "8",
|
| 100 |
-
"gtids": "0",
|
| 101 |
-
"job_cpus_per_node": "128",
|
| 102 |
-
"job_end_time": "1754959041",
|
| 103 |
-
"job_gid": "1004",
|
| 104 |
-
"job_gpus": "0,1,2,3,4,5,6,7",
|
| 105 |
-
"job_id": "530",
|
| 106 |
-
"job_name": "qwen_3b_vlm_finetune",
|
| 107 |
-
"job_nodelist": "compute-st-kait-gpu-2",
|
| 108 |
-
"job_num_nodes": "1",
|
| 109 |
-
"job_partition": "batch2",
|
| 110 |
-
"job_start_time": "1754699841",
|
| 111 |
-
"job_uid": "1004",
|
| 112 |
-
"job_user": "byungjun",
|
| 113 |
-
"jobid": "530",
|
| 114 |
-
"localid": "0",
|
| 115 |
-
"nnodes": "1",
|
| 116 |
-
"nodeid": "0",
|
| 117 |
-
"nodelist": "compute-st-kait-gpu-2",
|
| 118 |
-
"prio_process": "0",
|
| 119 |
-
"procid": "0",
|
| 120 |
-
"submit_dir": "/fsx/byungjun/openvla-mini",
|
| 121 |
-
"submit_host": "ip-10-10-47-245",
|
| 122 |
-
"task_pid": "332240",
|
| 123 |
-
"tasks_per_node": "128",
|
| 124 |
-
"topology_addr": "compute-st-kait-gpu-2",
|
| 125 |
-
"topology_addr_pattern": "node"
|
| 126 |
-
},
|
| 127 |
-
"writerId": "kmjhvvefrqpn82qctd036hqj5qg0u497"
|
| 128 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/files/wandb-summary.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"_wandb":{"runtime":24820},"Finetune/Loss":0.4964504539966583,"Finetune/Step Time":2.4623548705130816,"Finetune/Loss (Raw)":0.4348399043083191,"Finetune/Learning Rate":0,"_runtime":24820.710890925,"_step":10396,"_timestamp":1.7547249823812644e+09,"Finetune/Step":10396}
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/logs/debug-core.log
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:43:53.81705927Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpci34j8hb/port-332325.txt","pid":332325,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
-
{"time":"2025-08-09T00:43:53.818633166Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":332325}
|
| 3 |
-
{"time":"2025-08-09T00:43:53.818616932Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-332325-335017-2636856031/socket","Net":"unix"}}
|
| 4 |
-
{"time":"2025-08-09T00:43:53.878374806Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
-
{"time":"2025-08-09T00:43:53.89799455Z","level":"INFO","msg":"handleInformInit: received","streamId":"jppa1ary","id":"1(@)"}
|
| 6 |
-
{"time":"2025-08-09T00:43:54.352237644Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"jppa1ary","id":"1(@)"}
|
| 7 |
-
{"time":"2025-08-09T07:37:36.50501082Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"jppa1ary","id":"1(@)"}
|
| 8 |
-
{"time":"2025-08-09T07:37:36.600528305Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"jppa1ary","id":"1(@)"}
|
| 9 |
-
{"time":"2025-08-09T07:41:09.482254737Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 10 |
-
{"time":"2025-08-09T07:41:09.482358562Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 11 |
-
{"time":"2025-08-09T07:41:09.482371713Z","level":"INFO","msg":"server is shutting down"}
|
| 12 |
-
{"time":"2025-08-09T07:41:09.482390424Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 13 |
-
{"time":"2025-08-09T07:41:09.482459713Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-332325-335017-2636856031/socket","Net":"unix"}}
|
| 14 |
-
{"time":"2025-08-09T07:41:09.482507448Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 15 |
-
{"time":"2025-08-09T07:41:09.482515631Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 16 |
-
{"time":"2025-08-09T07:41:09.482523879Z","level":"INFO","msg":"server is closed"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/logs/debug-internal.log
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T00:43:53.899512181Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
|
| 2 |
-
{"time":"2025-08-09T00:43:54.352151834Z","level":"INFO","msg":"stream: created new stream","id":"jppa1ary"}
|
| 3 |
-
{"time":"2025-08-09T00:43:54.352229223Z","level":"INFO","msg":"stream: started","id":"jppa1ary"}
|
| 4 |
-
{"time":"2025-08-09T00:43:54.352242749Z","level":"INFO","msg":"handler: started","stream_id":"jppa1ary"}
|
| 5 |
-
{"time":"2025-08-09T00:43:54.352276271Z","level":"INFO","msg":"writer: started","stream_id":"jppa1ary"}
|
| 6 |
-
{"time":"2025-08-09T00:43:54.352258529Z","level":"INFO","msg":"sender: started","stream_id":"jppa1ary"}
|
| 7 |
-
{"time":"2025-08-09T07:37:36.250440803Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 8 |
-
{"time":"2025-08-09T07:37:36.399127669Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading history steps 10395-10395, summary, console lines 1-3","runtime_seconds":0.146154819}],"total_operations":1}}
|
| 9 |
-
{"time":"2025-08-09T07:37:36.508572754Z","level":"INFO","msg":"stream: closing","id":"jppa1ary"}
|
| 10 |
-
{"time":"2025-08-09T07:37:36.508598422Z","level":"INFO","msg":"handler: closed","stream_id":"jppa1ary"}
|
| 11 |
-
{"time":"2025-08-09T07:37:36.509838915Z","level":"INFO","msg":"sender: closed","stream_id":"jppa1ary"}
|
| 12 |
-
{"time":"2025-08-09T07:37:36.509849291Z","level":"INFO","msg":"stream: closed","id":"jppa1ary"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/logs/debug.log
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
2025-08-09 00:43:53,600 INFO MainThread:332325 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1
|
| 2 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Configure stats pid to 332325
|
| 3 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/.config/wandb/settings
|
| 4 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/openvla-mini/wandb/settings
|
| 5 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
-
2025-08-09 00:43:53,601 INFO MainThread:332325 [wandb_init.py:setup_run_log_directory():703] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7/wandb/run-20250809_004353-jppa1ary/logs/debug.log
|
| 7 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7/wandb/run-20250809_004353-jppa1ary/logs/debug-internal.log
|
| 8 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:init():830] calling init triggers
|
| 9 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
|
| 10 |
-
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+7b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+7b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-7b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'image_sequence_len': 1, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': 'data2'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+7b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None, '_wandb': {}}
|
| 11 |
-
2025-08-09 00:43:53,602 INFO MainThread:332325 [wandb_init.py:init():871] starting backend
|
| 12 |
-
2025-08-09 00:43:53,878 INFO MainThread:332325 [wandb_init.py:init():874] sending inform_init request
|
| 13 |
-
2025-08-09 00:43:53,896 INFO MainThread:332325 [wandb_init.py:init():882] backend started and connected
|
| 14 |
-
2025-08-09 00:43:53,901 INFO MainThread:332325 [wandb_init.py:init():953] updated telemetry
|
| 15 |
-
2025-08-09 00:43:53,957 INFO MainThread:332325 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
|
| 16 |
-
2025-08-09 00:43:54,674 INFO MainThread:332325 [wandb_init.py:init():1029] starting run threads in backend
|
| 17 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_console_start():2494] atexit reg
|
| 18 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_redirect():2342] redirect: wrap_raw
|
| 19 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_redirect():2411] Wrapping output streams.
|
| 20 |
-
2025-08-09 00:43:55,488 INFO MainThread:332325 [wandb_run.py:_redirect():2434] Redirects installed.
|
| 21 |
-
2025-08-09 00:43:55,504 INFO MainThread:332325 [wandb_init.py:init():1075] run started, returning control to user process
|
| 22 |
-
2025-08-09 07:37:35,382 INFO MainThread:332325 [wandb_run.py:_finish():2260] finishing run happyhappy/prismatic/jppa1ary
|
| 23 |
-
2025-08-09 07:37:35,385 INFO MainThread:332325 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0
|
| 24 |
-
2025-08-09 07:37:35,387 INFO MainThread:332325 [wandb_run.py:_restore():2441] restore
|
| 25 |
-
2025-08-09 07:37:35,387 INFO MainThread:332325 [wandb_run.py:_restore():2447] restore done
|
| 26 |
-
2025-08-09 07:37:36,501 INFO MainThread:332325 [wandb_run.py:_footer_history_summary_info():3895] rendering history
|
| 27 |
-
2025-08-09 07:37:36,502 INFO MainThread:332325 [wandb_run.py:_footer_history_summary_info():3927] rendering summary
|
| 28 |
-
2025-08-09 07:37:36,502 INFO MainThread:332325 [wandb_run.py:_footer_sync_info():3856] logging synced files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_004353-jppa1ary/run-jppa1ary.wandb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:98046782432a70779157e88c06e57847061e088c070e5e4db878dc0527989278
|
| 3 |
-
size 16589936
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/files/config.yaml
DELETED
|
@@ -1,205 +0,0 @@
|
|
| 1 |
-
_wandb:
|
| 2 |
-
value:
|
| 3 |
-
cli_version: 0.21.1
|
| 4 |
-
e:
|
| 5 |
-
3taj0yyhnh9dhglq0763fsp13ob8f2ee:
|
| 6 |
-
args:
|
| 7 |
-
- --model.type
|
| 8 |
-
- prism-qwen25-extra-dinosiglip-224px+3b
|
| 9 |
-
- --model.finetune_per_device_batch_size
|
| 10 |
-
- "4"
|
| 11 |
-
codePath: scripts/pretrain.py
|
| 12 |
-
codePathLocal: scripts/pretrain.py
|
| 13 |
-
cpu_count: 96
|
| 14 |
-
cpu_count_logical: 192
|
| 15 |
-
cudaVersion: "12.8"
|
| 16 |
-
disk:
|
| 17 |
-
/:
|
| 18 |
-
total: "520120602624"
|
| 19 |
-
used: "64348778496"
|
| 20 |
-
email: bjyoon513@gmail.com
|
| 21 |
-
executable: /fsx/byungjun/miniconda3/envs/minivla/bin/python3.10
|
| 22 |
-
git:
|
| 23 |
-
commit: a72e9ce42035282eb6c950204e50ef3c4fbb363d
|
| 24 |
-
remote: https://github.com/happyhappy-jun/openvla-mini
|
| 25 |
-
gpu: NVIDIA H200
|
| 26 |
-
gpu_count: 8
|
| 27 |
-
gpu_nvidia:
|
| 28 |
-
- architecture: Hopper
|
| 29 |
-
cudaCores: 16896
|
| 30 |
-
memoryTotal: "150754820096"
|
| 31 |
-
name: NVIDIA H200
|
| 32 |
-
uuid: GPU-95044091-c6a6-4e9d-26a3-0249feeaf796
|
| 33 |
-
- architecture: Hopper
|
| 34 |
-
cudaCores: 16896
|
| 35 |
-
memoryTotal: "150754820096"
|
| 36 |
-
name: NVIDIA H200
|
| 37 |
-
uuid: GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec
|
| 38 |
-
- architecture: Hopper
|
| 39 |
-
cudaCores: 16896
|
| 40 |
-
memoryTotal: "150754820096"
|
| 41 |
-
name: NVIDIA H200
|
| 42 |
-
uuid: GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645
|
| 43 |
-
- architecture: Hopper
|
| 44 |
-
cudaCores: 16896
|
| 45 |
-
memoryTotal: "150754820096"
|
| 46 |
-
name: NVIDIA H200
|
| 47 |
-
uuid: GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da
|
| 48 |
-
- architecture: Hopper
|
| 49 |
-
cudaCores: 16896
|
| 50 |
-
memoryTotal: "150754820096"
|
| 51 |
-
name: NVIDIA H200
|
| 52 |
-
uuid: GPU-0245a021-19ca-991a-61b0-94cbc116d182
|
| 53 |
-
- architecture: Hopper
|
| 54 |
-
cudaCores: 16896
|
| 55 |
-
memoryTotal: "150754820096"
|
| 56 |
-
name: NVIDIA H200
|
| 57 |
-
uuid: GPU-4213a83d-27d3-97d3-0cec-f9700637d48c
|
| 58 |
-
- architecture: Hopper
|
| 59 |
-
cudaCores: 16896
|
| 60 |
-
memoryTotal: "150754820096"
|
| 61 |
-
name: NVIDIA H200
|
| 62 |
-
uuid: GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f
|
| 63 |
-
- architecture: Hopper
|
| 64 |
-
cudaCores: 16896
|
| 65 |
-
memoryTotal: "150754820096"
|
| 66 |
-
name: NVIDIA H200
|
| 67 |
-
uuid: GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7
|
| 68 |
-
host: compute-st-kait-gpu-2
|
| 69 |
-
memory:
|
| 70 |
-
total: "2147425312768"
|
| 71 |
-
os: Linux-6.8.0-1028-aws-x86_64-with-glibc2.35
|
| 72 |
-
program: /fsx/byungjun/openvla-mini/scripts/pretrain.py
|
| 73 |
-
python: CPython 3.10.18
|
| 74 |
-
root: runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7
|
| 75 |
-
slurm:
|
| 76 |
-
cluster_name: kait-gpu-06-parallelcluster
|
| 77 |
-
conf: /opt/slurm/etc/slurm.conf
|
| 78 |
-
cpus_on_node: "128"
|
| 79 |
-
gpus: "8"
|
| 80 |
-
gpus_on_node: "8"
|
| 81 |
-
gtids: "0"
|
| 82 |
-
job_cpus_per_node: "128"
|
| 83 |
-
job_end_time: "1754984478"
|
| 84 |
-
job_gid: "1004"
|
| 85 |
-
job_gpus: 0,1,2,3,4,5,6,7
|
| 86 |
-
job_id: "531"
|
| 87 |
-
job_name: qwen_3b_vlm_finetune
|
| 88 |
-
job_nodelist: compute-st-kait-gpu-2
|
| 89 |
-
job_num_nodes: "1"
|
| 90 |
-
job_partition: batch2
|
| 91 |
-
job_start_time: "1754725278"
|
| 92 |
-
job_uid: "1004"
|
| 93 |
-
job_user: byungjun
|
| 94 |
-
jobid: "531"
|
| 95 |
-
localid: "0"
|
| 96 |
-
nnodes: "1"
|
| 97 |
-
nodeid: "0"
|
| 98 |
-
nodelist: compute-st-kait-gpu-2
|
| 99 |
-
prio_process: "0"
|
| 100 |
-
procid: "0"
|
| 101 |
-
submit_dir: /fsx/byungjun/openvla-mini
|
| 102 |
-
submit_host: ip-10-10-47-245
|
| 103 |
-
task_pid: "511629"
|
| 104 |
-
tasks_per_node: "128"
|
| 105 |
-
topology_addr: compute-st-kait-gpu-2
|
| 106 |
-
topology_addr_pattern: node
|
| 107 |
-
startedAt: "2025-08-09T07:46:02.753034Z"
|
| 108 |
-
writerId: 3taj0yyhnh9dhglq0763fsp13ob8f2ee
|
| 109 |
-
m: []
|
| 110 |
-
python_version: 3.10.18
|
| 111 |
-
t:
|
| 112 |
-
"1":
|
| 113 |
-
- 1
|
| 114 |
-
- 2
|
| 115 |
-
- 3
|
| 116 |
-
- 11
|
| 117 |
-
- 41
|
| 118 |
-
- 49
|
| 119 |
-
- 63
|
| 120 |
-
- 71
|
| 121 |
-
"2":
|
| 122 |
-
- 1
|
| 123 |
-
- 2
|
| 124 |
-
- 3
|
| 125 |
-
- 11
|
| 126 |
-
- 41
|
| 127 |
-
- 49
|
| 128 |
-
- 63
|
| 129 |
-
- 71
|
| 130 |
-
"3":
|
| 131 |
-
- 2
|
| 132 |
-
- 13
|
| 133 |
-
- 16
|
| 134 |
-
- 61
|
| 135 |
-
"4": 3.10.18
|
| 136 |
-
"5": 0.21.1
|
| 137 |
-
"6": 4.40.1
|
| 138 |
-
"12": 0.21.1
|
| 139 |
-
"13": linux-x86_64
|
| 140 |
-
dataset:
|
| 141 |
-
value:
|
| 142 |
-
align_stage_components:
|
| 143 |
-
- download/llava-laion-cc-sbu-558k/chat.json
|
| 144 |
-
- download/llava-laion-cc-sbu-558k
|
| 145 |
-
dataset_id: llava-v15
|
| 146 |
-
dataset_root_dir: data2
|
| 147 |
-
finetune_stage_components:
|
| 148 |
-
- download/llava-v1.5-instruct/llava_v1_5_mix665k.json
|
| 149 |
-
- download/llava-v1.5-instruct
|
| 150 |
-
type: llava-v15
|
| 151 |
-
hf_token:
|
| 152 |
-
value: .hf_token
|
| 153 |
-
model:
|
| 154 |
-
value:
|
| 155 |
-
align_epochs: 1
|
| 156 |
-
align_global_batch_size: 96
|
| 157 |
-
align_learning_rate: 0.001
|
| 158 |
-
align_lr_scheduler_type: linear-warmup+cosine-decay
|
| 159 |
-
align_max_grad_norm: 1
|
| 160 |
-
align_max_steps: null
|
| 161 |
-
align_per_device_batch_size: 16
|
| 162 |
-
align_save_every_n_steps: 10000
|
| 163 |
-
align_train_strategy: fsdp-shard-grad-op
|
| 164 |
-
align_warmup_ratio: 0.03
|
| 165 |
-
align_weight_decay: 0
|
| 166 |
-
arch_specifier: no-align+fused-gelu-mlp
|
| 167 |
-
enable_gradient_checkpointing: true
|
| 168 |
-
enable_mixed_precision_training: true
|
| 169 |
-
finetune_epochs: 2
|
| 170 |
-
finetune_global_batch_size: 128
|
| 171 |
-
finetune_learning_rate: 2e-05
|
| 172 |
-
finetune_lr_scheduler_type: linear-warmup+cosine-decay
|
| 173 |
-
finetune_max_grad_norm: 1
|
| 174 |
-
finetune_max_steps: null
|
| 175 |
-
finetune_per_device_batch_size: 4
|
| 176 |
-
finetune_save_every_n_steps: 10000
|
| 177 |
-
finetune_train_strategy: fsdp-full-shard
|
| 178 |
-
finetune_warmup_ratio: 0.03
|
| 179 |
-
finetune_weight_decay: 0.1
|
| 180 |
-
image_resize_strategy: resize-naive
|
| 181 |
-
image_sequence_len: 1
|
| 182 |
-
llm_backbone_id: qwen25-3b-extra
|
| 183 |
-
llm_max_length: 32768
|
| 184 |
-
model_id: prism-qwen25-extra-dinosiglip-224px+3b
|
| 185 |
-
reduce_in_full_precision: false
|
| 186 |
-
type: prism-qwen25-extra-dinosiglip-224px+3b
|
| 187 |
-
vision_backbone_id: dinosiglip-vit-so-224px
|
| 188 |
-
pretrained_checkpoint:
|
| 189 |
-
value: null
|
| 190 |
-
run_id:
|
| 191 |
-
value: prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7
|
| 192 |
-
run_root_dir:
|
| 193 |
-
value: runs
|
| 194 |
-
seed:
|
| 195 |
-
value: 7
|
| 196 |
-
stage:
|
| 197 |
-
value: finetune
|
| 198 |
-
trackers:
|
| 199 |
-
value:
|
| 200 |
-
- jsonl
|
| 201 |
-
- wandb
|
| 202 |
-
wandb_entity:
|
| 203 |
-
value: null
|
| 204 |
-
wandb_project:
|
| 205 |
-
value: prismatic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/files/output.log
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
08/09 [07:46:04] INFO | >> [*] Starting Training Loop pretrain.py:231
|
| 2 |
-
|
| 3 |
-
08/09 [14:11:41] INFO | >> [*] Done with Training =>> pretrain.py:235
|
| 4 |
-
Finalizing Metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/files/requirements.txt
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
nvidia-nvtx-cu12==12.1.105
|
| 2 |
-
kiwisolver==1.4.8
|
| 3 |
-
contourpy==1.3.2
|
| 4 |
-
nvidia-cudnn-cu12==8.9.2.26
|
| 5 |
-
tokenizers==0.19.1
|
| 6 |
-
nvidia-cuda-runtime-cu12==12.1.105
|
| 7 |
-
triton==2.2.0
|
| 8 |
-
hf-xet==1.1.7
|
| 9 |
-
mkl-service==2.4.0
|
| 10 |
-
mkl_random==1.2.8
|
| 11 |
-
pycparser==2.21
|
| 12 |
-
ml-dtypes==0.2.0
|
| 13 |
-
tensorflow==2.15.0
|
| 14 |
-
nvidia-cufft-cu12==11.0.2.54
|
| 15 |
-
pyasn1_modules==0.4.2
|
| 16 |
-
numpy==1.26.4
|
| 17 |
-
numpy==2.0.1
|
| 18 |
-
mypy_extensions==1.1.0
|
| 19 |
-
mkl_fft==1.3.11
|
| 20 |
-
mdurl==0.1.2
|
| 21 |
-
flash-attn==2.5.5
|
| 22 |
-
six==1.17.0
|
| 23 |
-
zipp==3.23.0
|
| 24 |
-
dlimp==0.0.1
|
| 25 |
-
json-numpy==2.1.1
|
| 26 |
-
PySocks==1.7.1
|
| 27 |
-
cffi==1.17.1
|
| 28 |
-
Werkzeug==3.1.3
|
| 29 |
-
rsa==4.9.1
|
| 30 |
-
packaging==25.0
|
| 31 |
-
draccus==0.8.0
|
| 32 |
-
typing-inspection==0.4.1
|
| 33 |
-
Markdown==3.8.2
|
| 34 |
-
wandb==0.21.1
|
| 35 |
-
trimesh==4.7.1
|
| 36 |
-
Pygments==2.19.2
|
| 37 |
-
pillow==11.3.0
|
| 38 |
-
libclang==18.1.1
|
| 39 |
-
typing-inspect==0.9.0
|
| 40 |
-
attrs==25.3.0
|
| 41 |
-
scipy==1.15.3
|
| 42 |
-
scipy==1.11.2
|
| 43 |
-
wrapt==1.14.1
|
| 44 |
-
safetensors==0.6.2
|
| 45 |
-
nvidia-curand-cu12==10.3.2.106
|
| 46 |
-
etils==1.13.0
|
| 47 |
-
OpenEXR==3.3.5
|
| 48 |
-
smmap==5.0.2
|
| 49 |
-
sentencepiece==0.1.99
|
| 50 |
-
pyparsing==3.2.3
|
| 51 |
-
astunparse==1.6.3
|
| 52 |
-
opt_einsum==3.4.0
|
| 53 |
-
tensorflow-graphics==2021.12.3
|
| 54 |
-
fsspec==2025.7.0
|
| 55 |
-
sympy==1.13.3
|
| 56 |
-
timm==0.9.10
|
| 57 |
-
pydantic==2.11.7
|
| 58 |
-
tensorboard==2.15.2
|
| 59 |
-
brotlicffi==1.0.9.2
|
| 60 |
-
torch==2.2.0
|
| 61 |
-
flatbuffers==25.2.10
|
| 62 |
-
filelock==3.17.0
|
| 63 |
-
click==8.2.1
|
| 64 |
-
nvidia-cuda-cupti-cu12==12.1.105
|
| 65 |
-
ninja==1.11.1.4
|
| 66 |
-
typeguard==2.13.3
|
| 67 |
-
nvidia-nccl-cu12==2.19.3
|
| 68 |
-
openvla==0.0.3
|
| 69 |
-
MarkupSafe==3.0.2
|
| 70 |
-
rich==14.1.0
|
| 71 |
-
nvidia-nvjitlink-cu12==12.9.86
|
| 72 |
-
tensorflow-datasets==4.9.3
|
| 73 |
-
tensorflow-io-gcs-filesystem==0.37.1
|
| 74 |
-
networkx==3.4.2
|
| 75 |
-
huggingface-hub==0.34.4
|
| 76 |
-
absl-py==2.3.1
|
| 77 |
-
nvidia-cublas-cu12==12.1.3.1
|
| 78 |
-
torchaudio==2.2.0
|
| 79 |
-
gmpy2==2.2.1
|
| 80 |
-
array_record==0.7.2
|
| 81 |
-
tensorflow-addons==0.23.0
|
| 82 |
-
oauthlib==3.3.1
|
| 83 |
-
PyYAML==6.0.2
|
| 84 |
-
regex==2025.7.34
|
| 85 |
-
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 86 |
-
setuptools==78.1.1
|
| 87 |
-
toml==0.10.2
|
| 88 |
-
google-auth==2.40.3
|
| 89 |
-
certifi==2025.8.3
|
| 90 |
-
keras==2.15.0
|
| 91 |
-
torchvision==0.17.0
|
| 92 |
-
grpcio==1.74.0
|
| 93 |
-
fonttools==4.59.0
|
| 94 |
-
transformers==4.40.1
|
| 95 |
-
annotated-types==0.7.0
|
| 96 |
-
charset-normalizer==3.3.2
|
| 97 |
-
promise==2.3
|
| 98 |
-
mergedeep==1.3.4
|
| 99 |
-
gast==0.6.0
|
| 100 |
-
cachetools==5.5.2
|
| 101 |
-
termcolor==3.1.0
|
| 102 |
-
pyyaml-include==1.4.1
|
| 103 |
-
importlib_resources==6.5.2
|
| 104 |
-
nvidia-cusolver-cu12==11.4.5.107
|
| 105 |
-
h5py==3.14.0
|
| 106 |
-
python-dateutil==2.9.0.post0
|
| 107 |
-
peft==0.11.1
|
| 108 |
-
urllib3==2.5.0
|
| 109 |
-
einops==0.8.1
|
| 110 |
-
tensorflow-estimator==2.15.0
|
| 111 |
-
requests==2.32.4
|
| 112 |
-
psutil==7.0.0
|
| 113 |
-
requests-oauthlib==2.0.0
|
| 114 |
-
pip==25.1
|
| 115 |
-
markdown-it-py==3.0.0
|
| 116 |
-
nvidia-cusparse-cu12==12.1.0.106
|
| 117 |
-
idna==3.7
|
| 118 |
-
tqdm==4.67.1
|
| 119 |
-
dm-tree==0.1.9
|
| 120 |
-
gitdb==4.0.12
|
| 121 |
-
typing_extensions==4.12.2
|
| 122 |
-
matplotlib==3.10.5
|
| 123 |
-
accelerate==1.10.0
|
| 124 |
-
tensorflow-metadata==1.17.2
|
| 125 |
-
sentry-sdk==2.34.1
|
| 126 |
-
jsonlines==4.0.0
|
| 127 |
-
protobuf==4.21.12
|
| 128 |
-
pyasn1==0.6.1
|
| 129 |
-
google-pasta==0.2.0
|
| 130 |
-
mpmath==1.3.0
|
| 131 |
-
Jinja2==3.1.6
|
| 132 |
-
tensorboard-data-server==0.7.2
|
| 133 |
-
pydantic_core==2.33.2
|
| 134 |
-
google-auth-oauthlib==1.2.2
|
| 135 |
-
cycler==0.12.1
|
| 136 |
-
platformdirs==4.3.8
|
| 137 |
-
GitPython==3.1.45
|
| 138 |
-
wheel==0.45.1
|
| 139 |
-
backports.tarfile==1.2.0
|
| 140 |
-
jaraco.collections==5.1.0
|
| 141 |
-
autocommand==2.2.2
|
| 142 |
-
typeguard==4.3.0
|
| 143 |
-
tomli==2.0.1
|
| 144 |
-
importlib_metadata==8.0.0
|
| 145 |
-
platformdirs==4.2.2
|
| 146 |
-
wheel==0.45.1
|
| 147 |
-
more-itertools==10.3.0
|
| 148 |
-
inflect==7.3.1
|
| 149 |
-
jaraco.context==5.3.0
|
| 150 |
-
typing_extensions==4.12.2
|
| 151 |
-
jaraco.functools==4.0.1
|
| 152 |
-
packaging==24.2
|
| 153 |
-
zipp==3.19.2
|
| 154 |
-
jaraco.text==3.12.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/files/wandb-metadata.json
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"os": "Linux-6.8.0-1028-aws-x86_64-with-glibc2.35",
|
| 3 |
-
"python": "CPython 3.10.18",
|
| 4 |
-
"startedAt": "2025-08-09T07:46:02.753034Z",
|
| 5 |
-
"args": [
|
| 6 |
-
"--model.type",
|
| 7 |
-
"prism-qwen25-extra-dinosiglip-224px+3b",
|
| 8 |
-
"--model.finetune_per_device_batch_size",
|
| 9 |
-
"4"
|
| 10 |
-
],
|
| 11 |
-
"program": "/fsx/byungjun/openvla-mini/scripts/pretrain.py",
|
| 12 |
-
"codePath": "scripts/pretrain.py",
|
| 13 |
-
"codePathLocal": "scripts/pretrain.py",
|
| 14 |
-
"git": {
|
| 15 |
-
"remote": "https://github.com/happyhappy-jun/openvla-mini",
|
| 16 |
-
"commit": "a72e9ce42035282eb6c950204e50ef3c4fbb363d"
|
| 17 |
-
},
|
| 18 |
-
"email": "bjyoon513@gmail.com",
|
| 19 |
-
"root": "runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7",
|
| 20 |
-
"host": "compute-st-kait-gpu-2",
|
| 21 |
-
"executable": "/fsx/byungjun/miniconda3/envs/minivla/bin/python3.10",
|
| 22 |
-
"cpu_count": 96,
|
| 23 |
-
"cpu_count_logical": 192,
|
| 24 |
-
"gpu": "NVIDIA H200",
|
| 25 |
-
"gpu_count": 8,
|
| 26 |
-
"disk": {
|
| 27 |
-
"/": {
|
| 28 |
-
"total": "520120602624",
|
| 29 |
-
"used": "64348778496"
|
| 30 |
-
}
|
| 31 |
-
},
|
| 32 |
-
"memory": {
|
| 33 |
-
"total": "2147425312768"
|
| 34 |
-
},
|
| 35 |
-
"gpu_nvidia": [
|
| 36 |
-
{
|
| 37 |
-
"name": "NVIDIA H200",
|
| 38 |
-
"memoryTotal": "150754820096",
|
| 39 |
-
"cudaCores": 16896,
|
| 40 |
-
"architecture": "Hopper",
|
| 41 |
-
"uuid": "GPU-95044091-c6a6-4e9d-26a3-0249feeaf796"
|
| 42 |
-
},
|
| 43 |
-
{
|
| 44 |
-
"name": "NVIDIA H200",
|
| 45 |
-
"memoryTotal": "150754820096",
|
| 46 |
-
"cudaCores": 16896,
|
| 47 |
-
"architecture": "Hopper",
|
| 48 |
-
"uuid": "GPU-e54a8b43-5dd9-a2f8-8a71-b254a12248ec"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "NVIDIA H200",
|
| 52 |
-
"memoryTotal": "150754820096",
|
| 53 |
-
"cudaCores": 16896,
|
| 54 |
-
"architecture": "Hopper",
|
| 55 |
-
"uuid": "GPU-daed9c7c-6f35-ec0c-abd5-49e1f7d48645"
|
| 56 |
-
},
|
| 57 |
-
{
|
| 58 |
-
"name": "NVIDIA H200",
|
| 59 |
-
"memoryTotal": "150754820096",
|
| 60 |
-
"cudaCores": 16896,
|
| 61 |
-
"architecture": "Hopper",
|
| 62 |
-
"uuid": "GPU-acf2a7ee-d8a1-bb8c-be49-c7a07c0f07da"
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"name": "NVIDIA H200",
|
| 66 |
-
"memoryTotal": "150754820096",
|
| 67 |
-
"cudaCores": 16896,
|
| 68 |
-
"architecture": "Hopper",
|
| 69 |
-
"uuid": "GPU-0245a021-19ca-991a-61b0-94cbc116d182"
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"name": "NVIDIA H200",
|
| 73 |
-
"memoryTotal": "150754820096",
|
| 74 |
-
"cudaCores": 16896,
|
| 75 |
-
"architecture": "Hopper",
|
| 76 |
-
"uuid": "GPU-4213a83d-27d3-97d3-0cec-f9700637d48c"
|
| 77 |
-
},
|
| 78 |
-
{
|
| 79 |
-
"name": "NVIDIA H200",
|
| 80 |
-
"memoryTotal": "150754820096",
|
| 81 |
-
"cudaCores": 16896,
|
| 82 |
-
"architecture": "Hopper",
|
| 83 |
-
"uuid": "GPU-8be9f5c6-a214-8b33-0ac2-217892edfa6f"
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"name": "NVIDIA H200",
|
| 87 |
-
"memoryTotal": "150754820096",
|
| 88 |
-
"cudaCores": 16896,
|
| 89 |
-
"architecture": "Hopper",
|
| 90 |
-
"uuid": "GPU-5c5fce07-faf7-1345-d5ea-4c13e75769e7"
|
| 91 |
-
}
|
| 92 |
-
],
|
| 93 |
-
"cudaVersion": "12.8",
|
| 94 |
-
"slurm": {
|
| 95 |
-
"cluster_name": "kait-gpu-06-parallelcluster",
|
| 96 |
-
"conf": "/opt/slurm/etc/slurm.conf",
|
| 97 |
-
"cpus_on_node": "128",
|
| 98 |
-
"gpus": "8",
|
| 99 |
-
"gpus_on_node": "8",
|
| 100 |
-
"gtids": "0",
|
| 101 |
-
"job_cpus_per_node": "128",
|
| 102 |
-
"job_end_time": "1754984478",
|
| 103 |
-
"job_gid": "1004",
|
| 104 |
-
"job_gpus": "0,1,2,3,4,5,6,7",
|
| 105 |
-
"job_id": "531",
|
| 106 |
-
"job_name": "qwen_3b_vlm_finetune",
|
| 107 |
-
"job_nodelist": "compute-st-kait-gpu-2",
|
| 108 |
-
"job_num_nodes": "1",
|
| 109 |
-
"job_partition": "batch2",
|
| 110 |
-
"job_start_time": "1754725278",
|
| 111 |
-
"job_uid": "1004",
|
| 112 |
-
"job_user": "byungjun",
|
| 113 |
-
"jobid": "531",
|
| 114 |
-
"localid": "0",
|
| 115 |
-
"nnodes": "1",
|
| 116 |
-
"nodeid": "0",
|
| 117 |
-
"nodelist": "compute-st-kait-gpu-2",
|
| 118 |
-
"prio_process": "0",
|
| 119 |
-
"procid": "0",
|
| 120 |
-
"submit_dir": "/fsx/byungjun/openvla-mini",
|
| 121 |
-
"submit_host": "ip-10-10-47-245",
|
| 122 |
-
"task_pid": "511629",
|
| 123 |
-
"tasks_per_node": "128",
|
| 124 |
-
"topology_addr": "compute-st-kait-gpu-2",
|
| 125 |
-
"topology_addr_pattern": "node"
|
| 126 |
-
},
|
| 127 |
-
"writerId": "3taj0yyhnh9dhglq0763fsp13ob8f2ee"
|
| 128 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/files/wandb-summary.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"Finetune/Step":10396,"Finetune/Loss":0.5836188793182373,"_wandb":{"runtime":23137},"_runtime":23137.460341213,"_step":10396,"Finetune/Learning Rate":0,"_timestamp":1.7547486698740156e+09,"Finetune/Step Time":2.2515620701014996,"Finetune/Loss (Raw)":0.6342841386795044}
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/logs/debug-core.log
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T07:46:03.108885616Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmprujohv9o/port-511740.txt","pid":511740,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
-
{"time":"2025-08-09T07:46:03.110958048Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":511740}
|
| 3 |
-
{"time":"2025-08-09T07:46:03.110958202Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-511740-513590-3014293601/socket","Net":"unix"}}
|
| 4 |
-
{"time":"2025-08-09T07:46:03.170319779Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
-
{"time":"2025-08-09T07:46:03.190240086Z","level":"INFO","msg":"handleInformInit: received","streamId":"gpyuprau","id":"1(@)"}
|
| 6 |
-
{"time":"2025-08-09T07:46:03.636533053Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"gpyuprau","id":"1(@)"}
|
| 7 |
-
{"time":"2025-08-09T14:11:42.468517268Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"gpyuprau","id":"1(@)"}
|
| 8 |
-
{"time":"2025-08-09T14:11:42.587479793Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"gpyuprau","id":"1(@)"}
|
| 9 |
-
{"time":"2025-08-09T14:15:15.093597184Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 10 |
-
{"time":"2025-08-09T14:15:15.093715872Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 11 |
-
{"time":"2025-08-09T14:15:15.093729541Z","level":"INFO","msg":"server is shutting down"}
|
| 12 |
-
{"time":"2025-08-09T14:15:15.093767704Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 13 |
-
{"time":"2025-08-09T14:15:15.093829956Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-511740-513590-3014293601/socket","Net":"unix"}}
|
| 14 |
-
{"time":"2025-08-09T14:15:15.09390158Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 15 |
-
{"time":"2025-08-09T14:15:15.093983904Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 16 |
-
{"time":"2025-08-09T14:15:15.094005344Z","level":"INFO","msg":"server is closed"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/logs/debug-internal.log
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{"time":"2025-08-09T07:46:03.192885763Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
|
| 2 |
-
{"time":"2025-08-09T07:46:03.636468673Z","level":"INFO","msg":"stream: created new stream","id":"gpyuprau"}
|
| 3 |
-
{"time":"2025-08-09T07:46:03.636525114Z","level":"INFO","msg":"stream: started","id":"gpyuprau"}
|
| 4 |
-
{"time":"2025-08-09T07:46:03.636549213Z","level":"INFO","msg":"writer: started","stream_id":"gpyuprau"}
|
| 5 |
-
{"time":"2025-08-09T07:46:03.636570228Z","level":"INFO","msg":"sender: started","stream_id":"gpyuprau"}
|
| 6 |
-
{"time":"2025-08-09T07:46:03.636595816Z","level":"INFO","msg":"handler: started","stream_id":"gpyuprau"}
|
| 7 |
-
{"time":"2025-08-09T14:11:42.18522232Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 8 |
-
{"time":"2025-08-09T14:11:42.426873859Z","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading history steps 10395-10395, summary, console lines 1-3","runtime_seconds":0.229336216}],"total_operations":1}}
|
| 9 |
-
{"time":"2025-08-09T14:11:42.471754339Z","level":"INFO","msg":"stream: closing","id":"gpyuprau"}
|
| 10 |
-
{"time":"2025-08-09T14:11:42.471779588Z","level":"INFO","msg":"handler: closed","stream_id":"gpyuprau"}
|
| 11 |
-
{"time":"2025-08-09T14:11:42.472617651Z","level":"INFO","msg":"sender: closed","stream_id":"gpyuprau"}
|
| 12 |
-
{"time":"2025-08-09T14:11:42.472627835Z","level":"INFO","msg":"stream: closed","id":"gpyuprau"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/logs/debug.log
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
2025-08-09 07:46:02,871 INFO MainThread:511740 [wandb_setup.py:_flush():80] Current SDK version is 0.21.1
|
| 2 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_setup.py:_flush():80] Configure stats pid to 511740
|
| 3 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/.config/wandb/settings
|
| 4 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_setup.py:_flush():80] Loading settings from /fsx/byungjun/openvla-mini/wandb/settings
|
| 5 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_init.py:setup_run_log_directory():703] Logging user logs to runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7/wandb/run-20250809_074602-gpyuprau/logs/debug.log
|
| 7 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to runs/prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7/wandb/run-20250809_074602-gpyuprau/logs/debug-internal.log
|
| 8 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_init.py:init():830] calling init triggers
|
| 9 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
|
| 10 |
-
config: {'model': {'type': 'prism-qwen25-extra-dinosiglip-224px+3b', 'model_id': 'prism-qwen25-extra-dinosiglip-224px+3b', 'arch_specifier': 'no-align+fused-gelu-mlp', 'vision_backbone_id': 'dinosiglip-vit-so-224px', 'llm_backbone_id': 'qwen25-3b-extra', 'image_resize_strategy': 'resize-naive', 'llm_max_length': 32768, 'image_sequence_len': 1, 'align_epochs': 1, 'align_max_steps': None, 'align_save_every_n_steps': 10000, 'align_global_batch_size': 96, 'align_per_device_batch_size': 16, 'align_learning_rate': 0.001, 'align_weight_decay': 0.0, 'align_max_grad_norm': 1.0, 'align_lr_scheduler_type': 'linear-warmup+cosine-decay', 'align_warmup_ratio': 0.03, 'align_train_strategy': 'fsdp-shard-grad-op', 'finetune_epochs': 2, 'finetune_max_steps': None, 'finetune_save_every_n_steps': 10000, 'finetune_global_batch_size': 128, 'finetune_per_device_batch_size': 4, 'finetune_learning_rate': 2e-05, 'finetune_weight_decay': 0.1, 'finetune_max_grad_norm': 1.0, 'finetune_lr_scheduler_type': 'linear-warmup+cosine-decay', 'finetune_warmup_ratio': 0.03, 'finetune_train_strategy': 'fsdp-full-shard', 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'reduce_in_full_precision': False}, 'dataset': {'type': 'llava-v15', 'dataset_id': 'llava-v15', 'align_stage_components': ['download/llava-laion-cc-sbu-558k/chat.json', 'download/llava-laion-cc-sbu-558k'], 'finetune_stage_components': ['download/llava-v1.5-instruct/llava_v1_5_mix665k.json', 'download/llava-v1.5-instruct'], 'dataset_root_dir': 'data2'}, 'stage': 'finetune', 'pretrained_checkpoint': None, 'run_id': 'prism-qwen25-extra-dinosiglip-224px+3b+stage-finetune+x7', 'run_root_dir': 'runs', 'seed': 7, 'hf_token': '.hf_token', 'trackers': ['jsonl', 'wandb'], 'wandb_project': 'prismatic', 'wandb_entity': None, '_wandb': {}}
|
| 11 |
-
2025-08-09 07:46:02,872 INFO MainThread:511740 [wandb_init.py:init():871] starting backend
|
| 12 |
-
2025-08-09 07:46:03,170 INFO MainThread:511740 [wandb_init.py:init():874] sending inform_init request
|
| 13 |
-
2025-08-09 07:46:03,188 INFO MainThread:511740 [wandb_init.py:init():882] backend started and connected
|
| 14 |
-
2025-08-09 07:46:03,192 INFO MainThread:511740 [wandb_init.py:init():953] updated telemetry
|
| 15 |
-
2025-08-09 07:46:03,263 INFO MainThread:511740 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
|
| 16 |
-
2025-08-09 07:46:03,946 INFO MainThread:511740 [wandb_init.py:init():1029] starting run threads in backend
|
| 17 |
-
2025-08-09 07:46:04,803 INFO MainThread:511740 [wandb_run.py:_console_start():2494] atexit reg
|
| 18 |
-
2025-08-09 07:46:04,803 INFO MainThread:511740 [wandb_run.py:_redirect():2342] redirect: wrap_raw
|
| 19 |
-
2025-08-09 07:46:04,803 INFO MainThread:511740 [wandb_run.py:_redirect():2411] Wrapping output streams.
|
| 20 |
-
2025-08-09 07:46:04,804 INFO MainThread:511740 [wandb_run.py:_redirect():2434] Redirects installed.
|
| 21 |
-
2025-08-09 07:46:04,823 INFO MainThread:511740 [wandb_init.py:init():1075] run started, returning control to user process
|
| 22 |
-
2025-08-09 14:11:41,401 INFO MainThread:511740 [wandb_run.py:_finish():2260] finishing run happyhappy/prismatic/gpyuprau
|
| 23 |
-
2025-08-09 14:11:41,406 INFO MainThread:511740 [wandb_run.py:_atexit_cleanup():2459] got exitcode: 0
|
| 24 |
-
2025-08-09 14:11:41,407 INFO MainThread:511740 [wandb_run.py:_restore():2441] restore
|
| 25 |
-
2025-08-09 14:11:41,407 INFO MainThread:511740 [wandb_run.py:_restore():2447] restore done
|
| 26 |
-
2025-08-09 14:11:42,461 INFO MainThread:511740 [wandb_run.py:_footer_history_summary_info():3895] rendering history
|
| 27 |
-
2025-08-09 14:11:42,466 INFO MainThread:511740 [wandb_run.py:_footer_history_summary_info():3927] rendering summary
|
| 28 |
-
2025-08-09 14:11:42,466 INFO MainThread:511740 [wandb_run.py:_footer_sync_info():3856] logging synced files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wandb/run-20250809_074602-gpyuprau/run-gpyuprau.wandb
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:29c3269b94110e36b4df335f692afa5bd1810092f1d21e8fc12aee08f7e901fa
|
| 3 |
-
size 16169201
|
|
|
|
|
|
|
|
|
|
|
|