Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/converted.ckpt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/latest +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/zero_to_fp32.py +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug-internal.log +29 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug.log +24 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/config.yaml +168 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/output.log +229 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/requirements.txt +225 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-metadata.json +104 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-summary.json +1 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-internal.log +10 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug.log +24 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/run-615z4bme.wandb +3 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/config.yaml +222 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/output.log +35 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/requirements.txt +225 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-metadata.json +104 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-summary.json +1 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log +29 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log +24 -0
- all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/run-9cjzn0v3.wandb +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/converted.ckpt +3 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/run-6bkqzmou.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/run-gtrtcbb9.wandb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/run-6bkqzmou.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/run-gtrtcbb9.wandb filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/run-615z4bme.wandb filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/run-9cjzn0v3.wandb filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
all_checkpoints/stage2_07301646_2datasets_construct/wandb/run-20250730_175623-pbf2bxo6/run-pbf2bxo6.wandb filter=lfs diff=lfs merge=lfs -text
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab1a6ea1ac55ce9616532e761371379f0cb306bbf194e29d1a44cabe01dd4e3a
|
| 3 |
+
size 156403632
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c0aa50226009f0d8a6a23d3d6e98e194a4f2a944d95a7f46bcde8d3aebd98e5
|
| 3 |
+
size 156402992
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd423ebd707139e3d47d635ea2251cb7f655b5f24650085a7abf71a1f26f3d05
|
| 3 |
+
size 156403376
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:850b33a5acf35f5909aae8ad5672d7895c82f2024dc5b08d95b9ff338e08e519
|
| 3 |
+
size 156403120
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:942855deca23d990f0db7f0ce36aafba768e914d42db3b7f38810250bf9e2c25
|
| 3 |
+
size 156402416
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f14691a96f4b8d119ff12b6282ae802455fbec50f06f044bd597970f8331aca5
|
| 3 |
+
size 156403696
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60586788404b4008997d9fc2cee3b07a9a3f28324c0b0390bf54ef165acd2580
|
| 3 |
+
size 156402992
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b17dec7f7d4ac6aa06b05dfc462965b64f54028ed539e422ac2695da0c87828
|
| 3 |
+
size 156417904
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dbc87c01c9bbe626db3b00ce1ecc91c91503d810e852467d60441c897aa405eb
|
| 3 |
+
size 208795192
|
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/converted.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb1faf00f4930709a0ce83e6276411772ea0ce8357a6dde45856eb25d7ba33b7
|
| 3 |
+
size 417200356
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ab1a6ea1ac55ce9616532e761371379f0cb306bbf194e29d1a44cabe01dd4e3a
|
| 3 |
+
size 156403632
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c0aa50226009f0d8a6a23d3d6e98e194a4f2a944d95a7f46bcde8d3aebd98e5
|
| 3 |
+
size 156402992
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd423ebd707139e3d47d635ea2251cb7f655b5f24650085a7abf71a1f26f3d05
|
| 3 |
+
size 156403376
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:850b33a5acf35f5909aae8ad5672d7895c82f2024dc5b08d95b9ff338e08e519
|
| 3 |
+
size 156403120
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:942855deca23d990f0db7f0ce36aafba768e914d42db3b7f38810250bf9e2c25
|
| 3 |
+
size 156402416
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f14691a96f4b8d119ff12b6282ae802455fbec50f06f044bd597970f8331aca5
|
| 3 |
+
size 156403696
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60586788404b4008997d9fc2cee3b07a9a3f28324c0b0390bf54ef165acd2580
|
| 3 |
+
size 156402992
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b17dec7f7d4ac6aa06b05dfc462965b64f54028ed539e422ac2695da0c87828
|
| 3 |
+
size 156417904
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dbc87c01c9bbe626db3b00ce1ecc91c91503d810e852467d60441c897aa405eb
|
| 3 |
+
size 208795192
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/latest
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
|
| 3 |
+
size 10
|
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
|
| 3 |
+
size 33272
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-07-07T05:32:22.544190733+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-07-07T05:32:23.597067843+08:00","level":"INFO","msg":"created new stream","id":"9cjzn0v3"}
|
| 3 |
+
{"time":"2025-07-07T05:32:23.59711309+08:00","level":"INFO","msg":"stream: started","id":"9cjzn0v3"}
|
| 4 |
+
{"time":"2025-07-07T05:32:23.59715533+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9cjzn0v3"}
|
| 5 |
+
{"time":"2025-07-07T05:32:23.597176058+08:00","level":"INFO","msg":"handler: started","stream_id":"9cjzn0v3"}
|
| 6 |
+
{"time":"2025-07-07T05:32:23.597249736+08:00","level":"INFO","msg":"sender: started","stream_id":"9cjzn0v3"}
|
| 7 |
+
{"time":"2025-07-07T05:32:24.815832776+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-07-07T16:23:26.191588391+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:46082->172.67.193.61:443: read: connection timed out"}
|
| 9 |
+
{"time":"2025-07-08T07:36:01.662714436+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
|
| 10 |
+
{"time":"2025-07-08T07:39:35.510926561+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
|
| 11 |
+
{"time":"2025-07-09T00:01:13.718163538+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:52128->172.67.193.61:443: read: connection reset by peer"}
|
| 12 |
+
{"time":"2025-07-09T00:04:13.715227056+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:56708->104.21.20.172:443: read: connection reset by peer"}
|
| 13 |
+
{"time":"2025-07-09T00:41:59.079495986+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:39968->172.67.193.61:443: read: connection reset by peer"}
|
| 14 |
+
{"time":"2025-07-09T00:50:28.436723591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 15 |
+
{"time":"2025-07-09T01:04:28.736382048+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:48068->104.21.20.172:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-07-09T01:36:13.71400828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:33646->172.67.193.61:443: read: connection reset by peer"}
|
| 17 |
+
{"time":"2025-07-09T06:33:13.899246984+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 18 |
+
{"time":"2025-07-09T14:42:33.327607005+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:36168->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-07-09T22:35:09.035751509+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
|
| 20 |
+
{"time":"2025-07-09T23:18:37.03957561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56104->172.67.193.61:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-07-09T23:35:24.650683333+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56892->172.67.193.61:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-07-09T23:50:32.561736786+08:00","level":"INFO","msg":"stream: closing","id":"9cjzn0v3"}
|
| 23 |
+
{"time":"2025-07-09T23:50:32.56179589+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 24 |
+
{"time":"2025-07-09T23:50:32.564495033+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 25 |
+
{"time":"2025-07-09T23:50:38.466118847+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 26 |
+
{"time":"2025-07-09T23:50:40.207050581+08:00","level":"INFO","msg":"handler: closed","stream_id":"9cjzn0v3"}
|
| 27 |
+
{"time":"2025-07-09T23:50:40.207095276+08:00","level":"INFO","msg":"sender: closed","stream_id":"9cjzn0v3"}
|
| 28 |
+
{"time":"2025-07-09T23:50:40.207092571+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"9cjzn0v3"}
|
| 29 |
+
{"time":"2025-07-09T23:50:40.211547321+08:00","level":"INFO","msg":"stream: closed","id":"9cjzn0v3"}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Configure stats pid to 9598
|
| 3 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log
|
| 7 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log
|
| 8 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-07-07 05:32:22,529 INFO MainThread:9598 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-07-07 05:32:22,531 INFO MainThread:9598 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-07-07 05:32:22,535 INFO MainThread:9598 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-07-07 05:32:22,540 INFO MainThread:9598 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-07-07 05:32:24,807 INFO MainThread:9598 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-07-07 05:32:24,961 INFO MainThread:9598 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-07-07 05:32:33,644 INFO MainThread:9598 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070513_2datasets_construct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 1, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 4, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
|
| 24 |
+
2025-07-09 23:50:32,550 INFO MsgRouterThr:9598 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/config.yaml
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
- "1": loss
|
| 10 |
+
"5": 1
|
| 11 |
+
"6":
|
| 12 |
+
- 1
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": lr
|
| 16 |
+
"5": 1
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": epoch
|
| 22 |
+
"5": 1
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
python_version: 3.10.0
|
| 28 |
+
t:
|
| 29 |
+
"1":
|
| 30 |
+
- 1
|
| 31 |
+
- 5
|
| 32 |
+
- 9
|
| 33 |
+
- 11
|
| 34 |
+
- 33
|
| 35 |
+
- 41
|
| 36 |
+
- 49
|
| 37 |
+
- 53
|
| 38 |
+
- 55
|
| 39 |
+
- 63
|
| 40 |
+
- 103
|
| 41 |
+
"2":
|
| 42 |
+
- 1
|
| 43 |
+
- 5
|
| 44 |
+
- 9
|
| 45 |
+
- 11
|
| 46 |
+
- 33
|
| 47 |
+
- 41
|
| 48 |
+
- 49
|
| 49 |
+
- 53
|
| 50 |
+
- 55
|
| 51 |
+
- 63
|
| 52 |
+
- 103
|
| 53 |
+
"3":
|
| 54 |
+
- 7
|
| 55 |
+
- 23
|
| 56 |
+
- 55
|
| 57 |
+
- 66
|
| 58 |
+
"4": 3.10.0
|
| 59 |
+
"5": 0.19.11
|
| 60 |
+
"6": 4.52.3
|
| 61 |
+
"8":
|
| 62 |
+
- 5
|
| 63 |
+
"12": 0.19.11
|
| 64 |
+
"13": linux-x86_64
|
| 65 |
+
a_max_len:
|
| 66 |
+
value: 36
|
| 67 |
+
accelerator:
|
| 68 |
+
value: gpu
|
| 69 |
+
accumulate_grad_batches:
|
| 70 |
+
value: 1
|
| 71 |
+
batch_size:
|
| 72 |
+
value: 8
|
| 73 |
+
bert_name:
|
| 74 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 75 |
+
caption_eval_epoch:
|
| 76 |
+
value: 10
|
| 77 |
+
check_val_every_n_epoch:
|
| 78 |
+
value: 1
|
| 79 |
+
cross_attention_freq:
|
| 80 |
+
value: 2
|
| 81 |
+
devices:
|
| 82 |
+
value: 0,1,2,3,4,5,6,7
|
| 83 |
+
do_sample:
|
| 84 |
+
value: false
|
| 85 |
+
enable_flash:
|
| 86 |
+
value: false
|
| 87 |
+
enbale_gradient_checkpointing:
|
| 88 |
+
value: false
|
| 89 |
+
filename:
|
| 90 |
+
value: stage2_07070513_2datasets_construct
|
| 91 |
+
filter_side_qa:
|
| 92 |
+
value: false
|
| 93 |
+
inference_batch_size:
|
| 94 |
+
value: 4
|
| 95 |
+
init_checkpoint:
|
| 96 |
+
value: ""
|
| 97 |
+
init_lr:
|
| 98 |
+
value: 0.0001
|
| 99 |
+
llm_name:
|
| 100 |
+
value: /oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged
|
| 101 |
+
llm_tune:
|
| 102 |
+
value: mid_lora
|
| 103 |
+
lora_alpha:
|
| 104 |
+
value: 16
|
| 105 |
+
lora_dropout:
|
| 106 |
+
value: 0.1
|
| 107 |
+
lora_r:
|
| 108 |
+
value: 8
|
| 109 |
+
lr_decay_rate:
|
| 110 |
+
value: 0.9
|
| 111 |
+
max_epochs:
|
| 112 |
+
value: 10
|
| 113 |
+
max_inference_len:
|
| 114 |
+
value: 128
|
| 115 |
+
min_inference_len:
|
| 116 |
+
value: 1
|
| 117 |
+
min_lr:
|
| 118 |
+
value: 1e-05
|
| 119 |
+
mix_dataset:
|
| 120 |
+
value: true
|
| 121 |
+
mode:
|
| 122 |
+
value: train
|
| 123 |
+
num_beams:
|
| 124 |
+
value: 5
|
| 125 |
+
num_query_token:
|
| 126 |
+
value: 8
|
| 127 |
+
num_workers:
|
| 128 |
+
value: 8
|
| 129 |
+
peft_config:
|
| 130 |
+
value: ""
|
| 131 |
+
peft_dir:
|
| 132 |
+
value: ""
|
| 133 |
+
plm_model:
|
| 134 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 135 |
+
plm_tune:
|
| 136 |
+
value: freeze
|
| 137 |
+
precision:
|
| 138 |
+
value: bf16-mixed
|
| 139 |
+
prompt:
|
| 140 |
+
value: 'The protein has the following properties: '
|
| 141 |
+
prot_max_len:
|
| 142 |
+
value: 1024
|
| 143 |
+
q_max_len:
|
| 144 |
+
value: 29
|
| 145 |
+
root:
|
| 146 |
+
value: data
|
| 147 |
+
save_every_n_epochs:
|
| 148 |
+
value: 1
|
| 149 |
+
scheduler:
|
| 150 |
+
value: linear_warmup_cosine_lr
|
| 151 |
+
seed:
|
| 152 |
+
value: 42
|
| 153 |
+
stage1_path:
|
| 154 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt
|
| 155 |
+
stage2_path:
|
| 156 |
+
value: ""
|
| 157 |
+
strategy:
|
| 158 |
+
value: deepspeed
|
| 159 |
+
text_max_len:
|
| 160 |
+
value: 1024
|
| 161 |
+
use_wandb_logger:
|
| 162 |
+
value: true
|
| 163 |
+
warmup_lr:
|
| 164 |
+
value: 1e-06
|
| 165 |
+
warmup_steps:
|
| 166 |
+
value: 1000
|
| 167 |
+
weight_decay:
|
| 168 |
+
value: 0.05
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/output.log
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070513_2datasets_construct exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
-------------------------------------------
|
| 7 |
+
0 | blip2 | Blip2OPT | 7.9 B | train
|
| 8 |
+
-------------------------------------------
|
| 9 |
+
104 M Trainable params
|
| 10 |
+
7.8 B Non-trainable params
|
| 11 |
+
7.9 B Total params
|
| 12 |
+
31,459.025Total estimated model params size (MB)
|
| 13 |
+
174 Modules in train mode
|
| 14 |
+
1203 Modules in eval mode
|
| 15 |
+
Epoch 0: 1%|▌ | 138/13326 [03:46<6:00:39, 0.61it/s, v_num=4bme]
|
| 16 |
+
|
| 17 |
+
Detected KeyboardInterrupt, attempting graceful shutdown ...
|
| 18 |
+
Traceback (most recent call last):
|
| 19 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 20 |
+
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 21 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 22 |
+
return function(*args, **kwargs)
|
| 23 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 24 |
+
self._run(model, ckpt_path=ckpt_path)
|
| 25 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 26 |
+
results = self._run_stage()
|
| 27 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
|
| 28 |
+
self.fit_loop.run()
|
| 29 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
|
| 30 |
+
self.advance()
|
| 31 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
|
| 32 |
+
self.epoch_loop.run(self._data_fetcher)
|
| 33 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run
|
| 34 |
+
self.advance(data_fetcher)
|
| 35 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance
|
| 36 |
+
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
|
| 37 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run
|
| 38 |
+
self._optimizer_step(batch_idx, closure)
|
| 39 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step
|
| 40 |
+
call._call_lightning_module_hook(
|
| 41 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook
|
| 42 |
+
output = fn(*args, **kwargs)
|
| 43 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step
|
| 44 |
+
optimizer.step(closure=optimizer_closure)
|
| 45 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step
|
| 46 |
+
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
|
| 47 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
|
| 48 |
+
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
|
| 49 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step
|
| 50 |
+
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
|
| 51 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step
|
| 52 |
+
closure_result = closure()
|
| 53 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__
|
| 54 |
+
self._result = self.closure(*args, **kwargs)
|
| 55 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 56 |
+
return func(*args, **kwargs)
|
| 57 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure
|
| 58 |
+
step_output = self._step_fn()
|
| 59 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step
|
| 60 |
+
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
|
| 61 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 62 |
+
output = fn(*args, **kwargs)
|
| 63 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
|
| 64 |
+
return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
|
| 65 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 66 |
+
wrapper_output = wrapper_module(*args, **kwargs)
|
| 67 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 68 |
+
return self._call_impl(*args, **kwargs)
|
| 69 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 70 |
+
return forward_call(*args, **kwargs)
|
| 71 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 72 |
+
ret_val = func(*args, **kwargs)
|
| 73 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 74 |
+
loss = self.module(*inputs, **kwargs)
|
| 75 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 76 |
+
return self._call_impl(*args, **kwargs)
|
| 77 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 78 |
+
return inner()
|
| 79 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 80 |
+
result = forward_call(*args, **kwargs)
|
| 81 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 82 |
+
out = method(*_args, **_kwargs)
|
| 83 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 284, in training_step
|
| 84 |
+
self.log("loss", float(loss), batch_size=batch_size, sync_dist=True)
|
| 85 |
+
KeyboardInterrupt
|
| 86 |
+
|
| 87 |
+
During handling of the above exception, another exception occurred:
|
| 88 |
+
|
| 89 |
+
Traceback (most recent call last):
|
| 90 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module>
|
| 91 |
+
main(get_args())
|
| 92 |
+
File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main
|
| 93 |
+
trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path)
|
| 94 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 95 |
+
call._call_and_handle_interrupt(
|
| 96 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 61, in _call_and_handle_interrupt
|
| 97 |
+
trainer._teardown()
|
| 98 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _teardown
|
| 99 |
+
self.strategy.teardown()
|
| 100 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 419, in teardown
|
| 101 |
+
super().teardown()
|
| 102 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/parallel.py", line 134, in teardown
|
| 103 |
+
super().teardown()
|
| 104 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 536, in teardown
|
| 105 |
+
self.lightning_module.cpu()
|
| 106 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lightning_fabric/utilities/device_dtype_mixin.py", line 82, in cpu
|
| 107 |
+
return super().cpu()
|
| 108 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in cpu
|
| 109 |
+
return self._apply(lambda t: t.cpu())
|
| 110 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
|
| 111 |
+
module._apply(fn)
|
| 112 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
|
| 113 |
+
module._apply(fn)
|
| 114 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
|
| 115 |
+
module._apply(fn)
|
| 116 |
+
[Previous line repeated 4 more times]
|
| 117 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 930, in _apply
|
| 118 |
+
param_applied = fn(param)
|
| 119 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in <lambda>
|
| 120 |
+
return self._apply(lambda t: t.cpu())
|
| 121 |
+
File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
|
| 122 |
+
_error_if_any_worker_fails()
|
| 123 |
+
RuntimeError: DataLoader worker (pid 8028) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
|
| 124 |
+
[rank0]: Traceback (most recent call last):
|
| 125 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
|
| 126 |
+
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
|
| 127 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
|
| 128 |
+
[rank0]: return function(*args, **kwargs)
|
| 129 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
|
| 130 |
+
[rank0]: self._run(model, ckpt_path=ckpt_path)
|
| 131 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
|
| 132 |
+
[rank0]: results = self._run_stage()
|
| 133 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
|
| 134 |
+
[rank0]: self.fit_loop.run()
|
| 135 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
|
| 136 |
+
[rank0]: self.advance()
|
| 137 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
|
| 138 |
+
[rank0]: self.epoch_loop.run(self._data_fetcher)
|
| 139 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run
|
| 140 |
+
[rank0]: self.advance(data_fetcher)
|
| 141 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance
|
| 142 |
+
[rank0]: batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
|
| 143 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run
|
| 144 |
+
[rank0]: self._optimizer_step(batch_idx, closure)
|
| 145 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step
|
| 146 |
+
[rank0]: call._call_lightning_module_hook(
|
| 147 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook
|
| 148 |
+
[rank0]: output = fn(*args, **kwargs)
|
| 149 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step
|
| 150 |
+
[rank0]: optimizer.step(closure=optimizer_closure)
|
| 151 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step
|
| 152 |
+
[rank0]: step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
|
| 153 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
|
| 154 |
+
[rank0]: optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
|
| 155 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step
|
| 156 |
+
[rank0]: return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
|
| 157 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step
|
| 158 |
+
[rank0]: closure_result = closure()
|
| 159 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__
|
| 160 |
+
[rank0]: self._result = self.closure(*args, **kwargs)
|
| 161 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
|
| 162 |
+
[rank0]: return func(*args, **kwargs)
|
| 163 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure
|
| 164 |
+
[rank0]: step_output = self._step_fn()
|
| 165 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step
|
| 166 |
+
[rank0]: training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
|
| 167 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
|
| 168 |
+
[rank0]: output = fn(*args, **kwargs)
|
| 169 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
|
| 170 |
+
[rank0]: return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
|
| 171 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
|
| 172 |
+
[rank0]: wrapper_output = wrapper_module(*args, **kwargs)
|
| 173 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 174 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 175 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
|
| 176 |
+
[rank0]: return forward_call(*args, **kwargs)
|
| 177 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
|
| 178 |
+
[rank0]: ret_val = func(*args, **kwargs)
|
| 179 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
|
| 180 |
+
[rank0]: loss = self.module(*inputs, **kwargs)
|
| 181 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
|
| 182 |
+
[rank0]: return self._call_impl(*args, **kwargs)
|
| 183 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
|
| 184 |
+
[rank0]: return inner()
|
| 185 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
|
| 186 |
+
[rank0]: result = forward_call(*args, **kwargs)
|
| 187 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
|
| 188 |
+
[rank0]: out = method(*_args, **_kwargs)
|
| 189 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 284, in training_step
|
| 190 |
+
[rank0]: self.log("loss", float(loss), batch_size=batch_size, sync_dist=True)
|
| 191 |
+
[rank0]: KeyboardInterrupt
|
| 192 |
+
|
| 193 |
+
[rank0]: During handling of the above exception, another exception occurred:
|
| 194 |
+
|
| 195 |
+
[rank0]: Traceback (most recent call last):
|
| 196 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module>
|
| 197 |
+
[rank0]: main(get_args())
|
| 198 |
+
[rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main
|
| 199 |
+
[rank0]: trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path)
|
| 200 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
|
| 201 |
+
[rank0]: call._call_and_handle_interrupt(
|
| 202 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 61, in _call_and_handle_interrupt
|
| 203 |
+
[rank0]: trainer._teardown()
|
| 204 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _teardown
|
| 205 |
+
[rank0]: self.strategy.teardown()
|
| 206 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 419, in teardown
|
| 207 |
+
[rank0]: super().teardown()
|
| 208 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/parallel.py", line 134, in teardown
|
| 209 |
+
[rank0]: super().teardown()
|
| 210 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 536, in teardown
|
| 211 |
+
[rank0]: self.lightning_module.cpu()
|
| 212 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lightning_fabric/utilities/device_dtype_mixin.py", line 82, in cpu
|
| 213 |
+
[rank0]: return super().cpu()
|
| 214 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in cpu
|
| 215 |
+
[rank0]: return self._apply(lambda t: t.cpu())
|
| 216 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
|
| 217 |
+
[rank0]: module._apply(fn)
|
| 218 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
|
| 219 |
+
[rank0]: module._apply(fn)
|
| 220 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
|
| 221 |
+
[rank0]: module._apply(fn)
|
| 222 |
+
[rank0]: [Previous line repeated 4 more times]
|
| 223 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 930, in _apply
|
| 224 |
+
[rank0]: param_applied = fn(param)
|
| 225 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in <lambda>
|
| 226 |
+
[rank0]: return self._apply(lambda t: t.cpu())
|
| 227 |
+
[rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
|
| 228 |
+
[rank0]: _error_if_any_worker_fails()
|
| 229 |
+
[rank0]: RuntimeError: DataLoader worker (pid 8028) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gitdb==4.0.12
|
| 2 |
+
smmap==5.0.2
|
| 3 |
+
wcwidth==0.2.13
|
| 4 |
+
streamlit==1.45.1
|
| 5 |
+
antlr4-python3-runtime==4.9.3
|
| 6 |
+
MarkupSafe==3.0.2
|
| 7 |
+
markdown-it-py==3.0.0
|
| 8 |
+
PyYAML==6.0.2
|
| 9 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 10 |
+
text-unidecode==1.3
|
| 11 |
+
msgpack==1.1.0
|
| 12 |
+
pillow==11.2.1
|
| 13 |
+
wrapt==1.17.2
|
| 14 |
+
tifffile==2025.5.10
|
| 15 |
+
nvidia-curand-cu12==10.3.5.147
|
| 16 |
+
networkx==3.4.2
|
| 17 |
+
fonttools==4.58.0
|
| 18 |
+
plotly==6.1.1
|
| 19 |
+
matplotlib==3.10.3
|
| 20 |
+
certifi==2025.4.26
|
| 21 |
+
altair==5.5.0
|
| 22 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 23 |
+
ninja==1.11.1.4
|
| 24 |
+
tzdata==2025.2
|
| 25 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 26 |
+
weasel==0.4.1
|
| 27 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 28 |
+
jedi==0.19.2
|
| 29 |
+
GitPython==3.1.44
|
| 30 |
+
pandas==2.2.3
|
| 31 |
+
python-slugify==8.0.4
|
| 32 |
+
omegaconf==2.3.0
|
| 33 |
+
kiwisolver==1.4.8
|
| 34 |
+
tenacity==9.1.2
|
| 35 |
+
pydantic==2.11.5
|
| 36 |
+
async-timeout==5.0.1
|
| 37 |
+
tqdm==4.67.1
|
| 38 |
+
confection==0.1.5
|
| 39 |
+
six==1.17.0
|
| 40 |
+
portalocker==3.1.1
|
| 41 |
+
regex==2024.11.6
|
| 42 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 43 |
+
packaging==24.2
|
| 44 |
+
annotated-types==0.7.0
|
| 45 |
+
salesforce-lavis==1.0.2
|
| 46 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 47 |
+
lightning-utilities==0.14.3
|
| 48 |
+
pytz==2025.2
|
| 49 |
+
smart-open==7.1.0
|
| 50 |
+
cachetools==5.5.2
|
| 51 |
+
nltk==3.9.1
|
| 52 |
+
torchmetrics==1.7.1
|
| 53 |
+
pexpect==4.9.0
|
| 54 |
+
jsonschema-specifications==2025.4.1
|
| 55 |
+
Jinja2==3.1.6
|
| 56 |
+
hjson==3.1.0
|
| 57 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 58 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 59 |
+
decord==0.6.0
|
| 60 |
+
joblib==1.5.1
|
| 61 |
+
kaggle==1.7.4.5
|
| 62 |
+
psutil==7.0.0
|
| 63 |
+
absl-py==2.2.2
|
| 64 |
+
Pygments==2.19.1
|
| 65 |
+
idna==3.10
|
| 66 |
+
aiohappyeyeballs==2.6.1
|
| 67 |
+
tornado==6.5.1
|
| 68 |
+
cycler==0.12.1
|
| 69 |
+
deepspeed==0.16.10+b666844f
|
| 70 |
+
torchvision==0.21.0
|
| 71 |
+
exceptiongroup==1.3.0
|
| 72 |
+
cfgv==3.4.0
|
| 73 |
+
py-cpuinfo==9.0.0
|
| 74 |
+
webdataset==0.2.111
|
| 75 |
+
murmurhash==1.0.13
|
| 76 |
+
asttokens==3.0.0
|
| 77 |
+
spacy==3.8.7
|
| 78 |
+
blinker==1.9.0
|
| 79 |
+
python-dateutil==2.9.0.post0
|
| 80 |
+
prompt_toolkit==3.0.51
|
| 81 |
+
referencing==0.36.2
|
| 82 |
+
contourpy==1.3.2
|
| 83 |
+
mpmath==1.3.0
|
| 84 |
+
thinc==8.3.6
|
| 85 |
+
pycocotools==2.0.8
|
| 86 |
+
python-magic==0.4.27
|
| 87 |
+
fairscale==0.4.4
|
| 88 |
+
nodeenv==1.9.1
|
| 89 |
+
identify==2.6.12
|
| 90 |
+
ftfy==6.3.1
|
| 91 |
+
spacy-legacy==3.0.12
|
| 92 |
+
cymem==2.0.11
|
| 93 |
+
typing-inspection==0.4.1
|
| 94 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 95 |
+
filelock==3.18.0
|
| 96 |
+
language_data==1.3.0
|
| 97 |
+
iopath==0.1.10
|
| 98 |
+
pre_commit==4.2.0
|
| 99 |
+
toml==0.10.2
|
| 100 |
+
lazy_loader==0.4
|
| 101 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 102 |
+
nvidia-nvtx-cu12==12.4.127
|
| 103 |
+
rouge_score==0.1.2
|
| 104 |
+
pycocoevalcap==1.2
|
| 105 |
+
pyparsing==3.2.3
|
| 106 |
+
mdurl==0.1.2
|
| 107 |
+
pure_eval==0.2.3
|
| 108 |
+
ipython==8.36.0
|
| 109 |
+
langcodes==3.5.0
|
| 110 |
+
distlib==0.3.9
|
| 111 |
+
pydeck==0.9.1
|
| 112 |
+
traitlets==5.14.3
|
| 113 |
+
decorator==5.2.1
|
| 114 |
+
requests==2.32.3
|
| 115 |
+
pydantic_core==2.33.2
|
| 116 |
+
matplotlib-inline==0.1.7
|
| 117 |
+
hf-xet==1.1.2
|
| 118 |
+
opendatasets==0.1.22
|
| 119 |
+
attrs==25.3.0
|
| 120 |
+
urllib3==2.4.0
|
| 121 |
+
typing_extensions==4.13.2
|
| 122 |
+
bleach==6.2.0
|
| 123 |
+
rich==14.0.0
|
| 124 |
+
imageio==2.37.0
|
| 125 |
+
yarl==1.20.0
|
| 126 |
+
platformdirs==4.3.8
|
| 127 |
+
multidict==6.4.4
|
| 128 |
+
catalogue==2.0.10
|
| 129 |
+
wasabi==1.1.3
|
| 130 |
+
scikit-image==0.25.2
|
| 131 |
+
blis==1.3.0
|
| 132 |
+
pyarrow==20.0.0
|
| 133 |
+
parso==0.8.4
|
| 134 |
+
rpds-py==0.25.1
|
| 135 |
+
opencv-python-headless==4.5.5.64
|
| 136 |
+
braceexpand==0.1.7
|
| 137 |
+
frozenlist==1.6.0
|
| 138 |
+
numpy==2.2.6
|
| 139 |
+
cloudpathlib==0.21.1
|
| 140 |
+
srsly==2.5.1
|
| 141 |
+
webencodings==0.5.1
|
| 142 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 143 |
+
einops==0.8.1
|
| 144 |
+
setuptools==78.1.1
|
| 145 |
+
nvidia-nccl-cu12==2.21.5
|
| 146 |
+
ptyprocess==0.7.0
|
| 147 |
+
torch==2.6.0
|
| 148 |
+
scipy==1.15.3
|
| 149 |
+
nvidia-ml-py==12.575.51
|
| 150 |
+
aiosignal==1.3.2
|
| 151 |
+
virtualenv==20.31.2
|
| 152 |
+
protobuf==6.31.0
|
| 153 |
+
contexttimer==0.3.3
|
| 154 |
+
marisa-trie==1.2.1
|
| 155 |
+
shellingham==1.5.4
|
| 156 |
+
charset-normalizer==3.4.2
|
| 157 |
+
propcache==0.3.1
|
| 158 |
+
executing==2.2.0
|
| 159 |
+
pytorch-lightning==2.5.1.post0
|
| 160 |
+
stack-data==0.6.3
|
| 161 |
+
sentencepiece==0.2.0
|
| 162 |
+
sympy==1.13.1
|
| 163 |
+
wheel==0.45.1
|
| 164 |
+
safetensors==0.5.3
|
| 165 |
+
triton==3.2.0
|
| 166 |
+
watchdog==6.0.0
|
| 167 |
+
spacy-loggers==1.0.5
|
| 168 |
+
timm==0.4.12
|
| 169 |
+
docker-pycreds==0.4.0
|
| 170 |
+
setproctitle==1.3.6
|
| 171 |
+
jmespath==0.10.0
|
| 172 |
+
pycryptodome==3.23.0
|
| 173 |
+
opendelta==0.3.2
|
| 174 |
+
aliyun-python-sdk-core==2.16.0
|
| 175 |
+
dill==0.3.8
|
| 176 |
+
xxhash==3.5.0
|
| 177 |
+
crcmod==1.7
|
| 178 |
+
aiohttp==3.12.2
|
| 179 |
+
sentry-sdk==2.29.1
|
| 180 |
+
huggingface-hub==0.32.1
|
| 181 |
+
jaraco.functools==4.1.0
|
| 182 |
+
pathlib==1.0.1
|
| 183 |
+
multiprocess==0.70.16
|
| 184 |
+
flash-attn==2.7.1.post1
|
| 185 |
+
jsonschema==4.24.0
|
| 186 |
+
datasets==3.6.0
|
| 187 |
+
cffi==1.17.1
|
| 188 |
+
pycparser==2.22
|
| 189 |
+
fsspec==2025.3.0
|
| 190 |
+
more-itertools==10.7.0
|
| 191 |
+
cryptography==45.0.3
|
| 192 |
+
tokenizers==0.21.1
|
| 193 |
+
cheroot==10.0.1
|
| 194 |
+
pip==25.1.1
|
| 195 |
+
preshed==3.0.10
|
| 196 |
+
transformers==4.52.3
|
| 197 |
+
oss2==2.15.0
|
| 198 |
+
yacs==0.1.8
|
| 199 |
+
wandb==0.19.11
|
| 200 |
+
bigmodelvis==0.0.1
|
| 201 |
+
web.py==0.62
|
| 202 |
+
opencv-python==4.11.0.86
|
| 203 |
+
threadpoolctl==3.6.0
|
| 204 |
+
typer==0.16.0
|
| 205 |
+
narwhals==1.41.0
|
| 206 |
+
delta-center-client==0.0.4
|
| 207 |
+
aliyun-python-sdk-kms==2.16.5
|
| 208 |
+
click==8.2.1
|
| 209 |
+
scikit-learn==1.6.1
|
| 210 |
+
jaraco.text==3.12.1
|
| 211 |
+
autocommand==2.2.2
|
| 212 |
+
packaging==24.2
|
| 213 |
+
jaraco.context==5.3.0
|
| 214 |
+
tomli==2.0.1
|
| 215 |
+
typeguard==4.3.0
|
| 216 |
+
zipp==3.19.2
|
| 217 |
+
backports.tarfile==1.2.0
|
| 218 |
+
typing_extensions==4.12.2
|
| 219 |
+
jaraco.collections==5.1.0
|
| 220 |
+
inflect==7.3.1
|
| 221 |
+
more-itertools==10.3.0
|
| 222 |
+
jaraco.functools==4.0.1
|
| 223 |
+
importlib_metadata==8.0.0
|
| 224 |
+
platformdirs==4.2.2
|
| 225 |
+
wheel==0.45.1
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-07-06T21:21:04.133606Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage2_07070513_2datasets_construct",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--save_every_n_epochs",
|
| 15 |
+
"1",
|
| 16 |
+
"--max_epochs",
|
| 17 |
+
"10",
|
| 18 |
+
"--batch_size",
|
| 19 |
+
"8",
|
| 20 |
+
"--precision",
|
| 21 |
+
"bf16-mixed",
|
| 22 |
+
"--num_workers",
|
| 23 |
+
"8",
|
| 24 |
+
"--plm_model",
|
| 25 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 26 |
+
"--bert_name",
|
| 27 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 28 |
+
"--llm_name",
|
| 29 |
+
"/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 30 |
+
"--llm_tune",
|
| 31 |
+
"mid_lora",
|
| 32 |
+
"--mix_dataset",
|
| 33 |
+
"--stage1_path",
|
| 34 |
+
"/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt",
|
| 35 |
+
"--use_wandb_logger"
|
| 36 |
+
],
|
| 37 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
|
| 38 |
+
"codePath": "stage2.py",
|
| 39 |
+
"email": "gia0603yucca@gmail.com",
|
| 40 |
+
"root": "./all_checkpoints/stage2_07070513_2datasets_construct/",
|
| 41 |
+
"host": "dsw-251511-69b5b47496-4bcxh",
|
| 42 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 43 |
+
"codePathLocal": "stage2.py",
|
| 44 |
+
"cpu_count": 64,
|
| 45 |
+
"cpu_count_logical": 64,
|
| 46 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 47 |
+
"gpu_count": 8,
|
| 48 |
+
"disk": {
|
| 49 |
+
"/": {
|
| 50 |
+
"total": "1623302262784",
|
| 51 |
+
"used": "1260380160"
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"memory": {
|
| 55 |
+
"total": "549755813888"
|
| 56 |
+
},
|
| 57 |
+
"cpu": {
|
| 58 |
+
"count": 64,
|
| 59 |
+
"countLogical": 64
|
| 60 |
+
},
|
| 61 |
+
"gpu_nvidia": [
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 64 |
+
"memoryTotal": "85198045184",
|
| 65 |
+
"architecture": "Ampere"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 69 |
+
"memoryTotal": "85198045184",
|
| 70 |
+
"architecture": "Ampere"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 74 |
+
"memoryTotal": "85198045184",
|
| 75 |
+
"architecture": "Ampere"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 79 |
+
"memoryTotal": "85198045184",
|
| 80 |
+
"architecture": "Ampere"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 84 |
+
"memoryTotal": "85198045184",
|
| 85 |
+
"architecture": "Ampere"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 89 |
+
"memoryTotal": "85198045184",
|
| 90 |
+
"architecture": "Ampere"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 94 |
+
"memoryTotal": "85198045184",
|
| 95 |
+
"architecture": "Ampere"
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 99 |
+
"memoryTotal": "85198045184",
|
| 100 |
+
"architecture": "Ampere"
|
| 101 |
+
}
|
| 102 |
+
],
|
| 103 |
+
"cudaVersion": "12.1"
|
| 104 |
+
}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"epoch":0,"trainer/global_step":99,"_timestamp":1.7518373296110501e+09,"_runtime":465.477642119,"_step":1,"loss":1.32979416847229,"_wandb":{"runtime":533},"lr":1.0800999916682485e-05}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-07-07T05:21:04.137926867+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-07-07T05:21:05.224571234+08:00","level":"INFO","msg":"created new stream","id":"615z4bme"}
|
| 3 |
+
{"time":"2025-07-07T05:21:05.224615496+08:00","level":"INFO","msg":"stream: started","id":"615z4bme"}
|
| 4 |
+
{"time":"2025-07-07T05:21:05.224662595+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"615z4bme"}
|
| 5 |
+
{"time":"2025-07-07T05:21:05.224708291+08:00","level":"INFO","msg":"handler: started","stream_id":"615z4bme"}
|
| 6 |
+
{"time":"2025-07-07T05:21:05.22467591+08:00","level":"INFO","msg":"sender: started","stream_id":"615z4bme"}
|
| 7 |
+
{"time":"2025-07-07T05:21:06.409908065+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-07-07T05:29:57.639658414+08:00","level":"INFO","msg":"stream: closing","id":"615z4bme"}
|
| 9 |
+
{"time":"2025-07-07T05:29:57.639718652+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 10 |
+
{"time":"2025-07-07T05:29:57.64116529+08:00","level":"INFO","msg":"Stopped system monitor"}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Configure stats pid to 2481
|
| 3 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug.log
|
| 7 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-internal.log
|
| 8 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-07-07 05:21:04,129 INFO MainThread:2481 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-07-07 05:21:04,132 INFO MainThread:2481 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-07-07 05:21:04,135 INFO MainThread:2481 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-07-07 05:21:04,138 INFO MainThread:2481 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-07-07 05:21:06,399 INFO MainThread:2481 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-07-07 05:21:06,613 INFO MainThread:2481 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-07-07 05:21:06,614 INFO MainThread:2481 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-07-07 05:21:06,619 INFO MainThread:2481 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-07-07 05:21:06,619 INFO MainThread:2481 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-07-07 05:21:06,620 INFO MainThread:2481 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-07-07 05:25:59,072 INFO MainThread:2481 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070513_2datasets_construct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 1, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 8, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
|
| 24 |
+
2025-07-07 05:29:57,621 INFO MsgRouterThr:2481 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/run-615z4bme.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5d1bbe7396b4e8ab1a6d6cf3abef6965bcd254b0974ca9172975647a4cc3e5a
|
| 3 |
+
size 196608
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/config.yaml
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.11
|
| 4 |
+
m:
|
| 5 |
+
- "1": trainer/global_step
|
| 6 |
+
"6":
|
| 7 |
+
- 3
|
| 8 |
+
"7": []
|
| 9 |
+
- "1": loss
|
| 10 |
+
"5": 1
|
| 11 |
+
"6":
|
| 12 |
+
- 1
|
| 13 |
+
- 3
|
| 14 |
+
"7": []
|
| 15 |
+
- "1": lr
|
| 16 |
+
"5": 1
|
| 17 |
+
"6":
|
| 18 |
+
- 1
|
| 19 |
+
- 3
|
| 20 |
+
"7": []
|
| 21 |
+
- "1": dataset0/rouge_1
|
| 22 |
+
"5": 1
|
| 23 |
+
"6":
|
| 24 |
+
- 1
|
| 25 |
+
- 3
|
| 26 |
+
"7": []
|
| 27 |
+
- "1": dataset0/rouge_l
|
| 28 |
+
"5": 1
|
| 29 |
+
"6":
|
| 30 |
+
- 1
|
| 31 |
+
- 3
|
| 32 |
+
"7": []
|
| 33 |
+
- "1": dataset0/meteor_score
|
| 34 |
+
"5": 1
|
| 35 |
+
"6":
|
| 36 |
+
- 1
|
| 37 |
+
- 3
|
| 38 |
+
"7": []
|
| 39 |
+
- "1": dataset0/bleu2
|
| 40 |
+
"5": 1
|
| 41 |
+
"6":
|
| 42 |
+
- 1
|
| 43 |
+
- 3
|
| 44 |
+
"7": []
|
| 45 |
+
- "1": dataset0/bleu4
|
| 46 |
+
"5": 1
|
| 47 |
+
"6":
|
| 48 |
+
- 1
|
| 49 |
+
- 3
|
| 50 |
+
"7": []
|
| 51 |
+
- "1": epoch
|
| 52 |
+
"5": 1
|
| 53 |
+
"6":
|
| 54 |
+
- 1
|
| 55 |
+
- 3
|
| 56 |
+
"7": []
|
| 57 |
+
- "1": dataloader2/val loss/dataloader_idx_2
|
| 58 |
+
"5": 1
|
| 59 |
+
"6":
|
| 60 |
+
- 1
|
| 61 |
+
- 3
|
| 62 |
+
"7": []
|
| 63 |
+
- "1": dataloader0/val loss/dataloader_idx_0
|
| 64 |
+
"5": 1
|
| 65 |
+
"6":
|
| 66 |
+
- 1
|
| 67 |
+
- 3
|
| 68 |
+
"7": []
|
| 69 |
+
- "1": dataset0/acc
|
| 70 |
+
"5": 1
|
| 71 |
+
"6":
|
| 72 |
+
- 1
|
| 73 |
+
- 3
|
| 74 |
+
"7": []
|
| 75 |
+
- "1": dataset0/rouge_2
|
| 76 |
+
"5": 1
|
| 77 |
+
"6":
|
| 78 |
+
- 1
|
| 79 |
+
- 3
|
| 80 |
+
"7": []
|
| 81 |
+
python_version: 3.10.0
|
| 82 |
+
t:
|
| 83 |
+
"1":
|
| 84 |
+
- 1
|
| 85 |
+
- 5
|
| 86 |
+
- 9
|
| 87 |
+
- 11
|
| 88 |
+
- 33
|
| 89 |
+
- 41
|
| 90 |
+
- 49
|
| 91 |
+
- 53
|
| 92 |
+
- 55
|
| 93 |
+
- 63
|
| 94 |
+
- 103
|
| 95 |
+
"2":
|
| 96 |
+
- 1
|
| 97 |
+
- 5
|
| 98 |
+
- 9
|
| 99 |
+
- 11
|
| 100 |
+
- 33
|
| 101 |
+
- 41
|
| 102 |
+
- 49
|
| 103 |
+
- 53
|
| 104 |
+
- 55
|
| 105 |
+
- 63
|
| 106 |
+
- 103
|
| 107 |
+
"3":
|
| 108 |
+
- 7
|
| 109 |
+
- 23
|
| 110 |
+
- 55
|
| 111 |
+
- 66
|
| 112 |
+
"4": 3.10.0
|
| 113 |
+
"5": 0.19.11
|
| 114 |
+
"6": 4.52.3
|
| 115 |
+
"8":
|
| 116 |
+
- 5
|
| 117 |
+
"12": 0.19.11
|
| 118 |
+
"13": linux-x86_64
|
| 119 |
+
a_max_len:
|
| 120 |
+
value: 36
|
| 121 |
+
accelerator:
|
| 122 |
+
value: gpu
|
| 123 |
+
accumulate_grad_batches:
|
| 124 |
+
value: 1
|
| 125 |
+
batch_size:
|
| 126 |
+
value: 4
|
| 127 |
+
bert_name:
|
| 128 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
|
| 129 |
+
caption_eval_epoch:
|
| 130 |
+
value: 10
|
| 131 |
+
check_val_every_n_epoch:
|
| 132 |
+
value: 1
|
| 133 |
+
cross_attention_freq:
|
| 134 |
+
value: 2
|
| 135 |
+
devices:
|
| 136 |
+
value: 0,1,2,3,4,5,6,7
|
| 137 |
+
do_sample:
|
| 138 |
+
value: false
|
| 139 |
+
enable_flash:
|
| 140 |
+
value: false
|
| 141 |
+
enbale_gradient_checkpointing:
|
| 142 |
+
value: false
|
| 143 |
+
filename:
|
| 144 |
+
value: stage2_07070513_2datasets_construct
|
| 145 |
+
filter_side_qa:
|
| 146 |
+
value: false
|
| 147 |
+
inference_batch_size:
|
| 148 |
+
value: 4
|
| 149 |
+
init_checkpoint:
|
| 150 |
+
value: ""
|
| 151 |
+
init_lr:
|
| 152 |
+
value: 0.0001
|
| 153 |
+
llm_name:
|
| 154 |
+
value: /oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged
|
| 155 |
+
llm_tune:
|
| 156 |
+
value: mid_lora
|
| 157 |
+
lora_alpha:
|
| 158 |
+
value: 16
|
| 159 |
+
lora_dropout:
|
| 160 |
+
value: 0.1
|
| 161 |
+
lora_r:
|
| 162 |
+
value: 8
|
| 163 |
+
lr_decay_rate:
|
| 164 |
+
value: 0.9
|
| 165 |
+
max_epochs:
|
| 166 |
+
value: 10
|
| 167 |
+
max_inference_len:
|
| 168 |
+
value: 128
|
| 169 |
+
min_inference_len:
|
| 170 |
+
value: 1
|
| 171 |
+
min_lr:
|
| 172 |
+
value: 1e-05
|
| 173 |
+
mix_dataset:
|
| 174 |
+
value: true
|
| 175 |
+
mode:
|
| 176 |
+
value: train
|
| 177 |
+
num_beams:
|
| 178 |
+
value: 5
|
| 179 |
+
num_query_token:
|
| 180 |
+
value: 8
|
| 181 |
+
num_workers:
|
| 182 |
+
value: 8
|
| 183 |
+
peft_config:
|
| 184 |
+
value: ""
|
| 185 |
+
peft_dir:
|
| 186 |
+
value: ""
|
| 187 |
+
plm_model:
|
| 188 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
|
| 189 |
+
plm_tune:
|
| 190 |
+
value: freeze
|
| 191 |
+
precision:
|
| 192 |
+
value: bf16-mixed
|
| 193 |
+
prompt:
|
| 194 |
+
value: 'The protein has the following properties: '
|
| 195 |
+
prot_max_len:
|
| 196 |
+
value: 1024
|
| 197 |
+
q_max_len:
|
| 198 |
+
value: 29
|
| 199 |
+
root:
|
| 200 |
+
value: data
|
| 201 |
+
save_every_n_epochs:
|
| 202 |
+
value: 1
|
| 203 |
+
scheduler:
|
| 204 |
+
value: linear_warmup_cosine_lr
|
| 205 |
+
seed:
|
| 206 |
+
value: 42
|
| 207 |
+
stage1_path:
|
| 208 |
+
value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt
|
| 209 |
+
stage2_path:
|
| 210 |
+
value: ""
|
| 211 |
+
strategy:
|
| 212 |
+
value: deepspeed
|
| 213 |
+
text_max_len:
|
| 214 |
+
value: 1024
|
| 215 |
+
use_wandb_logger:
|
| 216 |
+
value: true
|
| 217 |
+
warmup_lr:
|
| 218 |
+
value: 1e-06
|
| 219 |
+
warmup_steps:
|
| 220 |
+
value: 1000
|
| 221 |
+
weight_decay:
|
| 222 |
+
value: 0.05
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/output.log
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070513_2datasets_construct exists and is not empty.
|
| 2 |
+
Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
|
| 3 |
+
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
|
| 4 |
+
|
| 5 |
+
| Name | Type | Params | Mode
|
| 6 |
+
-------------------------------------------
|
| 7 |
+
0 | blip2 | Blip2OPT | 7.9 B | train
|
| 8 |
+
-------------------------------------------
|
| 9 |
+
104 M Trainable params
|
| 10 |
+
7.8 B Non-trainable params
|
| 11 |
+
7.9 B Total params
|
| 12 |
+
31,459.025Total estimated model params size (MB)
|
| 13 |
+
174 Modules in train mode
|
| 14 |
+
1203 Modules in eval mode
|
| 15 |
+
Epoch 9: 100%|████████████████████████████████████████████████████████████████| 26653/26653 [6:28:32<00:00, 1.14it/s, v_num=n0v3]BLEU-2 score: 26.653377377338177
|
| 16 |
+
BLEU-4 score: 20.796051979558282████████████████████████████████████████████████████████████████| 313/313 [23:19<00:00, 0.22it/s]
|
| 17 |
+
/nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
| 18 |
+
sd = self.module.state_dict(destination, prefix, keep_vars)
|
| 19 |
+
20000it [01:34, 212.73it/s]
|
| 20 |
+
20000it [00:35, 568.59it/s]
|
| 21 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 22 |
+
Average Meteor score: 27.78854434762939
|
| 23 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 24 |
+
ROUGE score:
|
| 25 |
+
rouge1: 32.64128335687479
|
| 26 |
+
rouge2: 20.424108167541117
|
| 27 |
+
rougeL: 27.292150023395635
|
| 28 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu4', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 29 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_1', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 30 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 31 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_l', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 32 |
+
/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/meteor_score', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
|
| 33 |
+
Epoch 9: 100%|████████████████████████████████████████████████████████████████| 26653/26653 [7:22:40<00:00, 1.00it/s, v_num=n0v3]
|
| 34 |
+
|
| 35 |
+
`Trainer.fit` stopped: `max_epochs=10` reached.
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/requirements.txt
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gitdb==4.0.12
|
| 2 |
+
smmap==5.0.2
|
| 3 |
+
wcwidth==0.2.13
|
| 4 |
+
streamlit==1.45.1
|
| 5 |
+
antlr4-python3-runtime==4.9.3
|
| 6 |
+
MarkupSafe==3.0.2
|
| 7 |
+
markdown-it-py==3.0.0
|
| 8 |
+
PyYAML==6.0.2
|
| 9 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 10 |
+
text-unidecode==1.3
|
| 11 |
+
msgpack==1.1.0
|
| 12 |
+
pillow==11.2.1
|
| 13 |
+
wrapt==1.17.2
|
| 14 |
+
tifffile==2025.5.10
|
| 15 |
+
nvidia-curand-cu12==10.3.5.147
|
| 16 |
+
networkx==3.4.2
|
| 17 |
+
fonttools==4.58.0
|
| 18 |
+
plotly==6.1.1
|
| 19 |
+
matplotlib==3.10.3
|
| 20 |
+
certifi==2025.4.26
|
| 21 |
+
altair==5.5.0
|
| 22 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 23 |
+
ninja==1.11.1.4
|
| 24 |
+
tzdata==2025.2
|
| 25 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 26 |
+
weasel==0.4.1
|
| 27 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 28 |
+
jedi==0.19.2
|
| 29 |
+
GitPython==3.1.44
|
| 30 |
+
pandas==2.2.3
|
| 31 |
+
python-slugify==8.0.4
|
| 32 |
+
omegaconf==2.3.0
|
| 33 |
+
kiwisolver==1.4.8
|
| 34 |
+
tenacity==9.1.2
|
| 35 |
+
pydantic==2.11.5
|
| 36 |
+
async-timeout==5.0.1
|
| 37 |
+
tqdm==4.67.1
|
| 38 |
+
confection==0.1.5
|
| 39 |
+
six==1.17.0
|
| 40 |
+
portalocker==3.1.1
|
| 41 |
+
regex==2024.11.6
|
| 42 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 43 |
+
packaging==24.2
|
| 44 |
+
annotated-types==0.7.0
|
| 45 |
+
salesforce-lavis==1.0.2
|
| 46 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 47 |
+
lightning-utilities==0.14.3
|
| 48 |
+
pytz==2025.2
|
| 49 |
+
smart-open==7.1.0
|
| 50 |
+
cachetools==5.5.2
|
| 51 |
+
nltk==3.9.1
|
| 52 |
+
torchmetrics==1.7.1
|
| 53 |
+
pexpect==4.9.0
|
| 54 |
+
jsonschema-specifications==2025.4.1
|
| 55 |
+
Jinja2==3.1.6
|
| 56 |
+
hjson==3.1.0
|
| 57 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 58 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 59 |
+
decord==0.6.0
|
| 60 |
+
joblib==1.5.1
|
| 61 |
+
kaggle==1.7.4.5
|
| 62 |
+
psutil==7.0.0
|
| 63 |
+
absl-py==2.2.2
|
| 64 |
+
Pygments==2.19.1
|
| 65 |
+
idna==3.10
|
| 66 |
+
aiohappyeyeballs==2.6.1
|
| 67 |
+
tornado==6.5.1
|
| 68 |
+
cycler==0.12.1
|
| 69 |
+
deepspeed==0.16.10+b666844f
|
| 70 |
+
torchvision==0.21.0
|
| 71 |
+
exceptiongroup==1.3.0
|
| 72 |
+
cfgv==3.4.0
|
| 73 |
+
py-cpuinfo==9.0.0
|
| 74 |
+
webdataset==0.2.111
|
| 75 |
+
murmurhash==1.0.13
|
| 76 |
+
asttokens==3.0.0
|
| 77 |
+
spacy==3.8.7
|
| 78 |
+
blinker==1.9.0
|
| 79 |
+
python-dateutil==2.9.0.post0
|
| 80 |
+
prompt_toolkit==3.0.51
|
| 81 |
+
referencing==0.36.2
|
| 82 |
+
contourpy==1.3.2
|
| 83 |
+
mpmath==1.3.0
|
| 84 |
+
thinc==8.3.6
|
| 85 |
+
pycocotools==2.0.8
|
| 86 |
+
python-magic==0.4.27
|
| 87 |
+
fairscale==0.4.4
|
| 88 |
+
nodeenv==1.9.1
|
| 89 |
+
identify==2.6.12
|
| 90 |
+
ftfy==6.3.1
|
| 91 |
+
spacy-legacy==3.0.12
|
| 92 |
+
cymem==2.0.11
|
| 93 |
+
typing-inspection==0.4.1
|
| 94 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 95 |
+
filelock==3.18.0
|
| 96 |
+
language_data==1.3.0
|
| 97 |
+
iopath==0.1.10
|
| 98 |
+
pre_commit==4.2.0
|
| 99 |
+
toml==0.10.2
|
| 100 |
+
lazy_loader==0.4
|
| 101 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 102 |
+
nvidia-nvtx-cu12==12.4.127
|
| 103 |
+
rouge_score==0.1.2
|
| 104 |
+
pycocoevalcap==1.2
|
| 105 |
+
pyparsing==3.2.3
|
| 106 |
+
mdurl==0.1.2
|
| 107 |
+
pure_eval==0.2.3
|
| 108 |
+
ipython==8.36.0
|
| 109 |
+
langcodes==3.5.0
|
| 110 |
+
distlib==0.3.9
|
| 111 |
+
pydeck==0.9.1
|
| 112 |
+
traitlets==5.14.3
|
| 113 |
+
decorator==5.2.1
|
| 114 |
+
requests==2.32.3
|
| 115 |
+
pydantic_core==2.33.2
|
| 116 |
+
matplotlib-inline==0.1.7
|
| 117 |
+
hf-xet==1.1.2
|
| 118 |
+
opendatasets==0.1.22
|
| 119 |
+
attrs==25.3.0
|
| 120 |
+
urllib3==2.4.0
|
| 121 |
+
typing_extensions==4.13.2
|
| 122 |
+
bleach==6.2.0
|
| 123 |
+
rich==14.0.0
|
| 124 |
+
imageio==2.37.0
|
| 125 |
+
yarl==1.20.0
|
| 126 |
+
platformdirs==4.3.8
|
| 127 |
+
multidict==6.4.4
|
| 128 |
+
catalogue==2.0.10
|
| 129 |
+
wasabi==1.1.3
|
| 130 |
+
scikit-image==0.25.2
|
| 131 |
+
blis==1.3.0
|
| 132 |
+
pyarrow==20.0.0
|
| 133 |
+
parso==0.8.4
|
| 134 |
+
rpds-py==0.25.1
|
| 135 |
+
opencv-python-headless==4.5.5.64
|
| 136 |
+
braceexpand==0.1.7
|
| 137 |
+
frozenlist==1.6.0
|
| 138 |
+
numpy==2.2.6
|
| 139 |
+
cloudpathlib==0.21.1
|
| 140 |
+
srsly==2.5.1
|
| 141 |
+
webencodings==0.5.1
|
| 142 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 143 |
+
einops==0.8.1
|
| 144 |
+
setuptools==78.1.1
|
| 145 |
+
nvidia-nccl-cu12==2.21.5
|
| 146 |
+
ptyprocess==0.7.0
|
| 147 |
+
torch==2.6.0
|
| 148 |
+
scipy==1.15.3
|
| 149 |
+
nvidia-ml-py==12.575.51
|
| 150 |
+
aiosignal==1.3.2
|
| 151 |
+
virtualenv==20.31.2
|
| 152 |
+
protobuf==6.31.0
|
| 153 |
+
contexttimer==0.3.3
|
| 154 |
+
marisa-trie==1.2.1
|
| 155 |
+
shellingham==1.5.4
|
| 156 |
+
charset-normalizer==3.4.2
|
| 157 |
+
propcache==0.3.1
|
| 158 |
+
executing==2.2.0
|
| 159 |
+
pytorch-lightning==2.5.1.post0
|
| 160 |
+
stack-data==0.6.3
|
| 161 |
+
sentencepiece==0.2.0
|
| 162 |
+
sympy==1.13.1
|
| 163 |
+
wheel==0.45.1
|
| 164 |
+
safetensors==0.5.3
|
| 165 |
+
triton==3.2.0
|
| 166 |
+
watchdog==6.0.0
|
| 167 |
+
spacy-loggers==1.0.5
|
| 168 |
+
timm==0.4.12
|
| 169 |
+
docker-pycreds==0.4.0
|
| 170 |
+
setproctitle==1.3.6
|
| 171 |
+
jmespath==0.10.0
|
| 172 |
+
pycryptodome==3.23.0
|
| 173 |
+
opendelta==0.3.2
|
| 174 |
+
aliyun-python-sdk-core==2.16.0
|
| 175 |
+
dill==0.3.8
|
| 176 |
+
xxhash==3.5.0
|
| 177 |
+
crcmod==1.7
|
| 178 |
+
aiohttp==3.12.2
|
| 179 |
+
sentry-sdk==2.29.1
|
| 180 |
+
huggingface-hub==0.32.1
|
| 181 |
+
jaraco.functools==4.1.0
|
| 182 |
+
pathlib==1.0.1
|
| 183 |
+
multiprocess==0.70.16
|
| 184 |
+
flash-attn==2.7.1.post1
|
| 185 |
+
jsonschema==4.24.0
|
| 186 |
+
datasets==3.6.0
|
| 187 |
+
cffi==1.17.1
|
| 188 |
+
pycparser==2.22
|
| 189 |
+
fsspec==2025.3.0
|
| 190 |
+
more-itertools==10.7.0
|
| 191 |
+
cryptography==45.0.3
|
| 192 |
+
tokenizers==0.21.1
|
| 193 |
+
cheroot==10.0.1
|
| 194 |
+
pip==25.1.1
|
| 195 |
+
preshed==3.0.10
|
| 196 |
+
transformers==4.52.3
|
| 197 |
+
oss2==2.15.0
|
| 198 |
+
yacs==0.1.8
|
| 199 |
+
wandb==0.19.11
|
| 200 |
+
bigmodelvis==0.0.1
|
| 201 |
+
web.py==0.62
|
| 202 |
+
opencv-python==4.11.0.86
|
| 203 |
+
threadpoolctl==3.6.0
|
| 204 |
+
typer==0.16.0
|
| 205 |
+
narwhals==1.41.0
|
| 206 |
+
delta-center-client==0.0.4
|
| 207 |
+
aliyun-python-sdk-kms==2.16.5
|
| 208 |
+
click==8.2.1
|
| 209 |
+
scikit-learn==1.6.1
|
| 210 |
+
jaraco.text==3.12.1
|
| 211 |
+
autocommand==2.2.2
|
| 212 |
+
packaging==24.2
|
| 213 |
+
jaraco.context==5.3.0
|
| 214 |
+
tomli==2.0.1
|
| 215 |
+
typeguard==4.3.0
|
| 216 |
+
zipp==3.19.2
|
| 217 |
+
backports.tarfile==1.2.0
|
| 218 |
+
typing_extensions==4.12.2
|
| 219 |
+
jaraco.collections==5.1.0
|
| 220 |
+
inflect==7.3.1
|
| 221 |
+
more-itertools==10.3.0
|
| 222 |
+
jaraco.functools==4.0.1
|
| 223 |
+
importlib_metadata==8.0.0
|
| 224 |
+
platformdirs==4.2.2
|
| 225 |
+
wheel==0.45.1
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-07-06T21:32:22.534976Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--devices",
|
| 7 |
+
"0,1,2,3,4,5,6,7",
|
| 8 |
+
"--mode",
|
| 9 |
+
"train",
|
| 10 |
+
"--filename",
|
| 11 |
+
"stage2_07070513_2datasets_construct",
|
| 12 |
+
"--num_query_token",
|
| 13 |
+
"8",
|
| 14 |
+
"--save_every_n_epochs",
|
| 15 |
+
"1",
|
| 16 |
+
"--max_epochs",
|
| 17 |
+
"10",
|
| 18 |
+
"--batch_size",
|
| 19 |
+
"4",
|
| 20 |
+
"--precision",
|
| 21 |
+
"bf16-mixed",
|
| 22 |
+
"--num_workers",
|
| 23 |
+
"8",
|
| 24 |
+
"--plm_model",
|
| 25 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
|
| 26 |
+
"--bert_name",
|
| 27 |
+
"/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
|
| 28 |
+
"--llm_name",
|
| 29 |
+
"/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 30 |
+
"--llm_tune",
|
| 31 |
+
"mid_lora",
|
| 32 |
+
"--mix_dataset",
|
| 33 |
+
"--stage1_path",
|
| 34 |
+
"/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt",
|
| 35 |
+
"--use_wandb_logger"
|
| 36 |
+
],
|
| 37 |
+
"program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
|
| 38 |
+
"codePath": "stage2.py",
|
| 39 |
+
"email": "gia0603yucca@gmail.com",
|
| 40 |
+
"root": "./all_checkpoints/stage2_07070513_2datasets_construct/",
|
| 41 |
+
"host": "dsw-251511-69b5b47496-4bcxh",
|
| 42 |
+
"executable": "/root/miniconda3/envs/protT3/bin/python",
|
| 43 |
+
"codePathLocal": "stage2.py",
|
| 44 |
+
"cpu_count": 64,
|
| 45 |
+
"cpu_count_logical": 64,
|
| 46 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 47 |
+
"gpu_count": 8,
|
| 48 |
+
"disk": {
|
| 49 |
+
"/": {
|
| 50 |
+
"total": "1623302262784",
|
| 51 |
+
"used": "1260400640"
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"memory": {
|
| 55 |
+
"total": "549755813888"
|
| 56 |
+
},
|
| 57 |
+
"cpu": {
|
| 58 |
+
"count": 64,
|
| 59 |
+
"countLogical": 64
|
| 60 |
+
},
|
| 61 |
+
"gpu_nvidia": [
|
| 62 |
+
{
|
| 63 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 64 |
+
"memoryTotal": "85198045184",
|
| 65 |
+
"architecture": "Ampere"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 69 |
+
"memoryTotal": "85198045184",
|
| 70 |
+
"architecture": "Ampere"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 74 |
+
"memoryTotal": "85198045184",
|
| 75 |
+
"architecture": "Ampere"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 79 |
+
"memoryTotal": "85198045184",
|
| 80 |
+
"architecture": "Ampere"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 84 |
+
"memoryTotal": "85198045184",
|
| 85 |
+
"architecture": "Ampere"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 89 |
+
"memoryTotal": "85198045184",
|
| 90 |
+
"architecture": "Ampere"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 94 |
+
"memoryTotal": "85198045184",
|
| 95 |
+
"architecture": "Ampere"
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 99 |
+
"memoryTotal": "85198045184",
|
| 100 |
+
"architecture": "Ampere"
|
| 101 |
+
}
|
| 102 |
+
],
|
| 103 |
+
"cudaVersion": "12.1"
|
| 104 |
+
}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_runtime":238677.754852379,"trainer/global_step":266529,"dataloader0/val loss/dataloader_idx_0":0.3741031885147095,"dataset0/acc":0,"dataset0/meteor_score":27.788543701171875,"_wandb":{"runtime":238690},"dataset0/bleu4":20.796052932739258,"dataset0/rouge_2":20.424108505249023,"lr":1.2202456673549023e-05,"_timestamp":1.7520762202895813e+09,"dataloader2/val loss/dataloader_idx_2":0.2067195624113083,"dataset0/rouge_1":32.64128494262695,"loss":0.114682637155056,"epoch":9,"dataset0/bleu2":26.653377532958984,"dataset0/rouge_l":27.292150497436523,"_step":5339}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-07-07T05:32:22.544190733+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-07-07T05:32:23.597067843+08:00","level":"INFO","msg":"created new stream","id":"9cjzn0v3"}
|
| 3 |
+
{"time":"2025-07-07T05:32:23.59711309+08:00","level":"INFO","msg":"stream: started","id":"9cjzn0v3"}
|
| 4 |
+
{"time":"2025-07-07T05:32:23.59715533+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9cjzn0v3"}
|
| 5 |
+
{"time":"2025-07-07T05:32:23.597176058+08:00","level":"INFO","msg":"handler: started","stream_id":"9cjzn0v3"}
|
| 6 |
+
{"time":"2025-07-07T05:32:23.597249736+08:00","level":"INFO","msg":"sender: started","stream_id":"9cjzn0v3"}
|
| 7 |
+
{"time":"2025-07-07T05:32:24.815832776+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-07-07T16:23:26.191588391+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:46082->172.67.193.61:443: read: connection timed out"}
|
| 9 |
+
{"time":"2025-07-08T07:36:01.662714436+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
|
| 10 |
+
{"time":"2025-07-08T07:39:35.510926561+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
|
| 11 |
+
{"time":"2025-07-09T00:01:13.718163538+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:52128->172.67.193.61:443: read: connection reset by peer"}
|
| 12 |
+
{"time":"2025-07-09T00:04:13.715227056+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:56708->104.21.20.172:443: read: connection reset by peer"}
|
| 13 |
+
{"time":"2025-07-09T00:41:59.079495986+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:39968->172.67.193.61:443: read: connection reset by peer"}
|
| 14 |
+
{"time":"2025-07-09T00:50:28.436723591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 15 |
+
{"time":"2025-07-09T01:04:28.736382048+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:48068->104.21.20.172:443: read: connection reset by peer"}
|
| 16 |
+
{"time":"2025-07-09T01:36:13.71400828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:33646->172.67.193.61:443: read: connection reset by peer"}
|
| 17 |
+
{"time":"2025-07-09T06:33:13.899246984+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
|
| 18 |
+
{"time":"2025-07-09T14:42:33.327607005+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:36168->104.21.20.172:443: read: connection timed out"}
|
| 19 |
+
{"time":"2025-07-09T22:35:09.035751509+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
|
| 20 |
+
{"time":"2025-07-09T23:18:37.03957561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56104->172.67.193.61:443: read: connection timed out"}
|
| 21 |
+
{"time":"2025-07-09T23:35:24.650683333+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56892->172.67.193.61:443: read: connection reset by peer"}
|
| 22 |
+
{"time":"2025-07-09T23:50:32.561736786+08:00","level":"INFO","msg":"stream: closing","id":"9cjzn0v3"}
|
| 23 |
+
{"time":"2025-07-09T23:50:32.56179589+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 24 |
+
{"time":"2025-07-09T23:50:32.564495033+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 25 |
+
{"time":"2025-07-09T23:50:38.466118847+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 26 |
+
{"time":"2025-07-09T23:50:40.207050581+08:00","level":"INFO","msg":"handler: closed","stream_id":"9cjzn0v3"}
|
| 27 |
+
{"time":"2025-07-09T23:50:40.207095276+08:00","level":"INFO","msg":"sender: closed","stream_id":"9cjzn0v3"}
|
| 28 |
+
{"time":"2025-07-09T23:50:40.207092571+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"9cjzn0v3"}
|
| 29 |
+
{"time":"2025-07-09T23:50:40.211547321+08:00","level":"INFO","msg":"stream: closed","id":"9cjzn0v3"}
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
|
| 2 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Configure stats pid to 9598
|
| 3 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
|
| 5 |
+
2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from environment variables
|
| 6 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log
|
| 7 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log
|
| 8 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-07-07 05:32:22,529 INFO MainThread:9598 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-07-07 05:32:22,531 INFO MainThread:9598 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-07-07 05:32:22,535 INFO MainThread:9598 [wandb_init.py:init():1005] updated telemetry
|
| 16 |
+
2025-07-07 05:32:22,540 INFO MainThread:9598 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-07-07 05:32:24,807 INFO MainThread:9598 [wandb_init.py:init():1104] starting run threads in backend
|
| 18 |
+
2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_console_start():2573] atexit reg
|
| 19 |
+
2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_redirect():2421] redirect: wrap_raw
|
| 20 |
+
2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2490] Wrapping output streams.
|
| 21 |
+
2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2513] Redirects installed.
|
| 22 |
+
2025-07-07 05:32:24,961 INFO MainThread:9598 [wandb_init.py:init():1150] run started, returning control to user process
|
| 23 |
+
2025-07-07 05:32:33,644 INFO MainThread:9598 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070513_2datasets_construct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 1, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 4, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
|
| 24 |
+
2025-07-09 23:50:32,550 INFO MsgRouterThr:9598 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
|
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/run-9cjzn0v3.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6cffb771629ea66d100de0be9a1ef1c3f9599c478045c824d755e2ea04fe379
|
| 3 |
+
size 199737973
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:246a0d8dbc7414f986a333879b13c36671f129b3b117d7b4066f3928cb35bc99
|
| 3 |
+
size 156403632
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2c9fc89299dca46abb0b74a86e998a1f7b2026cca0ce2dcf590da1a68df2186
|
| 3 |
+
size 156402992
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b9d65b0b283d3d437c8eb4955242a571b50d7503bb4d371742052fe466db312
|
| 3 |
+
size 156403376
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:735a857a56751f34bbcf2516fd6b038ff0617777b78e86a12ba3d5d4181e8119
|
| 3 |
+
size 156403120
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e14097721553b19f34683b33ddb67fa3d670e035d7260159b9e97aac7d7e851c
|
| 3 |
+
size 156402416
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe993f0a43264f371f9feef67c3aaf51480074bf4512a075fbcf26c11b405c6d
|
| 3 |
+
size 156403696
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:094d7605822331acc11f1cf58534bc7d48c4a66bd686ba9749042f97fe8def0e
|
| 3 |
+
size 156402992
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79ebf1ed60ade77ba2453feda74fbc5d71c1d55b9e7d67f1398ea6e6e7d45041
|
| 3 |
+
size 156417904
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e5f1a07851a422c60545e9fa30da0dc892ea9ef21aa34e05d2e11791664d89b6
|
| 3 |
+
size 208795320
|
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/converted.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f74c2b94f722a6ce4ebc2444d8247a103295f9673fbfab2529712ca70294557
|
| 3 |
+
size 417200548
|