yuccaaa commited on
Commit
a17e46e
·
verified ·
1 Parent(s): bbcacd6

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  4. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  5. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  6. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  7. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  8. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  9. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  10. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  11. all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/converted.ckpt +3 -0
  12. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  13. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  14. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  15. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  16. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  17. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  18. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  19. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  20. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  21. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/latest +3 -0
  22. all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/zero_to_fp32.py +3 -0
  23. all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug-internal.log +29 -0
  24. all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug.log +24 -0
  25. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/config.yaml +168 -0
  26. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/output.log +229 -0
  27. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/requirements.txt +225 -0
  28. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-metadata.json +104 -0
  29. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-summary.json +1 -0
  30. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-internal.log +10 -0
  31. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug.log +24 -0
  32. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/run-615z4bme.wandb +3 -0
  33. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/config.yaml +222 -0
  34. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/output.log +35 -0
  35. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/requirements.txt +225 -0
  36. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-metadata.json +104 -0
  37. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-summary.json +1 -0
  38. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log +29 -0
  39. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log +24 -0
  40. all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/run-9cjzn0v3.wandb +3 -0
  41. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  42. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  43. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  44. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  45. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  46. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  47. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  48. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  49. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  50. all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/converted.ckpt +3 -0
.gitattributes CHANGED
@@ -36,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb filter=lfs diff=lfs merge=lfs -text
37
  all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/run-6bkqzmou.wandb filter=lfs diff=lfs merge=lfs -text
38
  all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/run-gtrtcbb9.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
36
  all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/run-vgvxxzqc.wandb filter=lfs diff=lfs merge=lfs -text
37
  all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/run-6bkqzmou.wandb filter=lfs diff=lfs merge=lfs -text
38
  all_checkpoints/stage2_07070337_2datasets_noconstruct/wandb/run-20250707_041231-gtrtcbb9/run-gtrtcbb9.wandb filter=lfs diff=lfs merge=lfs -text
39
+ all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/run-615z4bme.wandb filter=lfs diff=lfs merge=lfs -text
40
+ all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/run-9cjzn0v3.wandb filter=lfs diff=lfs merge=lfs -text
41
+ all_checkpoints/stage2_07301646_2datasets_construct/wandb/run-20250730_175623-pbf2bxo6/run-pbf2bxo6.wandb filter=lfs diff=lfs merge=lfs -text
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab1a6ea1ac55ce9616532e761371379f0cb306bbf194e29d1a44cabe01dd4e3a
3
+ size 156403632
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0aa50226009f0d8a6a23d3d6e98e194a4f2a944d95a7f46bcde8d3aebd98e5
3
+ size 156402992
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd423ebd707139e3d47d635ea2251cb7f655b5f24650085a7abf71a1f26f3d05
3
+ size 156403376
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850b33a5acf35f5909aae8ad5672d7895c82f2024dc5b08d95b9ff338e08e519
3
+ size 156403120
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942855deca23d990f0db7f0ce36aafba768e914d42db3b7f38810250bf9e2c25
3
+ size 156402416
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14691a96f4b8d119ff12b6282ae802455fbec50f06f044bd597970f8331aca5
3
+ size 156403696
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60586788404b4008997d9fc2cee3b07a9a3f28324c0b0390bf54ef165acd2580
3
+ size 156402992
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b17dec7f7d4ac6aa06b05dfc462965b64f54028ed539e422ac2695da0c87828
3
+ size 156417904
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc87c01c9bbe626db3b00ce1ecc91c91503d810e852467d60441c897aa405eb
3
+ size 208795192
all_checkpoints/stage2_07070513_2datasets_construct/epoch=09.ckpt/converted.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb1faf00f4930709a0ce83e6276411772ea0ce8357a6dde45856eb25d7ba33b7
3
+ size 417200356
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab1a6ea1ac55ce9616532e761371379f0cb306bbf194e29d1a44cabe01dd4e3a
3
+ size 156403632
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0aa50226009f0d8a6a23d3d6e98e194a4f2a944d95a7f46bcde8d3aebd98e5
3
+ size 156402992
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd423ebd707139e3d47d635ea2251cb7f655b5f24650085a7abf71a1f26f3d05
3
+ size 156403376
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850b33a5acf35f5909aae8ad5672d7895c82f2024dc5b08d95b9ff338e08e519
3
+ size 156403120
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942855deca23d990f0db7f0ce36aafba768e914d42db3b7f38810250bf9e2c25
3
+ size 156402416
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14691a96f4b8d119ff12b6282ae802455fbec50f06f044bd597970f8331aca5
3
+ size 156403696
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60586788404b4008997d9fc2cee3b07a9a3f28324c0b0390bf54ef165acd2580
3
+ size 156402992
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b17dec7f7d4ac6aa06b05dfc462965b64f54028ed539e422ac2695da0c87828
3
+ size 156417904
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc87c01c9bbe626db3b00ce1ecc91c91503d810e852467d60441c897aa405eb
3
+ size 208795192
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/latest ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef
3
+ size 10
all_checkpoints/stage2_07070513_2datasets_construct/last.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46497565ccf2b4a8b1f6f18c8341042f3749605a94335c81f69df1bd268af64f
3
+ size 33272
all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug-internal.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-07T05:32:22.544190733+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-core.log"}
2
+ {"time":"2025-07-07T05:32:23.597067843+08:00","level":"INFO","msg":"created new stream","id":"9cjzn0v3"}
3
+ {"time":"2025-07-07T05:32:23.59711309+08:00","level":"INFO","msg":"stream: started","id":"9cjzn0v3"}
4
+ {"time":"2025-07-07T05:32:23.59715533+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9cjzn0v3"}
5
+ {"time":"2025-07-07T05:32:23.597176058+08:00","level":"INFO","msg":"handler: started","stream_id":"9cjzn0v3"}
6
+ {"time":"2025-07-07T05:32:23.597249736+08:00","level":"INFO","msg":"sender: started","stream_id":"9cjzn0v3"}
7
+ {"time":"2025-07-07T05:32:24.815832776+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-07-07T16:23:26.191588391+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:46082->172.67.193.61:443: read: connection timed out"}
9
+ {"time":"2025-07-08T07:36:01.662714436+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
10
+ {"time":"2025-07-08T07:39:35.510926561+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
11
+ {"time":"2025-07-09T00:01:13.718163538+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:52128->172.67.193.61:443: read: connection reset by peer"}
12
+ {"time":"2025-07-09T00:04:13.715227056+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:56708->104.21.20.172:443: read: connection reset by peer"}
13
+ {"time":"2025-07-09T00:41:59.079495986+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:39968->172.67.193.61:443: read: connection reset by peer"}
14
+ {"time":"2025-07-09T00:50:28.436723591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
15
+ {"time":"2025-07-09T01:04:28.736382048+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:48068->104.21.20.172:443: read: connection reset by peer"}
16
+ {"time":"2025-07-09T01:36:13.71400828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:33646->172.67.193.61:443: read: connection reset by peer"}
17
+ {"time":"2025-07-09T06:33:13.899246984+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
18
+ {"time":"2025-07-09T14:42:33.327607005+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:36168->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-07-09T22:35:09.035751509+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
20
+ {"time":"2025-07-09T23:18:37.03957561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56104->172.67.193.61:443: read: connection timed out"}
21
+ {"time":"2025-07-09T23:35:24.650683333+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56892->172.67.193.61:443: read: connection reset by peer"}
22
+ {"time":"2025-07-09T23:50:32.561736786+08:00","level":"INFO","msg":"stream: closing","id":"9cjzn0v3"}
23
+ {"time":"2025-07-09T23:50:32.56179589+08:00","level":"INFO","msg":"Stopping system monitor"}
24
+ {"time":"2025-07-09T23:50:32.564495033+08:00","level":"INFO","msg":"Stopped system monitor"}
25
+ {"time":"2025-07-09T23:50:38.466118847+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
26
+ {"time":"2025-07-09T23:50:40.207050581+08:00","level":"INFO","msg":"handler: closed","stream_id":"9cjzn0v3"}
27
+ {"time":"2025-07-09T23:50:40.207095276+08:00","level":"INFO","msg":"sender: closed","stream_id":"9cjzn0v3"}
28
+ {"time":"2025-07-09T23:50:40.207092571+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"9cjzn0v3"}
29
+ {"time":"2025-07-09T23:50:40.211547321+08:00","level":"INFO","msg":"stream: closed","id":"9cjzn0v3"}
all_checkpoints/stage2_07070513_2datasets_construct/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Configure stats pid to 9598
3
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log
7
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log
8
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():893] starting backend
12
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-07 05:32:22,529 INFO MainThread:9598 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-07 05:32:22,531 INFO MainThread:9598 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-07 05:32:22,535 INFO MainThread:9598 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-07 05:32:22,540 INFO MainThread:9598 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-07 05:32:24,807 INFO MainThread:9598 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-07 05:32:24,961 INFO MainThread:9598 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-07 05:32:33,644 INFO MainThread:9598 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070513_2datasets_construct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 1, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 4, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
24
+ 2025-07-09 23:50:32,550 INFO MsgRouterThr:9598 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/config.yaml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ - "1": loss
10
+ "5": 1
11
+ "6":
12
+ - 1
13
+ - 3
14
+ "7": []
15
+ - "1": lr
16
+ "5": 1
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": epoch
22
+ "5": 1
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ python_version: 3.10.0
28
+ t:
29
+ "1":
30
+ - 1
31
+ - 5
32
+ - 9
33
+ - 11
34
+ - 33
35
+ - 41
36
+ - 49
37
+ - 53
38
+ - 55
39
+ - 63
40
+ - 103
41
+ "2":
42
+ - 1
43
+ - 5
44
+ - 9
45
+ - 11
46
+ - 33
47
+ - 41
48
+ - 49
49
+ - 53
50
+ - 55
51
+ - 63
52
+ - 103
53
+ "3":
54
+ - 7
55
+ - 23
56
+ - 55
57
+ - 66
58
+ "4": 3.10.0
59
+ "5": 0.19.11
60
+ "6": 4.52.3
61
+ "8":
62
+ - 5
63
+ "12": 0.19.11
64
+ "13": linux-x86_64
65
+ a_max_len:
66
+ value: 36
67
+ accelerator:
68
+ value: gpu
69
+ accumulate_grad_batches:
70
+ value: 1
71
+ batch_size:
72
+ value: 8
73
+ bert_name:
74
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
75
+ caption_eval_epoch:
76
+ value: 10
77
+ check_val_every_n_epoch:
78
+ value: 1
79
+ cross_attention_freq:
80
+ value: 2
81
+ devices:
82
+ value: 0,1,2,3,4,5,6,7
83
+ do_sample:
84
+ value: false
85
+ enable_flash:
86
+ value: false
87
+ enbale_gradient_checkpointing:
88
+ value: false
89
+ filename:
90
+ value: stage2_07070513_2datasets_construct
91
+ filter_side_qa:
92
+ value: false
93
+ inference_batch_size:
94
+ value: 4
95
+ init_checkpoint:
96
+ value: ""
97
+ init_lr:
98
+ value: 0.0001
99
+ llm_name:
100
+ value: /oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged
101
+ llm_tune:
102
+ value: mid_lora
103
+ lora_alpha:
104
+ value: 16
105
+ lora_dropout:
106
+ value: 0.1
107
+ lora_r:
108
+ value: 8
109
+ lr_decay_rate:
110
+ value: 0.9
111
+ max_epochs:
112
+ value: 10
113
+ max_inference_len:
114
+ value: 128
115
+ min_inference_len:
116
+ value: 1
117
+ min_lr:
118
+ value: 1e-05
119
+ mix_dataset:
120
+ value: true
121
+ mode:
122
+ value: train
123
+ num_beams:
124
+ value: 5
125
+ num_query_token:
126
+ value: 8
127
+ num_workers:
128
+ value: 8
129
+ peft_config:
130
+ value: ""
131
+ peft_dir:
132
+ value: ""
133
+ plm_model:
134
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
135
+ plm_tune:
136
+ value: freeze
137
+ precision:
138
+ value: bf16-mixed
139
+ prompt:
140
+ value: 'The protein has the following properties: '
141
+ prot_max_len:
142
+ value: 1024
143
+ q_max_len:
144
+ value: 29
145
+ root:
146
+ value: data
147
+ save_every_n_epochs:
148
+ value: 1
149
+ scheduler:
150
+ value: linear_warmup_cosine_lr
151
+ seed:
152
+ value: 42
153
+ stage1_path:
154
+ value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt
155
+ stage2_path:
156
+ value: ""
157
+ strategy:
158
+ value: deepspeed
159
+ text_max_len:
160
+ value: 1024
161
+ use_wandb_logger:
162
+ value: true
163
+ warmup_lr:
164
+ value: 1e-06
165
+ warmup_steps:
166
+ value: 1000
167
+ weight_decay:
168
+ value: 0.05
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/output.log ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070513_2datasets_construct exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ -------------------------------------------
7
+ 0 | blip2 | Blip2OPT | 7.9 B | train
8
+ -------------------------------------------
9
+ 104 M Trainable params
10
+ 7.8 B Non-trainable params
11
+ 7.9 B Total params
12
+ 31,459.025Total estimated model params size (MB)
13
+ 174 Modules in train mode
14
+ 1203 Modules in eval mode
15
+ Epoch 0: 1%|▌ | 138/13326 [03:46<6:00:39, 0.61it/s, v_num=4bme]
16
+
17
+ Detected KeyboardInterrupt, attempting graceful shutdown ...
18
+ Traceback (most recent call last):
19
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
20
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
21
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
22
+ return function(*args, **kwargs)
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
24
+ self._run(model, ckpt_path=ckpt_path)
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
26
+ results = self._run_stage()
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
28
+ self.fit_loop.run()
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
30
+ self.advance()
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
32
+ self.epoch_loop.run(self._data_fetcher)
33
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run
34
+ self.advance(data_fetcher)
35
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance
36
+ batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run
38
+ self._optimizer_step(batch_idx, closure)
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step
40
+ call._call_lightning_module_hook(
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook
42
+ output = fn(*args, **kwargs)
43
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step
44
+ optimizer.step(closure=optimizer_closure)
45
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step
46
+ step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
47
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
48
+ optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
49
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step
50
+ return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
51
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step
52
+ closure_result = closure()
53
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__
54
+ self._result = self.closure(*args, **kwargs)
55
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
56
+ return func(*args, **kwargs)
57
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure
58
+ step_output = self._step_fn()
59
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step
60
+ training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
61
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
62
+ output = fn(*args, **kwargs)
63
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
64
+ return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
66
+ wrapper_output = wrapper_module(*args, **kwargs)
67
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
68
+ return self._call_impl(*args, **kwargs)
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
70
+ return forward_call(*args, **kwargs)
71
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
72
+ ret_val = func(*args, **kwargs)
73
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
74
+ loss = self.module(*inputs, **kwargs)
75
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
76
+ return self._call_impl(*args, **kwargs)
77
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
78
+ return inner()
79
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
80
+ result = forward_call(*args, **kwargs)
81
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
82
+ out = method(*_args, **_kwargs)
83
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 284, in training_step
84
+ self.log("loss", float(loss), batch_size=batch_size, sync_dist=True)
85
+ KeyboardInterrupt
86
+
87
+ During handling of the above exception, another exception occurred:
88
+
89
+ Traceback (most recent call last):
90
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module>
91
+ main(get_args())
92
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main
93
+ trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path)
94
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
95
+ call._call_and_handle_interrupt(
96
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 61, in _call_and_handle_interrupt
97
+ trainer._teardown()
98
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _teardown
99
+ self.strategy.teardown()
100
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 419, in teardown
101
+ super().teardown()
102
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/parallel.py", line 134, in teardown
103
+ super().teardown()
104
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 536, in teardown
105
+ self.lightning_module.cpu()
106
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lightning_fabric/utilities/device_dtype_mixin.py", line 82, in cpu
107
+ return super().cpu()
108
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in cpu
109
+ return self._apply(lambda t: t.cpu())
110
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
111
+ module._apply(fn)
112
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
113
+ module._apply(fn)
114
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
115
+ module._apply(fn)
116
+ [Previous line repeated 4 more times]
117
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 930, in _apply
118
+ param_applied = fn(param)
119
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in <lambda>
120
+ return self._apply(lambda t: t.cpu())
121
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
122
+ _error_if_any_worker_fails()
123
+ RuntimeError: DataLoader worker (pid 8028) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
124
+ [rank0]: Traceback (most recent call last):
125
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
126
+ [rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
127
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
128
+ [rank0]: return function(*args, **kwargs)
129
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
130
+ [rank0]: self._run(model, ckpt_path=ckpt_path)
131
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
132
+ [rank0]: results = self._run_stage()
133
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
134
+ [rank0]: self.fit_loop.run()
135
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
136
+ [rank0]: self.advance()
137
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
138
+ [rank0]: self.epoch_loop.run(self._data_fetcher)
139
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run
140
+ [rank0]: self.advance(data_fetcher)
141
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance
142
+ [rank0]: batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
143
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run
144
+ [rank0]: self._optimizer_step(batch_idx, closure)
145
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step
146
+ [rank0]: call._call_lightning_module_hook(
147
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook
148
+ [rank0]: output = fn(*args, **kwargs)
149
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step
150
+ [rank0]: optimizer.step(closure=optimizer_closure)
151
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step
152
+ [rank0]: step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
153
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
154
+ [rank0]: optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
155
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step
156
+ [rank0]: return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
157
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 129, in optimizer_step
158
+ [rank0]: closure_result = closure()
159
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__
160
+ [rank0]: self._result = self.closure(*args, **kwargs)
161
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
162
+ [rank0]: return func(*args, **kwargs)
163
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure
164
+ [rank0]: step_output = self._step_fn()
165
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step
166
+ [rank0]: training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
167
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
168
+ [rank0]: output = fn(*args, **kwargs)
169
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
170
+ [rank0]: return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
171
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
172
+ [rank0]: wrapper_output = wrapper_module(*args, **kwargs)
173
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
174
+ [rank0]: return self._call_impl(*args, **kwargs)
175
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
176
+ [rank0]: return forward_call(*args, **kwargs)
177
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
178
+ [rank0]: ret_val = func(*args, **kwargs)
179
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
180
+ [rank0]: loss = self.module(*inputs, **kwargs)
181
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
182
+ [rank0]: return self._call_impl(*args, **kwargs)
183
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
184
+ [rank0]: return inner()
185
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1793, in inner
186
+ [rank0]: result = forward_call(*args, **kwargs)
187
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
188
+ [rank0]: out = method(*_args, **_kwargs)
189
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage2.py", line 284, in training_step
190
+ [rank0]: self.log("loss", float(loss), batch_size=batch_size, sync_dist=True)
191
+ [rank0]: KeyboardInterrupt
192
+
193
+ [rank0]: During handling of the above exception, another exception occurred:
194
+
195
+ [rank0]: Traceback (most recent call last):
196
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 131, in <module>
197
+ [rank0]: main(get_args())
198
+ [rank0]: File "/nas/shared/kilab/wangyujia/ProtT3/stage2.py", line 93, in main
199
+ [rank0]: trainer.fit(model, datamodule=dm)#, ckpt_path=args.ckpt_path)
200
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
201
+ [rank0]: call._call_and_handle_interrupt(
202
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 61, in _call_and_handle_interrupt
203
+ [rank0]: trainer._teardown()
204
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _teardown
205
+ [rank0]: self.strategy.teardown()
206
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 419, in teardown
207
+ [rank0]: super().teardown()
208
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/parallel.py", line 134, in teardown
209
+ [rank0]: super().teardown()
210
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 536, in teardown
211
+ [rank0]: self.lightning_module.cpu()
212
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lightning_fabric/utilities/device_dtype_mixin.py", line 82, in cpu
213
+ [rank0]: return super().cpu()
214
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in cpu
215
+ [rank0]: return self._apply(lambda t: t.cpu())
216
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
217
+ [rank0]: module._apply(fn)
218
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
219
+ [rank0]: module._apply(fn)
220
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 903, in _apply
221
+ [rank0]: module._apply(fn)
222
+ [rank0]: [Previous line repeated 4 more times]
223
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 930, in _apply
224
+ [rank0]: param_applied = fn(param)
225
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1121, in <lambda>
226
+ [rank0]: return self._apply(lambda t: t.cpu())
227
+ [rank0]: File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 73, in handler
228
+ [rank0]: _error_if_any_worker_fails()
229
+ [rank0]: RuntimeError: DataLoader worker (pid 8028) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gitdb==4.0.12
2
+ smmap==5.0.2
3
+ wcwidth==0.2.13
4
+ streamlit==1.45.1
5
+ antlr4-python3-runtime==4.9.3
6
+ MarkupSafe==3.0.2
7
+ markdown-it-py==3.0.0
8
+ PyYAML==6.0.2
9
+ nvidia-cusolver-cu12==11.6.1.9
10
+ text-unidecode==1.3
11
+ msgpack==1.1.0
12
+ pillow==11.2.1
13
+ wrapt==1.17.2
14
+ tifffile==2025.5.10
15
+ nvidia-curand-cu12==10.3.5.147
16
+ networkx==3.4.2
17
+ fonttools==4.58.0
18
+ plotly==6.1.1
19
+ matplotlib==3.10.3
20
+ certifi==2025.4.26
21
+ altair==5.5.0
22
+ nvidia-cufft-cu12==11.2.1.3
23
+ ninja==1.11.1.4
24
+ tzdata==2025.2
25
+ nvidia-cublas-cu12==12.4.5.8
26
+ weasel==0.4.1
27
+ nvidia-cuda-nvrtc-cu12==12.4.127
28
+ jedi==0.19.2
29
+ GitPython==3.1.44
30
+ pandas==2.2.3
31
+ python-slugify==8.0.4
32
+ omegaconf==2.3.0
33
+ kiwisolver==1.4.8
34
+ tenacity==9.1.2
35
+ pydantic==2.11.5
36
+ async-timeout==5.0.1
37
+ tqdm==4.67.1
38
+ confection==0.1.5
39
+ six==1.17.0
40
+ portalocker==3.1.1
41
+ regex==2024.11.6
42
+ nvidia-cuda-runtime-cu12==12.4.127
43
+ packaging==24.2
44
+ annotated-types==0.7.0
45
+ salesforce-lavis==1.0.2
46
+ nvidia-nvjitlink-cu12==12.4.127
47
+ lightning-utilities==0.14.3
48
+ pytz==2025.2
49
+ smart-open==7.1.0
50
+ cachetools==5.5.2
51
+ nltk==3.9.1
52
+ torchmetrics==1.7.1
53
+ pexpect==4.9.0
54
+ jsonschema-specifications==2025.4.1
55
+ Jinja2==3.1.6
56
+ hjson==3.1.0
57
+ nvidia-cusparse-cu12==12.3.1.170
58
+ nvidia-cudnn-cu12==9.1.0.70
59
+ decord==0.6.0
60
+ joblib==1.5.1
61
+ kaggle==1.7.4.5
62
+ psutil==7.0.0
63
+ absl-py==2.2.2
64
+ Pygments==2.19.1
65
+ idna==3.10
66
+ aiohappyeyeballs==2.6.1
67
+ tornado==6.5.1
68
+ cycler==0.12.1
69
+ deepspeed==0.16.10+b666844f
70
+ torchvision==0.21.0
71
+ exceptiongroup==1.3.0
72
+ cfgv==3.4.0
73
+ py-cpuinfo==9.0.0
74
+ webdataset==0.2.111
75
+ murmurhash==1.0.13
76
+ asttokens==3.0.0
77
+ spacy==3.8.7
78
+ blinker==1.9.0
79
+ python-dateutil==2.9.0.post0
80
+ prompt_toolkit==3.0.51
81
+ referencing==0.36.2
82
+ contourpy==1.3.2
83
+ mpmath==1.3.0
84
+ thinc==8.3.6
85
+ pycocotools==2.0.8
86
+ python-magic==0.4.27
87
+ fairscale==0.4.4
88
+ nodeenv==1.9.1
89
+ identify==2.6.12
90
+ ftfy==6.3.1
91
+ spacy-legacy==3.0.12
92
+ cymem==2.0.11
93
+ typing-inspection==0.4.1
94
+ nvidia-cufile-cu12==1.11.1.6
95
+ filelock==3.18.0
96
+ language_data==1.3.0
97
+ iopath==0.1.10
98
+ pre_commit==4.2.0
99
+ toml==0.10.2
100
+ lazy_loader==0.4
101
+ nvidia-cusparselt-cu12==0.6.2
102
+ nvidia-nvtx-cu12==12.4.127
103
+ rouge_score==0.1.2
104
+ pycocoevalcap==1.2
105
+ pyparsing==3.2.3
106
+ mdurl==0.1.2
107
+ pure_eval==0.2.3
108
+ ipython==8.36.0
109
+ langcodes==3.5.0
110
+ distlib==0.3.9
111
+ pydeck==0.9.1
112
+ traitlets==5.14.3
113
+ decorator==5.2.1
114
+ requests==2.32.3
115
+ pydantic_core==2.33.2
116
+ matplotlib-inline==0.1.7
117
+ hf-xet==1.1.2
118
+ opendatasets==0.1.22
119
+ attrs==25.3.0
120
+ urllib3==2.4.0
121
+ typing_extensions==4.13.2
122
+ bleach==6.2.0
123
+ rich==14.0.0
124
+ imageio==2.37.0
125
+ yarl==1.20.0
126
+ platformdirs==4.3.8
127
+ multidict==6.4.4
128
+ catalogue==2.0.10
129
+ wasabi==1.1.3
130
+ scikit-image==0.25.2
131
+ blis==1.3.0
132
+ pyarrow==20.0.0
133
+ parso==0.8.4
134
+ rpds-py==0.25.1
135
+ opencv-python-headless==4.5.5.64
136
+ braceexpand==0.1.7
137
+ frozenlist==1.6.0
138
+ numpy==2.2.6
139
+ cloudpathlib==0.21.1
140
+ srsly==2.5.1
141
+ webencodings==0.5.1
142
+ nvidia-cuda-cupti-cu12==12.4.127
143
+ einops==0.8.1
144
+ setuptools==78.1.1
145
+ nvidia-nccl-cu12==2.21.5
146
+ ptyprocess==0.7.0
147
+ torch==2.6.0
148
+ scipy==1.15.3
149
+ nvidia-ml-py==12.575.51
150
+ aiosignal==1.3.2
151
+ virtualenv==20.31.2
152
+ protobuf==6.31.0
153
+ contexttimer==0.3.3
154
+ marisa-trie==1.2.1
155
+ shellingham==1.5.4
156
+ charset-normalizer==3.4.2
157
+ propcache==0.3.1
158
+ executing==2.2.0
159
+ pytorch-lightning==2.5.1.post0
160
+ stack-data==0.6.3
161
+ sentencepiece==0.2.0
162
+ sympy==1.13.1
163
+ wheel==0.45.1
164
+ safetensors==0.5.3
165
+ triton==3.2.0
166
+ watchdog==6.0.0
167
+ spacy-loggers==1.0.5
168
+ timm==0.4.12
169
+ docker-pycreds==0.4.0
170
+ setproctitle==1.3.6
171
+ jmespath==0.10.0
172
+ pycryptodome==3.23.0
173
+ opendelta==0.3.2
174
+ aliyun-python-sdk-core==2.16.0
175
+ dill==0.3.8
176
+ xxhash==3.5.0
177
+ crcmod==1.7
178
+ aiohttp==3.12.2
179
+ sentry-sdk==2.29.1
180
+ huggingface-hub==0.32.1
181
+ jaraco.functools==4.1.0
182
+ pathlib==1.0.1
183
+ multiprocess==0.70.16
184
+ flash-attn==2.7.1.post1
185
+ jsonschema==4.24.0
186
+ datasets==3.6.0
187
+ cffi==1.17.1
188
+ pycparser==2.22
189
+ fsspec==2025.3.0
190
+ more-itertools==10.7.0
191
+ cryptography==45.0.3
192
+ tokenizers==0.21.1
193
+ cheroot==10.0.1
194
+ pip==25.1.1
195
+ preshed==3.0.10
196
+ transformers==4.52.3
197
+ oss2==2.15.0
198
+ yacs==0.1.8
199
+ wandb==0.19.11
200
+ bigmodelvis==0.0.1
201
+ web.py==0.62
202
+ opencv-python==4.11.0.86
203
+ threadpoolctl==3.6.0
204
+ typer==0.16.0
205
+ narwhals==1.41.0
206
+ delta-center-client==0.0.4
207
+ aliyun-python-sdk-kms==2.16.5
208
+ click==8.2.1
209
+ scikit-learn==1.6.1
210
+ jaraco.text==3.12.1
211
+ autocommand==2.2.2
212
+ packaging==24.2
213
+ jaraco.context==5.3.0
214
+ tomli==2.0.1
215
+ typeguard==4.3.0
216
+ zipp==3.19.2
217
+ backports.tarfile==1.2.0
218
+ typing_extensions==4.12.2
219
+ jaraco.collections==5.1.0
220
+ inflect==7.3.1
221
+ more-itertools==10.3.0
222
+ jaraco.functools==4.0.1
223
+ importlib_metadata==8.0.0
224
+ platformdirs==4.2.2
225
+ wheel==0.45.1
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-metadata.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-07-06T21:21:04.133606Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage2_07070513_2datasets_construct",
12
+ "--num_query_token",
13
+ "8",
14
+ "--save_every_n_epochs",
15
+ "1",
16
+ "--max_epochs",
17
+ "10",
18
+ "--batch_size",
19
+ "8",
20
+ "--precision",
21
+ "bf16-mixed",
22
+ "--num_workers",
23
+ "8",
24
+ "--plm_model",
25
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
26
+ "--bert_name",
27
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
28
+ "--llm_name",
29
+ "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
30
+ "--llm_tune",
31
+ "mid_lora",
32
+ "--mix_dataset",
33
+ "--stage1_path",
34
+ "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt",
35
+ "--use_wandb_logger"
36
+ ],
37
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
38
+ "codePath": "stage2.py",
39
+ "email": "gia0603yucca@gmail.com",
40
+ "root": "./all_checkpoints/stage2_07070513_2datasets_construct/",
41
+ "host": "dsw-251511-69b5b47496-4bcxh",
42
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
43
+ "codePathLocal": "stage2.py",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 64,
46
+ "gpu": "NVIDIA A800-SXM4-80GB",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "1623302262784",
51
+ "used": "1260380160"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "549755813888"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 64
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A800-SXM4-80GB",
64
+ "memoryTotal": "85198045184",
65
+ "architecture": "Ampere"
66
+ },
67
+ {
68
+ "name": "NVIDIA A800-SXM4-80GB",
69
+ "memoryTotal": "85198045184",
70
+ "architecture": "Ampere"
71
+ },
72
+ {
73
+ "name": "NVIDIA A800-SXM4-80GB",
74
+ "memoryTotal": "85198045184",
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA A800-SXM4-80GB",
79
+ "memoryTotal": "85198045184",
80
+ "architecture": "Ampere"
81
+ },
82
+ {
83
+ "name": "NVIDIA A800-SXM4-80GB",
84
+ "memoryTotal": "85198045184",
85
+ "architecture": "Ampere"
86
+ },
87
+ {
88
+ "name": "NVIDIA A800-SXM4-80GB",
89
+ "memoryTotal": "85198045184",
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A800-SXM4-80GB",
94
+ "memoryTotal": "85198045184",
95
+ "architecture": "Ampere"
96
+ },
97
+ {
98
+ "name": "NVIDIA A800-SXM4-80GB",
99
+ "memoryTotal": "85198045184",
100
+ "architecture": "Ampere"
101
+ }
102
+ ],
103
+ "cudaVersion": "12.1"
104
+ }
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"epoch":0,"trainer/global_step":99,"_timestamp":1.7518373296110501e+09,"_runtime":465.477642119,"_step":1,"loss":1.32979416847229,"_wandb":{"runtime":533},"lr":1.0800999916682485e-05}
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-07T05:21:04.137926867+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-core.log"}
2
+ {"time":"2025-07-07T05:21:05.224571234+08:00","level":"INFO","msg":"created new stream","id":"615z4bme"}
3
+ {"time":"2025-07-07T05:21:05.224615496+08:00","level":"INFO","msg":"stream: started","id":"615z4bme"}
4
+ {"time":"2025-07-07T05:21:05.224662595+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"615z4bme"}
5
+ {"time":"2025-07-07T05:21:05.224708291+08:00","level":"INFO","msg":"handler: started","stream_id":"615z4bme"}
6
+ {"time":"2025-07-07T05:21:05.22467591+08:00","level":"INFO","msg":"sender: started","stream_id":"615z4bme"}
7
+ {"time":"2025-07-07T05:21:06.409908065+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-07-07T05:29:57.639658414+08:00","level":"INFO","msg":"stream: closing","id":"615z4bme"}
9
+ {"time":"2025-07-07T05:29:57.639718652+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-07-07T05:29:57.64116529+08:00","level":"INFO","msg":"Stopped system monitor"}
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Configure stats pid to 2481
3
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug.log
7
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/logs/debug-internal.log
8
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():893] starting backend
12
+ 2025-07-07 05:21:04,126 INFO MainThread:2481 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-07 05:21:04,129 INFO MainThread:2481 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-07 05:21:04,132 INFO MainThread:2481 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-07 05:21:04,135 INFO MainThread:2481 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-07 05:21:04,138 INFO MainThread:2481 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-07 05:21:06,399 INFO MainThread:2481 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-07 05:21:06,613 INFO MainThread:2481 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-07 05:21:06,614 INFO MainThread:2481 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-07 05:21:06,619 INFO MainThread:2481 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-07 05:21:06,619 INFO MainThread:2481 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-07 05:21:06,620 INFO MainThread:2481 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-07 05:25:59,072 INFO MainThread:2481 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070513_2datasets_construct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 1, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 8, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
24
+ 2025-07-07 05:29:57,621 INFO MsgRouterThr:2481 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_052104-615z4bme/run-615z4bme.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5d1bbe7396b4e8ab1a6d6cf3abef6965bcd254b0974ca9172975647a4cc3e5a
3
+ size 196608
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/config.yaml ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ - "1": loss
10
+ "5": 1
11
+ "6":
12
+ - 1
13
+ - 3
14
+ "7": []
15
+ - "1": lr
16
+ "5": 1
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": dataset0/rouge_1
22
+ "5": 1
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": dataset0/rouge_l
28
+ "5": 1
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": dataset0/meteor_score
34
+ "5": 1
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": dataset0/bleu2
40
+ "5": 1
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": dataset0/bleu4
46
+ "5": 1
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": epoch
52
+ "5": 1
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": dataloader2/val loss/dataloader_idx_2
58
+ "5": 1
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": dataloader0/val loss/dataloader_idx_0
64
+ "5": 1
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": dataset0/acc
70
+ "5": 1
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": dataset0/rouge_2
76
+ "5": 1
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ python_version: 3.10.0
82
+ t:
83
+ "1":
84
+ - 1
85
+ - 5
86
+ - 9
87
+ - 11
88
+ - 33
89
+ - 41
90
+ - 49
91
+ - 53
92
+ - 55
93
+ - 63
94
+ - 103
95
+ "2":
96
+ - 1
97
+ - 5
98
+ - 9
99
+ - 11
100
+ - 33
101
+ - 41
102
+ - 49
103
+ - 53
104
+ - 55
105
+ - 63
106
+ - 103
107
+ "3":
108
+ - 7
109
+ - 23
110
+ - 55
111
+ - 66
112
+ "4": 3.10.0
113
+ "5": 0.19.11
114
+ "6": 4.52.3
115
+ "8":
116
+ - 5
117
+ "12": 0.19.11
118
+ "13": linux-x86_64
119
+ a_max_len:
120
+ value: 36
121
+ accelerator:
122
+ value: gpu
123
+ accumulate_grad_batches:
124
+ value: 1
125
+ batch_size:
126
+ value: 4
127
+ bert_name:
128
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
129
+ caption_eval_epoch:
130
+ value: 10
131
+ check_val_every_n_epoch:
132
+ value: 1
133
+ cross_attention_freq:
134
+ value: 2
135
+ devices:
136
+ value: 0,1,2,3,4,5,6,7
137
+ do_sample:
138
+ value: false
139
+ enable_flash:
140
+ value: false
141
+ enbale_gradient_checkpointing:
142
+ value: false
143
+ filename:
144
+ value: stage2_07070513_2datasets_construct
145
+ filter_side_qa:
146
+ value: false
147
+ inference_batch_size:
148
+ value: 4
149
+ init_checkpoint:
150
+ value: ""
151
+ init_lr:
152
+ value: 0.0001
153
+ llm_name:
154
+ value: /oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged
155
+ llm_tune:
156
+ value: mid_lora
157
+ lora_alpha:
158
+ value: 16
159
+ lora_dropout:
160
+ value: 0.1
161
+ lora_r:
162
+ value: 8
163
+ lr_decay_rate:
164
+ value: 0.9
165
+ max_epochs:
166
+ value: 10
167
+ max_inference_len:
168
+ value: 128
169
+ min_inference_len:
170
+ value: 1
171
+ min_lr:
172
+ value: 1e-05
173
+ mix_dataset:
174
+ value: true
175
+ mode:
176
+ value: train
177
+ num_beams:
178
+ value: 5
179
+ num_query_token:
180
+ value: 8
181
+ num_workers:
182
+ value: 8
183
+ peft_config:
184
+ value: ""
185
+ peft_dir:
186
+ value: ""
187
+ plm_model:
188
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
189
+ plm_tune:
190
+ value: freeze
191
+ precision:
192
+ value: bf16-mixed
193
+ prompt:
194
+ value: 'The protein has the following properties: '
195
+ prot_max_len:
196
+ value: 1024
197
+ q_max_len:
198
+ value: 29
199
+ root:
200
+ value: data
201
+ save_every_n_epochs:
202
+ value: 1
203
+ scheduler:
204
+ value: linear_warmup_cosine_lr
205
+ seed:
206
+ value: 42
207
+ stage1_path:
208
+ value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt
209
+ stage2_path:
210
+ value: ""
211
+ strategy:
212
+ value: deepspeed
213
+ text_max_len:
214
+ value: 1024
215
+ use_wandb_logger:
216
+ value: true
217
+ warmup_lr:
218
+ value: 1e-06
219
+ warmup_steps:
220
+ value: 1000
221
+ weight_decay:
222
+ value: 0.05
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/output.log ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07070513_2datasets_construct exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ -------------------------------------------
7
+ 0 | blip2 | Blip2OPT | 7.9 B | train
8
+ -------------------------------------------
9
+ 104 M Trainable params
10
+ 7.8 B Non-trainable params
11
+ 7.9 B Total params
12
+ 31,459.025Total estimated model params size (MB)
13
+ 174 Modules in train mode
14
+ 1203 Modules in eval mode
15
+ Epoch 9: 100%|████████████████████████████████████████████████████████████████| 26653/26653 [6:28:32<00:00, 1.14it/s, v_num=n0v3]BLEU-2 score: 26.653377377338177
16
+ BLEU-4 score: 20.796051979558282████████████████████████████████████████████████████████████████| 313/313 [23:19<00:00, 0.22it/s]
17
+ /nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
18
+ sd = self.module.state_dict(destination, prefix, keep_vars)
19
+ 20000it [01:34, 212.73it/s]
20
+ 20000it [00:35, 568.59it/s]
21
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
22
+ Average Meteor score: 27.78854434762939
23
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
24
+ ROUGE score:
25
+ rouge1: 32.64128335687479
26
+ rouge2: 20.424108167541117
27
+ rougeL: 27.292150023395635
28
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu4', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
29
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_1', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
30
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
31
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_l', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
32
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/meteor_score', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
33
+ Epoch 9: 100%|████████████████████████████████████████████████████████████████| 26653/26653 [7:22:40<00:00, 1.00it/s, v_num=n0v3]
34
+
35
+ `Trainer.fit` stopped: `max_epochs=10` reached.
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gitdb==4.0.12
2
+ smmap==5.0.2
3
+ wcwidth==0.2.13
4
+ streamlit==1.45.1
5
+ antlr4-python3-runtime==4.9.3
6
+ MarkupSafe==3.0.2
7
+ markdown-it-py==3.0.0
8
+ PyYAML==6.0.2
9
+ nvidia-cusolver-cu12==11.6.1.9
10
+ text-unidecode==1.3
11
+ msgpack==1.1.0
12
+ pillow==11.2.1
13
+ wrapt==1.17.2
14
+ tifffile==2025.5.10
15
+ nvidia-curand-cu12==10.3.5.147
16
+ networkx==3.4.2
17
+ fonttools==4.58.0
18
+ plotly==6.1.1
19
+ matplotlib==3.10.3
20
+ certifi==2025.4.26
21
+ altair==5.5.0
22
+ nvidia-cufft-cu12==11.2.1.3
23
+ ninja==1.11.1.4
24
+ tzdata==2025.2
25
+ nvidia-cublas-cu12==12.4.5.8
26
+ weasel==0.4.1
27
+ nvidia-cuda-nvrtc-cu12==12.4.127
28
+ jedi==0.19.2
29
+ GitPython==3.1.44
30
+ pandas==2.2.3
31
+ python-slugify==8.0.4
32
+ omegaconf==2.3.0
33
+ kiwisolver==1.4.8
34
+ tenacity==9.1.2
35
+ pydantic==2.11.5
36
+ async-timeout==5.0.1
37
+ tqdm==4.67.1
38
+ confection==0.1.5
39
+ six==1.17.0
40
+ portalocker==3.1.1
41
+ regex==2024.11.6
42
+ nvidia-cuda-runtime-cu12==12.4.127
43
+ packaging==24.2
44
+ annotated-types==0.7.0
45
+ salesforce-lavis==1.0.2
46
+ nvidia-nvjitlink-cu12==12.4.127
47
+ lightning-utilities==0.14.3
48
+ pytz==2025.2
49
+ smart-open==7.1.0
50
+ cachetools==5.5.2
51
+ nltk==3.9.1
52
+ torchmetrics==1.7.1
53
+ pexpect==4.9.0
54
+ jsonschema-specifications==2025.4.1
55
+ Jinja2==3.1.6
56
+ hjson==3.1.0
57
+ nvidia-cusparse-cu12==12.3.1.170
58
+ nvidia-cudnn-cu12==9.1.0.70
59
+ decord==0.6.0
60
+ joblib==1.5.1
61
+ kaggle==1.7.4.5
62
+ psutil==7.0.0
63
+ absl-py==2.2.2
64
+ Pygments==2.19.1
65
+ idna==3.10
66
+ aiohappyeyeballs==2.6.1
67
+ tornado==6.5.1
68
+ cycler==0.12.1
69
+ deepspeed==0.16.10+b666844f
70
+ torchvision==0.21.0
71
+ exceptiongroup==1.3.0
72
+ cfgv==3.4.0
73
+ py-cpuinfo==9.0.0
74
+ webdataset==0.2.111
75
+ murmurhash==1.0.13
76
+ asttokens==3.0.0
77
+ spacy==3.8.7
78
+ blinker==1.9.0
79
+ python-dateutil==2.9.0.post0
80
+ prompt_toolkit==3.0.51
81
+ referencing==0.36.2
82
+ contourpy==1.3.2
83
+ mpmath==1.3.0
84
+ thinc==8.3.6
85
+ pycocotools==2.0.8
86
+ python-magic==0.4.27
87
+ fairscale==0.4.4
88
+ nodeenv==1.9.1
89
+ identify==2.6.12
90
+ ftfy==6.3.1
91
+ spacy-legacy==3.0.12
92
+ cymem==2.0.11
93
+ typing-inspection==0.4.1
94
+ nvidia-cufile-cu12==1.11.1.6
95
+ filelock==3.18.0
96
+ language_data==1.3.0
97
+ iopath==0.1.10
98
+ pre_commit==4.2.0
99
+ toml==0.10.2
100
+ lazy_loader==0.4
101
+ nvidia-cusparselt-cu12==0.6.2
102
+ nvidia-nvtx-cu12==12.4.127
103
+ rouge_score==0.1.2
104
+ pycocoevalcap==1.2
105
+ pyparsing==3.2.3
106
+ mdurl==0.1.2
107
+ pure_eval==0.2.3
108
+ ipython==8.36.0
109
+ langcodes==3.5.0
110
+ distlib==0.3.9
111
+ pydeck==0.9.1
112
+ traitlets==5.14.3
113
+ decorator==5.2.1
114
+ requests==2.32.3
115
+ pydantic_core==2.33.2
116
+ matplotlib-inline==0.1.7
117
+ hf-xet==1.1.2
118
+ opendatasets==0.1.22
119
+ attrs==25.3.0
120
+ urllib3==2.4.0
121
+ typing_extensions==4.13.2
122
+ bleach==6.2.0
123
+ rich==14.0.0
124
+ imageio==2.37.0
125
+ yarl==1.20.0
126
+ platformdirs==4.3.8
127
+ multidict==6.4.4
128
+ catalogue==2.0.10
129
+ wasabi==1.1.3
130
+ scikit-image==0.25.2
131
+ blis==1.3.0
132
+ pyarrow==20.0.0
133
+ parso==0.8.4
134
+ rpds-py==0.25.1
135
+ opencv-python-headless==4.5.5.64
136
+ braceexpand==0.1.7
137
+ frozenlist==1.6.0
138
+ numpy==2.2.6
139
+ cloudpathlib==0.21.1
140
+ srsly==2.5.1
141
+ webencodings==0.5.1
142
+ nvidia-cuda-cupti-cu12==12.4.127
143
+ einops==0.8.1
144
+ setuptools==78.1.1
145
+ nvidia-nccl-cu12==2.21.5
146
+ ptyprocess==0.7.0
147
+ torch==2.6.0
148
+ scipy==1.15.3
149
+ nvidia-ml-py==12.575.51
150
+ aiosignal==1.3.2
151
+ virtualenv==20.31.2
152
+ protobuf==6.31.0
153
+ contexttimer==0.3.3
154
+ marisa-trie==1.2.1
155
+ shellingham==1.5.4
156
+ charset-normalizer==3.4.2
157
+ propcache==0.3.1
158
+ executing==2.2.0
159
+ pytorch-lightning==2.5.1.post0
160
+ stack-data==0.6.3
161
+ sentencepiece==0.2.0
162
+ sympy==1.13.1
163
+ wheel==0.45.1
164
+ safetensors==0.5.3
165
+ triton==3.2.0
166
+ watchdog==6.0.0
167
+ spacy-loggers==1.0.5
168
+ timm==0.4.12
169
+ docker-pycreds==0.4.0
170
+ setproctitle==1.3.6
171
+ jmespath==0.10.0
172
+ pycryptodome==3.23.0
173
+ opendelta==0.3.2
174
+ aliyun-python-sdk-core==2.16.0
175
+ dill==0.3.8
176
+ xxhash==3.5.0
177
+ crcmod==1.7
178
+ aiohttp==3.12.2
179
+ sentry-sdk==2.29.1
180
+ huggingface-hub==0.32.1
181
+ jaraco.functools==4.1.0
182
+ pathlib==1.0.1
183
+ multiprocess==0.70.16
184
+ flash-attn==2.7.1.post1
185
+ jsonschema==4.24.0
186
+ datasets==3.6.0
187
+ cffi==1.17.1
188
+ pycparser==2.22
189
+ fsspec==2025.3.0
190
+ more-itertools==10.7.0
191
+ cryptography==45.0.3
192
+ tokenizers==0.21.1
193
+ cheroot==10.0.1
194
+ pip==25.1.1
195
+ preshed==3.0.10
196
+ transformers==4.52.3
197
+ oss2==2.15.0
198
+ yacs==0.1.8
199
+ wandb==0.19.11
200
+ bigmodelvis==0.0.1
201
+ web.py==0.62
202
+ opencv-python==4.11.0.86
203
+ threadpoolctl==3.6.0
204
+ typer==0.16.0
205
+ narwhals==1.41.0
206
+ delta-center-client==0.0.4
207
+ aliyun-python-sdk-kms==2.16.5
208
+ click==8.2.1
209
+ scikit-learn==1.6.1
210
+ jaraco.text==3.12.1
211
+ autocommand==2.2.2
212
+ packaging==24.2
213
+ jaraco.context==5.3.0
214
+ tomli==2.0.1
215
+ typeguard==4.3.0
216
+ zipp==3.19.2
217
+ backports.tarfile==1.2.0
218
+ typing_extensions==4.12.2
219
+ jaraco.collections==5.1.0
220
+ inflect==7.3.1
221
+ more-itertools==10.3.0
222
+ jaraco.functools==4.0.1
223
+ importlib_metadata==8.0.0
224
+ platformdirs==4.2.2
225
+ wheel==0.45.1
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-metadata.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-07-06T21:32:22.534976Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage2_07070513_2datasets_construct",
12
+ "--num_query_token",
13
+ "8",
14
+ "--save_every_n_epochs",
15
+ "1",
16
+ "--max_epochs",
17
+ "10",
18
+ "--batch_size",
19
+ "4",
20
+ "--precision",
21
+ "bf16-mixed",
22
+ "--num_workers",
23
+ "8",
24
+ "--plm_model",
25
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
26
+ "--bert_name",
27
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
28
+ "--llm_name",
29
+ "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
30
+ "--llm_tune",
31
+ "mid_lora",
32
+ "--mix_dataset",
33
+ "--stage1_path",
34
+ "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt",
35
+ "--use_wandb_logger"
36
+ ],
37
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
38
+ "codePath": "stage2.py",
39
+ "email": "gia0603yucca@gmail.com",
40
+ "root": "./all_checkpoints/stage2_07070513_2datasets_construct/",
41
+ "host": "dsw-251511-69b5b47496-4bcxh",
42
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
43
+ "codePathLocal": "stage2.py",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 64,
46
+ "gpu": "NVIDIA A800-SXM4-80GB",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "1623302262784",
51
+ "used": "1260400640"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "549755813888"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 64
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A800-SXM4-80GB",
64
+ "memoryTotal": "85198045184",
65
+ "architecture": "Ampere"
66
+ },
67
+ {
68
+ "name": "NVIDIA A800-SXM4-80GB",
69
+ "memoryTotal": "85198045184",
70
+ "architecture": "Ampere"
71
+ },
72
+ {
73
+ "name": "NVIDIA A800-SXM4-80GB",
74
+ "memoryTotal": "85198045184",
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA A800-SXM4-80GB",
79
+ "memoryTotal": "85198045184",
80
+ "architecture": "Ampere"
81
+ },
82
+ {
83
+ "name": "NVIDIA A800-SXM4-80GB",
84
+ "memoryTotal": "85198045184",
85
+ "architecture": "Ampere"
86
+ },
87
+ {
88
+ "name": "NVIDIA A800-SXM4-80GB",
89
+ "memoryTotal": "85198045184",
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A800-SXM4-80GB",
94
+ "memoryTotal": "85198045184",
95
+ "architecture": "Ampere"
96
+ },
97
+ {
98
+ "name": "NVIDIA A800-SXM4-80GB",
99
+ "memoryTotal": "85198045184",
100
+ "architecture": "Ampere"
101
+ }
102
+ ],
103
+ "cudaVersion": "12.1"
104
+ }
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":238677.754852379,"trainer/global_step":266529,"dataloader0/val loss/dataloader_idx_0":0.3741031885147095,"dataset0/acc":0,"dataset0/meteor_score":27.788543701171875,"_wandb":{"runtime":238690},"dataset0/bleu4":20.796052932739258,"dataset0/rouge_2":20.424108505249023,"lr":1.2202456673549023e-05,"_timestamp":1.7520762202895813e+09,"dataloader2/val loss/dataloader_idx_2":0.2067195624113083,"dataset0/rouge_1":32.64128494262695,"loss":0.114682637155056,"epoch":9,"dataset0/bleu2":26.653377532958984,"dataset0/rouge_l":27.292150497436523,"_step":5339}
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-07T05:32:22.544190733+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-core.log"}
2
+ {"time":"2025-07-07T05:32:23.597067843+08:00","level":"INFO","msg":"created new stream","id":"9cjzn0v3"}
3
+ {"time":"2025-07-07T05:32:23.59711309+08:00","level":"INFO","msg":"stream: started","id":"9cjzn0v3"}
4
+ {"time":"2025-07-07T05:32:23.59715533+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9cjzn0v3"}
5
+ {"time":"2025-07-07T05:32:23.597176058+08:00","level":"INFO","msg":"handler: started","stream_id":"9cjzn0v3"}
6
+ {"time":"2025-07-07T05:32:23.597249736+08:00","level":"INFO","msg":"sender: started","stream_id":"9cjzn0v3"}
7
+ {"time":"2025-07-07T05:32:24.815832776+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-07-07T16:23:26.191588391+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:46082->172.67.193.61:443: read: connection timed out"}
9
+ {"time":"2025-07-08T07:36:01.662714436+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
10
+ {"time":"2025-07-08T07:39:35.510926561+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
11
+ {"time":"2025-07-09T00:01:13.718163538+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:52128->172.67.193.61:443: read: connection reset by peer"}
12
+ {"time":"2025-07-09T00:04:13.715227056+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:56708->104.21.20.172:443: read: connection reset by peer"}
13
+ {"time":"2025-07-09T00:41:59.079495986+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:39968->172.67.193.61:443: read: connection reset by peer"}
14
+ {"time":"2025-07-09T00:50:28.436723591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
15
+ {"time":"2025-07-09T01:04:28.736382048+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:48068->104.21.20.172:443: read: connection reset by peer"}
16
+ {"time":"2025-07-09T01:36:13.71400828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": read tcp 10.1.2.136:33646->172.67.193.61:443: read: connection reset by peer"}
17
+ {"time":"2025-07-09T06:33:13.899246984+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
18
+ {"time":"2025-07-09T14:42:33.327607005+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:36168->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-07-09T22:35:09.035751509+08:00","level":"INFO","msg":"api: retrying HTTP error","status":504,"url":"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream","body":"error code: 504"}
20
+ {"time":"2025-07-09T23:18:37.03957561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56104->172.67.193.61:443: read: connection timed out"}
21
+ {"time":"2025-07-09T23:35:24.650683333+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07070513_2datasets_construct/9cjzn0v3/file_stream\": read tcp 10.1.2.136:56892->172.67.193.61:443: read: connection reset by peer"}
22
+ {"time":"2025-07-09T23:50:32.561736786+08:00","level":"INFO","msg":"stream: closing","id":"9cjzn0v3"}
23
+ {"time":"2025-07-09T23:50:32.56179589+08:00","level":"INFO","msg":"Stopping system monitor"}
24
+ {"time":"2025-07-09T23:50:32.564495033+08:00","level":"INFO","msg":"Stopped system monitor"}
25
+ {"time":"2025-07-09T23:50:38.466118847+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
26
+ {"time":"2025-07-09T23:50:40.207050581+08:00","level":"INFO","msg":"handler: closed","stream_id":"9cjzn0v3"}
27
+ {"time":"2025-07-09T23:50:40.207095276+08:00","level":"INFO","msg":"sender: closed","stream_id":"9cjzn0v3"}
28
+ {"time":"2025-07-09T23:50:40.207092571+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"9cjzn0v3"}
29
+ {"time":"2025-07-09T23:50:40.211547321+08:00","level":"INFO","msg":"stream: closed","id":"9cjzn0v3"}
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Configure stats pid to 9598
3
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-07 05:32:22,527 INFO MainThread:9598 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug.log
7
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/logs/debug-internal.log
8
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():893] starting backend
12
+ 2025-07-07 05:32:22,528 INFO MainThread:9598 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-07 05:32:22,529 INFO MainThread:9598 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-07 05:32:22,531 INFO MainThread:9598 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-07 05:32:22,535 INFO MainThread:9598 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-07 05:32:22,540 INFO MainThread:9598 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-07 05:32:24,807 INFO MainThread:9598 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-07 05:32:24,956 INFO MainThread:9598 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-07 05:32:24,959 INFO MainThread:9598 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-07 05:32:24,961 INFO MainThread:9598 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-07 05:32:33,644 INFO MainThread:9598 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07070513_2datasets_construct', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 10, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 1, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset/epoch=29.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 4, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 1024, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
24
+ 2025-07-09 23:50:32,550 INFO MsgRouterThr:9598 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
all_checkpoints/stage2_07070513_2datasets_construct/wandb/run-20250707_053222-9cjzn0v3/run-9cjzn0v3.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6cffb771629ea66d100de0be9a1ef1c3f9599c478045c824d755e2ea04fe379
3
+ size 199737973
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:246a0d8dbc7414f986a333879b13c36671f129b3b117d7b4066f3928cb35bc99
3
+ size 156403632
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2c9fc89299dca46abb0b74a86e998a1f7b2026cca0ce2dcf590da1a68df2186
3
+ size 156402992
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b9d65b0b283d3d437c8eb4955242a571b50d7503bb4d371742052fe466db312
3
+ size 156403376
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:735a857a56751f34bbcf2516fd6b038ff0617777b78e86a12ba3d5d4181e8119
3
+ size 156403120
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e14097721553b19f34683b33ddb67fa3d670e035d7260159b9e97aac7d7e851c
3
+ size 156402416
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe993f0a43264f371f9feef67c3aaf51480074bf4512a075fbcf26c11b405c6d
3
+ size 156403696
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:094d7605822331acc11f1cf58534bc7d48c4a66bd686ba9749042f97fe8def0e
3
+ size 156402992
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79ebf1ed60ade77ba2453feda74fbc5d71c1d55b9e7d67f1398ea6e6e7d45041
3
+ size 156417904
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5f1a07851a422c60545e9fa30da0dc892ea9ef21aa34e05d2e11791664d89b6
3
+ size 208795320
all_checkpoints/stage2_07301646_2datasets_construct/epoch=03.ckpt/converted.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f74c2b94f722a6ce4ebc2444d8247a103295f9673fbfab2529712ca70294557
3
+ size 417200548