yuccaaa commited on
Commit
c72f6bb
·
verified ·
1 Parent(s): 992a397

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug-internal.log +63 -0
  2. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug.log +23 -0
  3. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/files/output.log +0 -0
  4. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-internal.log +82 -0
  5. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug.log +94 -0
  6. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/run-ln8ma2mo.wandb +0 -0
  7. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/files/output.log +0 -0
  8. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-internal.log +82 -0
  9. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug.log +94 -0
  10. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/run-mz3ej8ig.wandb +0 -0
  11. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/config.yaml +129 -0
  12. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/output.log +21 -0
  13. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/requirements.txt +225 -0
  14. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-metadata.json +99 -0
  15. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-summary.json +1 -0
  16. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-internal.log +18 -0
  17. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug.log +23 -0
  18. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/run-d21a8n96.wandb +0 -0
  19. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/files/output.log +0 -0
  20. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-internal.log +16 -0
  21. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug.log +16 -0
  22. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/run-y2lylvs5.wandb +0 -0
  23. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/config.yaml +237 -0
  24. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/output.log +0 -0
  25. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/requirements.txt +225 -0
  26. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-metadata.json +99 -0
  27. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-summary.json +1 -0
  28. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log +42 -0
  29. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log +24 -0
  30. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml +236 -0
  31. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log +21 -0
  32. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt +225 -0
  33. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json +97 -0
  34. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json +1 -0
  35. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log +42 -0
  36. ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log +24 -0
  37. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug-internal.log +87 -0
  38. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug.log +24 -0
  39. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/config.yaml +236 -0
  40. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/output.log +21 -0
  41. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/requirements.txt +225 -0
  42. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-metadata.json +98 -0
  43. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-summary.json +1 -0
  44. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log +87 -0
  45. ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log +24 -0
  46. ProtT3/all_checkpoints/stage1_ckpt/wandb/debug-internal.log +19 -0
  47. ProtT3/all_checkpoints/stage1_ckpt/wandb/debug.log +23 -0
  48. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/config.yaml +129 -0
  49. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/output.log +2 -0
  50. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/requirements.txt +225 -0
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug-internal.log ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:07:02.130913564+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:07:16.339720801+08:00","level":"INFO","msg":"created new stream","id":"rypk39yq"}
3
+ {"time":"2025-06-29T00:07:16.340562919+08:00","level":"INFO","msg":"stream: started","id":"rypk39yq"}
4
+ {"time":"2025-06-29T00:07:16.340584288+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"rypk39yq"}
5
+ {"time":"2025-06-29T00:07:16.340617888+08:00","level":"INFO","msg":"sender: started","stream_id":"rypk39yq"}
6
+ {"time":"2025-06-29T00:07:16.340654242+08:00","level":"INFO","msg":"handler: started","stream_id":"rypk39yq"}
7
+ {"time":"2025-06-29T00:07:28.033909694+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-29T00:12:24.114755958+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52688->104.21.20.172:443: read: connection timed out"}
9
+ {"time":"2025-06-29T00:15:17.682707235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:43992->104.21.20.172:443: read: connection timed out"}
10
+ {"time":"2025-06-29T00:16:13.20335199+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-06-29T00:16:45.525802023+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-06-29T00:17:19.98711773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
13
+ {"time":"2025-06-29T00:18:06.642780387+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:36080->172.67.193.61:443: read: connection timed out"}
14
+ {"time":"2025-06-29T00:22:43.123257688+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52664->172.67.193.61:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:26:08.434737599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42534->172.67.193.61:443: read: connection timed out"}
16
+ {"time":"2025-06-29T00:27:44.454100719+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53006->104.21.20.172:443: read: connection reset by peer"}
17
+ {"time":"2025-06-29T00:29:13.211268181+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
18
+ {"time":"2025-06-29T00:29:45.68436365+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
19
+ {"time":"2025-06-29T00:30:19.759580601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
20
+ {"time":"2025-06-29T00:30:33.650730605+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:38754->172.67.193.61:443: read: connection timed out"}
21
+ {"time":"2025-06-29T00:30:58.011093426+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
22
+ {"time":"2025-06-29T00:34:39.922752645+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:35350->172.67.193.61:443: read: connection timed out"}
23
+ {"time":"2025-06-29T00:36:41.88529828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
24
+ {"time":"2025-06-29T00:37:20.878368218+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:46470->104.21.20.172:443: read: connection reset by peer"}
25
+ {"time":"2025-06-29T00:38:49.414424011+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
26
+ {"time":"2025-06-29T00:38:58.216757113+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
27
+ {"time":"2025-06-29T00:39:20.141003198+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53708->104.21.20.172:443: read: connection reset by peer"}
28
+ {"time":"2025-06-29T00:41:33.299264534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44198->104.21.20.172:443: read: connection reset by peer"}
29
+ {"time":"2025-06-29T00:47:37.138754922+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39138->172.67.193.61:443: read: connection timed out"}
30
+ {"time":"2025-06-29T00:54:28.224811124+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T00:55:15.429710397+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:49584->104.21.20.172:443: read: connection reset by peer"}
32
+ {"time":"2025-06-29T00:55:36.251525534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:55184->104.21.20.172:443: read: connection reset by peer"}
33
+ {"time":"2025-06-29T00:56:12.092902722+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": http2: client conn is closed"}
34
+ {"time":"2025-06-29T00:59:32.604209299+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44582->172.67.193.61:443: read: connection reset by peer"}
35
+ {"time":"2025-06-29T01:00:43.231046844+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
36
+ {"time":"2025-06-29T01:05:28.234577388+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
37
+ {"time":"2025-06-29T01:06:00.428439859+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
38
+ {"time":"2025-06-29T01:06:35.403033399+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
39
+ {"time":"2025-06-29T01:07:13.835463934+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
40
+ {"time":"2025-06-29T01:12:30.014897464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
41
+ {"time":"2025-06-29T01:14:58.239397356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
42
+ {"time":"2025-06-29T01:15:30.658073848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
43
+ {"time":"2025-06-29T01:16:05.133874663+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
44
+ {"time":"2025-06-29T01:16:43.256922452+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
45
+ {"time":"2025-06-29T01:17:07.122753765+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42208->172.67.193.61:443: read: connection timed out"}
46
+ {"time":"2025-06-29T01:17:31.631854783+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
47
+ {"time":"2025-06-29T01:18:38.479583401+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
48
+ {"time":"2025-06-29T01:20:08.481626584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
49
+ {"time":"2025-06-29T01:21:38.483904393+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
50
+ {"time":"2025-06-29T01:22:09.185192206+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
51
+ {"time":"2025-06-29T01:28:06.578759778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39408->172.67.193.61:443: read: connection timed out"}
52
+ {"time":"2025-06-29T02:00:40.766530394+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.bandw.top/graphql","body":"error code: 502"}
53
+ {"time":"2025-06-29T08:45:43.611887283+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
54
+ {"time":"2025-06-29T08:45:55.061157169+08:00","level":"INFO","msg":"api: retrying HTTP error","status":520,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 520"}
55
+ {"time":"2025-06-29T08:51:23.638432293+08:00","level":"INFO","msg":"api: retrying HTTP error","status":524,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 524"}
56
+ {"time":"2025-06-29T10:16:08.309722526+08:00","level":"INFO","msg":"stream: closing","id":"rypk39yq"}
57
+ {"time":"2025-06-29T10:16:08.309813211+08:00","level":"INFO","msg":"Stopping system monitor"}
58
+ {"time":"2025-06-29T10:16:08.311047133+08:00","level":"INFO","msg":"Stopped system monitor"}
59
+ {"time":"2025-06-29T10:16:10.887637294+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
60
+ {"time":"2025-06-29T10:16:11.831362524+08:00","level":"INFO","msg":"handler: closed","stream_id":"rypk39yq"}
61
+ {"time":"2025-06-29T10:16:11.831401295+08:00","level":"INFO","msg":"sender: closed","stream_id":"rypk39yq"}
62
+ {"time":"2025-06-29T10:16:11.831391+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rypk39yq"}
63
+ {"time":"2025-06-29T10:16:11.835883161+08:00","level":"INFO","msg":"stream: closed","id":"rypk39yq"}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Configure stats pid to 938398
3
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log
7
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log
8
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:07:02,122 INFO MainThread:938398 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:07:02,125 INFO MainThread:938398 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:07:02,126 INFO MainThread:938398 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:07:02,129 INFO MainThread:938398 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-29 00:07:27,982 INFO MainThread:938398 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-29 00:07:28,171 INFO MainThread:938398 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-29 00:07:28,172 INFO MainThread:938398 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-29 00:07:28,177 INFO MainThread:938398 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 10:16:08,240 INFO MsgRouterThr:938398 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/files/output.log ADDED
File without changes
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-internal.log ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T23:51:50.395952363+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-core.log"}
2
+ {"time":"2025-06-28T23:52:20.501357263+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
3
+ {"time":"2025-06-28T23:52:52.730097178+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
4
+ {"time":"2025-06-28T23:53:27.526291573+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
5
+ {"time":"2025-06-28T23:54:05.599297713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
6
+ {"time":"2025-06-28T23:54:54.222254418+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
7
+ {"time":"2025-06-28T23:56:01.770405008+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
8
+ {"time":"2025-06-28T23:57:31.772851692+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
9
+ {"time":"2025-06-28T23:59:01.774763186+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-06-29T00:00:31.777124138+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-06-29T00:02:01.779769246+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-06-29T00:03:31.781963651+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
13
+ {"time":"2025-06-29T00:05:01.784626624+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
14
+ {"time":"2025-06-29T00:06:31.787236433+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
15
+ {"time":"2025-06-29T00:08:01.789516886+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
16
+ {"time":"2025-06-29T00:09:31.792010561+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
17
+ {"time":"2025-06-29T00:11:01.793420114+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
18
+ {"time":"2025-06-29T00:12:31.795510043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
19
+ {"time":"2025-06-29T00:14:01.797888169+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
20
+ {"time":"2025-06-29T00:15:31.800223897+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
21
+ {"time":"2025-06-29T00:17:01.802475149+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
22
+ {"time":"2025-06-29T00:18:31.804423686+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
23
+ {"time":"2025-06-29T00:18:31.805466058+08:00","level":"ERROR","msg":"Failed to load features, feature will default to disabled","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
24
+ {"time":"2025-06-29T00:18:31.817218548+08:00","level":"INFO","msg":"created new stream","id":"ln8ma2mo"}
25
+ {"time":"2025-06-29T00:18:31.817240068+08:00","level":"INFO","msg":"stream: started","id":"ln8ma2mo"}
26
+ {"time":"2025-06-29T00:18:31.817280389+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ln8ma2mo"}
27
+ {"time":"2025-06-29T00:18:31.817301971+08:00","level":"INFO","msg":"sender: started","stream_id":"ln8ma2mo"}
28
+ {"time":"2025-06-29T00:18:31.817292844+08:00","level":"INFO","msg":"handler: started","stream_id":"ln8ma2mo"}
29
+ {"time":"2025-06-29T00:18:31.817927543+08:00","level":"INFO","msg":"stream: closing","id":"ln8ma2mo"}
30
+ {"time":"2025-06-29T00:19:01.824830492+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T00:19:34.122683021+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
32
+ {"time":"2025-06-29T00:20:08.859081137+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
33
+ {"time":"2025-06-29T00:20:47.142810832+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
34
+ {"time":"2025-06-29T00:21:34.117938021+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
35
+ {"time":"2025-06-29T00:22:42.456861603+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
36
+ {"time":"2025-06-29T00:24:12.458977097+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
37
+ {"time":"2025-06-29T00:25:42.460628156+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
38
+ {"time":"2025-06-29T00:27:12.462982763+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
39
+ {"time":"2025-06-29T00:28:31.82404899+08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000158503,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"elq8to0u7ndt\" always_send:true connection_id:\"127.0.0.1:48626\")"}
40
+ {"time":"2025-06-29T00:28:42.464486984+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
41
+ {"time":"2025-06-29T00:30:12.465370037+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
42
+ {"time":"2025-06-29T00:31:42.467621727+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
43
+ {"time":"2025-06-29T00:33:12.470149312+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
44
+ {"time":"2025-06-29T00:34:42.471834466+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
45
+ {"time":"2025-06-29T00:36:12.474312455+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
46
+ {"time":"2025-06-29T00:37:42.477105562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
47
+ {"time":"2025-06-29T00:38:31.82690936+08:00","level":"WARN","msg":"sender: taking a long time","seconds":1200.003036845,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"elq8to0u7ndt\" always_send:true connection_id:\"127.0.0.1:48626\")"}
48
+ {"time":"2025-06-29T00:39:12.479327707+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
49
+ {"time":"2025-06-29T00:40:42.479854506+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
50
+ {"time":"2025-06-29T00:42:12.482469938+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
51
+ {"time":"2025-06-29T00:43:42.484924406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
52
+ {"time":"2025-06-29T00:45:12.487708397+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
53
+ {"time":"2025-06-29T00:45:12.48877363+08:00","level":"ERROR","msg":"send: sendRun: failed to update run state: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
54
+ {"time":"2025-06-29T00:45:12.489160854+08:00","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
55
+ {"time":"2025-06-29T00:45:12.489257088+08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1600.665365029,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"elq8to0u7ndt\" always_send:true connection_id:\"127.0.0.1:48626\")"}
56
+ {"time":"2025-06-29T00:45:42.54537918+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
57
+ {"time":"2025-06-29T00:46:14.577405421+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
58
+ {"time":"2025-06-29T00:46:49.560523299+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
59
+ {"time":"2025-06-29T00:47:29.044951023+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
60
+ {"time":"2025-06-29T00:48:18.616524778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
61
+ {"time":"2025-06-29T00:49:24.200671698+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
62
+ {"time":"2025-06-29T00:50:54.203239638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
63
+ {"time":"2025-06-29T00:52:24.20538484+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
64
+ {"time":"2025-06-29T00:53:54.207195938+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
65
+ {"time":"2025-06-29T00:55:24.209241621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
66
+ {"time":"2025-06-29T00:56:54.212135665+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
67
+ {"time":"2025-06-29T00:58:24.214809507+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
68
+ {"time":"2025-06-29T00:59:54.217111858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
69
+ {"time":"2025-06-29T01:01:24.219045106+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
70
+ {"time":"2025-06-29T01:02:54.221102245+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
71
+ {"time":"2025-06-29T01:04:24.22398357+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
72
+ {"time":"2025-06-29T01:05:54.226547409+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
73
+ {"time":"2025-06-29T01:07:24.228484779+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
74
+ {"time":"2025-06-29T01:08:54.231281118+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
75
+ {"time":"2025-06-29T01:10:24.233099781+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
76
+ {"time":"2025-06-29T01:11:54.236192516+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
77
+ {"time":"2025-06-29T01:11:54.236277338+08:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
78
+ {"time":"2025-06-29T01:11:54.236659368+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
79
+ {"time":"2025-06-29T01:11:54.239839801+08:00","level":"INFO","msg":"handler: closed","stream_id":"ln8ma2mo"}
80
+ {"time":"2025-06-29T01:11:54.239896009+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ln8ma2mo"}
81
+ {"time":"2025-06-29T01:11:54.239911685+08:00","level":"INFO","msg":"sender: closed","stream_id":"ln8ma2mo"}
82
+ {"time":"2025-06-29T01:11:54.242908101+08:00","level":"INFO","msg":"stream: closed","id":"ln8ma2mo"}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug.log ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Configure stats pid to 906089
3
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug.log
7
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/logs/debug-internal.log
8
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 23:51:50,386 INFO MainThread:906089 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 23:51:50,389 INFO MainThread:906089 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 23:51:50,390 INFO MainThread:906089 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 23:51:50,391 INFO MainThread:906089 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 23:51:50,393 INFO MainThread:906089 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 23:51:59,762 WARNING MainThread:906089 [wandb_init.py:init():1681] [no run ID] interrupted
18
+ Traceback (most recent call last):
19
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1677, in init
20
+ return wi.init(run_settings, run_config, run_printer)
21
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1055, in init
22
+ result = wait_with_progress(
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
24
+ return wait_all_with_progress(
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
26
+ return asyncio_compat.run(progress_loop_with_timeout)
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_compat.py", line 30, in run
28
+ return future.result()
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/concurrent/futures/_base.py", line 440, in result
30
+ self._condition.wait(timeout)
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/threading.py", line 320, in wait
32
+ waiter.acquire()
33
+ KeyboardInterrupt
34
+ 2025-06-28 23:52:00,349 INFO MsgRouterThr:906089 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
35
+ 2025-06-28 23:52:21,541 INFO Thread-3 (wrapped_target):906089 [retry.py:__call__():175] [no run ID] Retry attempt failed:
36
+ Traceback (most recent call last):
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 198, in _new_conn
38
+ sock = connection.create_connection(
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
40
+ raise err
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
42
+ sock.connect(sa)
43
+ TimeoutError: timed out
44
+
45
+ The above exception was the direct cause of the following exception:
46
+
47
+ Traceback (most recent call last):
48
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
49
+ response = self._make_request(
50
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request
51
+ raise new_e
52
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request
53
+ self._validate_conn(conn)
54
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
55
+ conn.connect()
56
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 704, in connect
57
+ self.sock = sock = self._new_conn()
58
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 207, in _new_conn
59
+ raise ConnectTimeoutError(
60
+ urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f9f447cebc0>, 'Connection to api.wandb.ai timed out. (connect timeout=20)')
61
+
62
+ The above exception was the direct cause of the following exception:
63
+
64
+ Traceback (most recent call last):
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
66
+ resp = conn.urlopen(
67
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
68
+ retries = retries.increment(
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
70
+ raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
71
+ urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f9f447cebc0>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
72
+
73
+ During handling of the above exception, another exception occurred:
74
+
75
+ Traceback (most recent call last):
76
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 134, in __call__
77
+ result = self._call_fn(*args, **kwargs)
78
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 398, in execute
79
+ return self.client.execute(*args, **kwargs) # type: ignore
80
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
81
+ result = self._get_result(document, *args, **kwargs)
82
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
83
+ return self.transport.execute(document, *args, **kwargs)
84
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/gql_request.py", line 58, in execute
85
+ request = self.session.post(self.url, **post_args)
86
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 637, in post
87
+ return self.request("POST", url, data=data, json=json, **kwargs)
88
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
89
+ resp = self.send(prep, **send_kwargs)
90
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
91
+ r = adapter.send(request, **kwargs)
92
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 688, in send
93
+ raise ConnectTimeout(e, request=request)
94
+ requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f9f447cebc0>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235150-ln8ma2mo/run-ln8ma2mo.wandb ADDED
Binary file (402 Bytes). View file
 
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/files/output.log ADDED
File without changes
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-internal.log ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T23:53:30.951932242+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-core.log"}
2
+ {"time":"2025-06-28T23:54:01.056320508+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
3
+ {"time":"2025-06-28T23:54:33.136125443+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
4
+ {"time":"2025-06-28T23:55:07.262654908+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
5
+ {"time":"2025-06-28T23:55:47.196366157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
6
+ {"time":"2025-06-28T23:56:33.830440313+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
7
+ {"time":"2025-06-28T23:57:41.624410258+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
8
+ {"time":"2025-06-28T23:59:11.626386567+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
9
+ {"time":"2025-06-29T00:00:41.62885613+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
10
+ {"time":"2025-06-29T00:02:11.630480713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-06-29T00:03:41.632896773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-06-29T00:05:11.634787095+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
13
+ {"time":"2025-06-29T00:06:41.637469615+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
14
+ {"time":"2025-06-29T00:08:11.640346542+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
15
+ {"time":"2025-06-29T00:09:41.642435888+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
16
+ {"time":"2025-06-29T00:11:11.643824358+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
17
+ {"time":"2025-06-29T00:12:41.645847752+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
18
+ {"time":"2025-06-29T00:14:11.647375061+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
19
+ {"time":"2025-06-29T00:15:41.649750172+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
20
+ {"time":"2025-06-29T00:17:11.652403406+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
21
+ {"time":"2025-06-29T00:18:41.655408766+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
22
+ {"time":"2025-06-29T00:20:11.657578168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
23
+ {"time":"2025-06-29T00:20:11.658626379+08:00","level":"ERROR","msg":"Failed to load features, feature will default to disabled","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
24
+ {"time":"2025-06-29T00:20:11.681430338+08:00","level":"INFO","msg":"created new stream","id":"mz3ej8ig"}
25
+ {"time":"2025-06-29T00:20:11.681462583+08:00","level":"INFO","msg":"stream: started","id":"mz3ej8ig"}
26
+ {"time":"2025-06-29T00:20:11.681507089+08:00","level":"INFO","msg":"sender: started","stream_id":"mz3ej8ig"}
27
+ {"time":"2025-06-29T00:20:11.68149584+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"mz3ej8ig"}
28
+ {"time":"2025-06-29T00:20:11.681547365+08:00","level":"INFO","msg":"handler: started","stream_id":"mz3ej8ig"}
29
+ {"time":"2025-06-29T00:20:11.682347217+08:00","level":"INFO","msg":"stream: closing","id":"mz3ej8ig"}
30
+ {"time":"2025-06-29T00:20:41.686911252+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T00:21:14.167970232+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
32
+ {"time":"2025-06-29T00:21:48.35484514+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
33
+ {"time":"2025-06-29T00:22:27.366864931+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
34
+ {"time":"2025-06-29T00:23:15.010485407+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
35
+ {"time":"2025-06-29T00:24:21.797181116+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
36
+ {"time":"2025-06-29T00:25:51.799191068+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
37
+ {"time":"2025-06-29T00:27:21.801584003+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
38
+ {"time":"2025-06-29T00:28:51.804025943+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
39
+ {"time":"2025-06-29T00:30:11.686396042+08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000441033,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"5dtwv5qkpd53\" always_send:true connection_id:\"127.0.0.1:33438\")"}
40
+ {"time":"2025-06-29T00:30:21.806550019+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
41
+ {"time":"2025-06-29T00:31:51.807720521+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
42
+ {"time":"2025-06-29T00:33:21.80993678+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
43
+ {"time":"2025-06-29T00:34:51.811863783+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
44
+ {"time":"2025-06-29T00:36:21.813546407+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
45
+ {"time":"2025-06-29T00:37:51.815079734+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
46
+ {"time":"2025-06-29T00:39:21.816778971+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
47
+ {"time":"2025-06-29T00:40:11.688234649+08:00","level":"WARN","msg":"sender: taking a long time","seconds":1200.002273841,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"5dtwv5qkpd53\" always_send:true connection_id:\"127.0.0.1:33438\")"}
48
+ {"time":"2025-06-29T00:40:51.81937773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
49
+ {"time":"2025-06-29T00:42:21.820944763+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
50
+ {"time":"2025-06-29T00:43:51.823708778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
51
+ {"time":"2025-06-29T00:45:21.825965407+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
52
+ {"time":"2025-06-29T00:46:51.827707527+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
53
+ {"time":"2025-06-29T00:46:51.828793803+08:00","level":"ERROR","msg":"send: sendRun: failed to update run state: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
54
+ {"time":"2025-06-29T00:46:51.829193166+08:00","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
55
+ {"time":"2025-06-29T00:46:51.82925862+08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1600.143330565,"work":"WorkRecord(*service_go_proto.Record_Run); Control(mailbox_slot:\"5dtwv5qkpd53\" always_send:true connection_id:\"127.0.0.1:33438\")"}
56
+ {"time":"2025-06-29T00:47:21.884399165+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
57
+ {"time":"2025-06-29T00:47:54.221719208+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
58
+ {"time":"2025-06-29T00:48:28.514834678+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
59
+ {"time":"2025-06-29T00:49:08.340678437+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
60
+ {"time":"2025-06-29T00:49:57.755779717+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
61
+ {"time":"2025-06-29T00:51:02.131711303+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
62
+ {"time":"2025-06-29T00:52:32.133820589+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
63
+ {"time":"2025-06-29T00:54:02.13693015+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
64
+ {"time":"2025-06-29T00:55:32.139789811+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
65
+ {"time":"2025-06-29T00:57:02.14632236+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
66
+ {"time":"2025-06-29T00:58:32.148909001+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
67
+ {"time":"2025-06-29T01:00:02.154970223+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
68
+ {"time":"2025-06-29T01:01:32.157829588+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
69
+ {"time":"2025-06-29T01:03:02.159580228+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
70
+ {"time":"2025-06-29T01:04:32.161980354+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
71
+ {"time":"2025-06-29T01:06:02.163980101+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
72
+ {"time":"2025-06-29T01:07:32.166420035+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
73
+ {"time":"2025-06-29T01:09:02.168842314+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
74
+ {"time":"2025-06-29T01:10:32.171375938+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
75
+ {"time":"2025-06-29T01:12:02.17356805+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
76
+ {"time":"2025-06-29T01:13:32.174403974+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
77
+ {"time":"2025-06-29T01:13:32.175440214+08:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s): Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
78
+ {"time":"2025-06-29T01:13:32.175867374+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
79
+ {"time":"2025-06-29T01:13:32.175926547+08:00","level":"INFO","msg":"handler: closed","stream_id":"mz3ej8ig"}
80
+ {"time":"2025-06-29T01:13:32.175943588+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"mz3ej8ig"}
81
+ {"time":"2025-06-29T01:13:32.176007588+08:00","level":"INFO","msg":"sender: closed","stream_id":"mz3ej8ig"}
82
+ {"time":"2025-06-29T01:13:32.180310341+08:00","level":"INFO","msg":"stream: closed","id":"mz3ej8ig"}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug.log ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Configure stats pid to 907243
3
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug.log
7
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/logs/debug-internal.log
8
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 23:53:30,942 INFO MainThread:907243 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 23:53:30,943 INFO MainThread:907243 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 23:53:30,943 INFO MainThread:907243 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 23:53:30,944 INFO MainThread:907243 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 23:53:30,946 INFO MainThread:907243 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 23:53:30,950 INFO MainThread:907243 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 23:53:30,953 INFO MainThread:907243 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 23:54:01,206 WARNING MainThread:907243 [wandb_init.py:init():1681] [no run ID] interrupted
18
+ Traceback (most recent call last):
19
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1677, in init
20
+ return wi.init(run_settings, run_config, run_printer)
21
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1055, in init
22
+ result = wait_with_progress(
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
24
+ return wait_all_with_progress(
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
26
+ return asyncio_compat.run(progress_loop_with_timeout)
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_compat.py", line 30, in run
28
+ return future.result()
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/concurrent/futures/_base.py", line 440, in result
30
+ self._condition.wait(timeout)
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/threading.py", line 320, in wait
32
+ waiter.acquire()
33
+ KeyboardInterrupt
34
+ 2025-06-28 23:54:01,916 INFO MsgRouterThr:907243 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
35
+ 2025-06-28 23:54:02,523 INFO Thread-3 (wrapped_target):907243 [retry.py:__call__():175] [no run ID] Retry attempt failed:
36
+ Traceback (most recent call last):
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 198, in _new_conn
38
+ sock = connection.create_connection(
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
40
+ raise err
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
42
+ sock.connect(sa)
43
+ TimeoutError: timed out
44
+
45
+ The above exception was the direct cause of the following exception:
46
+
47
+ Traceback (most recent call last):
48
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
49
+ response = self._make_request(
50
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request
51
+ raise new_e
52
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request
53
+ self._validate_conn(conn)
54
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
55
+ conn.connect()
56
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 704, in connect
57
+ self.sock = sock = self._new_conn()
58
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 207, in _new_conn
59
+ raise ConnectTimeoutError(
60
+ urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f87fd2d6350>, 'Connection to api.wandb.ai timed out. (connect timeout=20)')
61
+
62
+ The above exception was the direct cause of the following exception:
63
+
64
+ Traceback (most recent call last):
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
66
+ resp = conn.urlopen(
67
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
68
+ retries = retries.increment(
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
70
+ raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
71
+ urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f87fd2d6350>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
72
+
73
+ During handling of the above exception, another exception occurred:
74
+
75
+ Traceback (most recent call last):
76
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 134, in __call__
77
+ result = self._call_fn(*args, **kwargs)
78
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 398, in execute
79
+ return self.client.execute(*args, **kwargs) # type: ignore
80
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
81
+ result = self._get_result(document, *args, **kwargs)
82
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
83
+ return self.transport.execute(document, *args, **kwargs)
84
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/gql_request.py", line 58, in execute
85
+ request = self.session.post(self.url, **post_args)
86
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 637, in post
87
+ return self.request("POST", url, data=data, json=json, **kwargs)
88
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
89
+ resp = self.send(prep, **send_kwargs)
90
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
91
+ r = adapter.send(request, **kwargs)
92
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 688, in send
93
+ raise ConnectTimeout(e, request=request)
94
+ requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f87fd2d6350>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235330-mz3ej8ig/run-mz3ej8ig.wandb ADDED
Binary file (402 Bytes). View file
 
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/config.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 33
39
+ - 55
40
+ - 66
41
+ "4": 3.10.0
42
+ "5": 0.19.11
43
+ "6": 4.52.3
44
+ "8":
45
+ - 5
46
+ "12": 0.19.11
47
+ "13": linux-x86_64
48
+ accelerator:
49
+ value: gpu
50
+ batch_size:
51
+ value: 96
52
+ bert_hidden_dim:
53
+ value: 768
54
+ bert_name:
55
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
56
+ check_val_every_n_epoch:
57
+ value: 1
58
+ cross_attention_freq:
59
+ value: 2
60
+ devices:
61
+ value: 0,1,2,3,4,5,6,7
62
+ filename:
63
+ value: stage1_06282348_ddp
64
+ init_checkpoint:
65
+ value: ""
66
+ init_lr:
67
+ value: 0.0001
68
+ lm:
69
+ value: true
70
+ load_4bit:
71
+ value: false
72
+ lr_decay_rate:
73
+ value: 0.9
74
+ match_batch_size:
75
+ value: 64
76
+ max_epochs:
77
+ value: 20
78
+ min_lr:
79
+ value: 1e-05
80
+ mix_dataset:
81
+ value: true
82
+ mode:
83
+ value: train
84
+ num_query_token:
85
+ value: 8
86
+ num_workers:
87
+ value: 8
88
+ plm_name:
89
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
90
+ plm_tune:
91
+ value: freeze
92
+ pool_size:
93
+ value: 0
94
+ precision:
95
+ value: bf16-mixed
96
+ projection_dim:
97
+ value: 256
98
+ prot_aug:
99
+ value: None
100
+ prot_max_len:
101
+ value: 1024
102
+ ptm:
103
+ value: true
104
+ rerank_cand_num:
105
+ value: 128
106
+ retrieval_eval_epoch:
107
+ value: 10
108
+ root:
109
+ value: data
110
+ save_every_n_epochs:
111
+ value: 5
112
+ scheduler:
113
+ value: linear_warmup_cosine_lr
114
+ seed:
115
+ value: 42
116
+ strategy:
117
+ value: ddp
118
+ temperature:
119
+ value: 0.1
120
+ text_max_len:
121
+ value: 128
122
+ use_wandb_logger:
123
+ value: true
124
+ warmup_lr:
125
+ value: 1e-06
126
+ warmup_steps:
127
+ value: 1000
128
+ weight_decay:
129
+ value: 0.05
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/output.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ W0629 00:01:34.010646 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 908727 via signal SIGTERM
2
+ W0629 00:01:34.011594 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 908857 via signal SIGTERM
3
+ W0629 00:01:34.011932 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909000 via signal SIGTERM
4
+ W0629 00:01:34.012163 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909139 via signal SIGTERM
5
+ W0629 00:01:34.012400 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909288 via signal SIGTERM
6
+ W0629 00:01:34.012614 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909427 via signal SIGTERM
7
+ W0629 00:01:34.012998 908035 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 909567 via signal SIGTERM
8
+ Traceback (most recent call last):
9
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 135, in <module>
10
+ main(args)
11
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 101, in main
12
+ trainer.fit(model, datamodule=dm)
13
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
14
+ call._call_and_handle_interrupt(
15
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
16
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
17
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 144, in launch
18
+ while not process_context.join():
19
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 215, in join
20
+ raise ProcessRaisedException(msg, error_index, failed_process.pid)
21
+ torch.multiprocessing.spawn.ProcessRaisedException: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value `strategy='ddp_find_unused_parameters_true'` or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`.
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stack-data==0.6.3
2
+ yarl==1.20.0
3
+ setuptools==78.1.1
4
+ cloudpathlib==0.21.1
5
+ pytz==2025.2
6
+ nvidia-cufile-cu12==1.11.1.6
7
+ shellingham==1.5.4
8
+ nvidia-cusolver-cu12==11.6.1.9
9
+ Jinja2==3.1.6
10
+ pycocotools==2.0.8
11
+ pandas==2.2.3
12
+ scipy==1.15.3
13
+ tenacity==9.1.2
14
+ lightning-utilities==0.14.3
15
+ cfgv==3.4.0
16
+ hf-xet==1.1.2
17
+ platformdirs==4.3.8
18
+ smart-open==7.1.0
19
+ text-unidecode==1.3
20
+ nvidia-cublas-cu12==12.4.5.8
21
+ omegaconf==2.3.0
22
+ referencing==0.36.2
23
+ mdurl==0.1.2
24
+ gitdb==4.0.12
25
+ identify==2.6.12
26
+ ipython==8.36.0
27
+ spacy-loggers==1.0.5
28
+ distlib==0.3.9
29
+ typing-inspection==0.4.1
30
+ antlr4-python3-runtime==4.9.3
31
+ multidict==6.4.4
32
+ nvidia-curand-cu12==10.3.5.147
33
+ prompt_toolkit==3.0.51
34
+ Pygments==2.19.1
35
+ numpy==2.2.6
36
+ decord==0.6.0
37
+ srsly==2.5.1
38
+ watchdog==6.0.0
39
+ pure_eval==0.2.3
40
+ virtualenv==20.31.2
41
+ altair==5.5.0
42
+ matplotlib-inline==0.1.7
43
+ bleach==6.2.0
44
+ exceptiongroup==1.3.0
45
+ fairscale==0.4.4
46
+ confection==0.1.5
47
+ fonttools==4.58.0
48
+ nvidia-cuda-nvrtc-cu12==12.4.127
49
+ ptyprocess==0.7.0
50
+ pytorch-lightning==2.5.1.post0
51
+ nodeenv==1.9.1
52
+ nvidia-cudnn-cu12==9.1.0.70
53
+ requests==2.32.3
54
+ marisa-trie==1.2.1
55
+ cachetools==5.5.2
56
+ matplotlib==3.10.3
57
+ typing_extensions==4.13.2
58
+ asttokens==3.0.0
59
+ torch==2.6.0
60
+ PyYAML==6.0.2
61
+ tifffile==2025.5.10
62
+ spacy==3.8.7
63
+ braceexpand==0.1.7
64
+ plotly==6.1.1
65
+ attrs==25.3.0
66
+ py-cpuinfo==9.0.0
67
+ frozenlist==1.6.0
68
+ catalogue==2.0.10
69
+ nvidia-cusparselt-cu12==0.6.2
70
+ traitlets==5.14.3
71
+ annotated-types==0.7.0
72
+ language_data==1.3.0
73
+ thinc==8.3.6
74
+ imageio==2.37.0
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ certifi==2025.4.26
77
+ smmap==5.0.2
78
+ python-magic==0.4.27
79
+ triton==3.2.0
80
+ weasel==0.4.1
81
+ async-timeout==5.0.1
82
+ wcwidth==0.2.13
83
+ pillow==11.2.1
84
+ torchmetrics==1.7.1
85
+ kaggle==1.7.4.5
86
+ regex==2024.11.6
87
+ aiosignal==1.3.2
88
+ nvidia-cusparse-cu12==12.3.1.170
89
+ scikit-image==0.25.2
90
+ nvidia-nvtx-cu12==12.4.127
91
+ opendatasets==0.1.22
92
+ iopath==0.1.10
93
+ pyparsing==3.2.3
94
+ portalocker==3.1.1
95
+ executing==2.2.0
96
+ contexttimer==0.3.3
97
+ lazy_loader==0.4
98
+ wrapt==1.17.2
99
+ webdataset==0.2.111
100
+ blis==1.3.0
101
+ idna==3.10
102
+ timm==0.4.12
103
+ einops==0.8.1
104
+ packaging==24.2
105
+ decorator==5.2.1
106
+ filelock==3.18.0
107
+ python-slugify==8.0.4
108
+ cycler==0.12.1
109
+ charset-normalizer==3.4.2
110
+ pydantic==2.11.5
111
+ pydeck==0.9.1
112
+ tzdata==2025.2
113
+ jedi==0.19.2
114
+ aiohappyeyeballs==2.6.1
115
+ nvidia-nvjitlink-cu12==12.4.127
116
+ salesforce-lavis==1.0.2
117
+ parso==0.8.4
118
+ nvidia-nccl-cu12==2.21.5
119
+ toml==0.10.2
120
+ python-dateutil==2.9.0.post0
121
+ rich==14.0.0
122
+ tqdm==4.67.1
123
+ rpds-py==0.25.1
124
+ opencv-python-headless==4.5.5.64
125
+ tornado==6.5.1
126
+ propcache==0.3.1
127
+ webencodings==0.5.1
128
+ murmurhash==1.0.13
129
+ contourpy==1.3.2
130
+ joblib==1.5.1
131
+ networkx==3.4.2
132
+ six==1.17.0
133
+ markdown-it-py==3.0.0
134
+ nvidia-cuda-cupti-cu12==12.4.127
135
+ msgpack==1.1.0
136
+ sentencepiece==0.2.0
137
+ cymem==2.0.11
138
+ nvidia-cufft-cu12==11.2.1.3
139
+ absl-py==2.2.2
140
+ hjson==3.1.0
141
+ mpmath==1.3.0
142
+ pydantic_core==2.33.2
143
+ psutil==7.0.0
144
+ nvidia-ml-py==12.575.51
145
+ pyarrow==20.0.0
146
+ kiwisolver==1.4.8
147
+ sympy==1.13.1
148
+ ninja==1.11.1.4
149
+ rouge_score==0.1.2
150
+ deepspeed==0.16.10+b666844f
151
+ spacy-legacy==3.0.12
152
+ pycocoevalcap==1.2
153
+ pexpect==4.9.0
154
+ ftfy==6.3.1
155
+ protobuf==6.31.0
156
+ urllib3==2.4.0
157
+ wheel==0.45.1
158
+ nltk==3.9.1
159
+ streamlit==1.45.1
160
+ wasabi==1.1.3
161
+ pre_commit==4.2.0
162
+ safetensors==0.5.3
163
+ jsonschema-specifications==2025.4.1
164
+ langcodes==3.5.0
165
+ GitPython==3.1.44
166
+ blinker==1.9.0
167
+ torchvision==0.21.0
168
+ MarkupSafe==3.0.2
169
+ dill==0.3.8
170
+ yacs==0.1.8
171
+ pathlib==1.0.1
172
+ scikit-learn==1.6.1
173
+ cffi==1.17.1
174
+ pycparser==2.22
175
+ flash-attn==2.7.1.post1
176
+ cryptography==45.0.3
177
+ pycryptodome==3.23.0
178
+ cheroot==10.0.1
179
+ more-itertools==10.7.0
180
+ setproctitle==1.3.6
181
+ delta-center-client==0.0.4
182
+ jmespath==0.10.0
183
+ xxhash==3.5.0
184
+ pip==25.1.1
185
+ aliyun-python-sdk-core==2.16.0
186
+ jaraco.functools==4.1.0
187
+ bigmodelvis==0.0.1
188
+ aiohttp==3.12.2
189
+ multiprocess==0.70.16
190
+ opendelta==0.3.2
191
+ docker-pycreds==0.4.0
192
+ threadpoolctl==3.6.0
193
+ click==8.2.1
194
+ oss2==2.15.0
195
+ crcmod==1.7
196
+ transformers==4.52.3
197
+ datasets==3.6.0
198
+ jsonschema==4.24.0
199
+ opencv-python==4.11.0.86
200
+ wandb==0.19.11
201
+ fsspec==2025.3.0
202
+ tokenizers==0.21.1
203
+ sentry-sdk==2.29.1
204
+ preshed==3.0.10
205
+ aliyun-python-sdk-kms==2.16.5
206
+ huggingface-hub==0.32.1
207
+ typer==0.16.0
208
+ narwhals==1.41.0
209
+ web.py==0.62
210
+ autocommand==2.2.2
211
+ importlib_metadata==8.0.0
212
+ zipp==3.19.2
213
+ jaraco.context==5.3.0
214
+ typeguard==4.3.0
215
+ jaraco.collections==5.1.0
216
+ typing_extensions==4.12.2
217
+ backports.tarfile==1.2.0
218
+ jaraco.functools==4.0.1
219
+ more-itertools==10.3.0
220
+ platformdirs==4.2.2
221
+ packaging==24.2
222
+ tomli==2.0.1
223
+ jaraco.text==3.12.1
224
+ wheel==0.45.1
225
+ inflect==7.3.1
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-metadata.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T15:54:48.951303Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06282348_ddp",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "96",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger",
30
+ "--strategy",
31
+ "ddp"
32
+ ],
33
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
34
+ "codePath": "stage1.py",
35
+ "root": "./all_checkpoints/stage1_06282348_ddp/",
36
+ "host": "dsw-265304-57b7b77cbc-vwbwc",
37
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
38
+ "codePathLocal": "stage1.py",
39
+ "cpu_count": 64,
40
+ "cpu_count_logical": 64,
41
+ "gpu": "NVIDIA A800-SXM4-80GB",
42
+ "gpu_count": 8,
43
+ "disk": {
44
+ "/": {
45
+ "total": "1623302262784",
46
+ "used": "1285226496"
47
+ }
48
+ },
49
+ "memory": {
50
+ "total": "549755813888"
51
+ },
52
+ "cpu": {
53
+ "count": 64,
54
+ "countLogical": 64
55
+ },
56
+ "gpu_nvidia": [
57
+ {
58
+ "name": "NVIDIA A800-SXM4-80GB",
59
+ "memoryTotal": "85198045184",
60
+ "architecture": "Ampere"
61
+ },
62
+ {
63
+ "name": "NVIDIA A800-SXM4-80GB",
64
+ "memoryTotal": "85198045184",
65
+ "architecture": "Ampere"
66
+ },
67
+ {
68
+ "name": "NVIDIA A800-SXM4-80GB",
69
+ "memoryTotal": "85198045184",
70
+ "architecture": "Ampere"
71
+ },
72
+ {
73
+ "name": "NVIDIA A800-SXM4-80GB",
74
+ "memoryTotal": "85198045184",
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA A800-SXM4-80GB",
79
+ "memoryTotal": "85198045184",
80
+ "architecture": "Ampere"
81
+ },
82
+ {
83
+ "name": "NVIDIA A800-SXM4-80GB",
84
+ "memoryTotal": "85198045184",
85
+ "architecture": "Ampere"
86
+ },
87
+ {
88
+ "name": "NVIDIA A800-SXM4-80GB",
89
+ "memoryTotal": "85198045184",
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A800-SXM4-80GB",
94
+ "memoryTotal": "85198045184",
95
+ "architecture": "Ampere"
96
+ }
97
+ ],
98
+ "cudaVersion": "12.1"
99
+ }
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":408}}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T23:54:48.952903363+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-core.log"}
2
+ {"time":"2025-06-28T23:55:08.706410301+08:00","level":"INFO","msg":"created new stream","id":"d21a8n96"}
3
+ {"time":"2025-06-28T23:55:08.70719709+08:00","level":"INFO","msg":"stream: started","id":"d21a8n96"}
4
+ {"time":"2025-06-28T23:55:08.707235941+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"d21a8n96"}
5
+ {"time":"2025-06-28T23:55:08.707321639+08:00","level":"INFO","msg":"handler: started","stream_id":"d21a8n96"}
6
+ {"time":"2025-06-28T23:55:08.707259857+08:00","level":"INFO","msg":"sender: started","stream_id":"d21a8n96"}
7
+ {"time":"2025-06-28T23:55:15.734662691+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-28T23:58:30.738393146+08:00","level":"ERROR","msg":"filestream: json decode error: context deadline exceeded (Client.Timeout or context cancellation while reading body)"}
9
+ {"time":"2025-06-28T23:58:30.746811098+08:00","level":"ERROR","msg":"filestream: error closing response body: net/http: request canceled"}
10
+ {"time":"2025-06-29T00:01:37.883490068+08:00","level":"INFO","msg":"stream: closing","id":"d21a8n96"}
11
+ {"time":"2025-06-29T00:01:37.883589533+08:00","level":"INFO","msg":"Stopping system monitor"}
12
+ {"time":"2025-06-29T00:01:37.884424806+08:00","level":"INFO","msg":"Stopped system monitor"}
13
+ {"time":"2025-06-29T00:01:42.08475624+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
14
+ {"time":"2025-06-29T00:02:28.146748335+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/d21a8n96/file_stream\": read tcp 10.1.6.17:45786->172.67.193.61:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:02:54.866501576+08:00","level":"INFO","msg":"handler: closed","stream_id":"d21a8n96"}
16
+ {"time":"2025-06-29T00:02:54.86656561+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"d21a8n96"}
17
+ {"time":"2025-06-29T00:02:54.86658917+08:00","level":"INFO","msg":"sender: closed","stream_id":"d21a8n96"}
18
+ {"time":"2025-06-29T00:02:54.871635053+08:00","level":"INFO","msg":"stream: closed","id":"d21a8n96"}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Configure stats pid to 908035
3
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug.log
7
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/logs/debug-internal.log
8
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 23:54:48,944 INFO MainThread:908035 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 23:54:48,946 INFO MainThread:908035 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 23:54:48,948 INFO MainThread:908035 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 23:54:48,952 INFO MainThread:908035 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 23:54:48,955 INFO MainThread:908035 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 23:55:15,683 INFO MainThread:908035 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-28 23:55:15,875 INFO MainThread:908035 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-28 23:55:15,875 INFO MainThread:908035 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-28 23:55:15,879 INFO MainThread:908035 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-28 23:55:15,879 INFO MainThread:908035 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-28 23:55:15,880 INFO MainThread:908035 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 00:01:37,881 INFO MsgRouterThr:908035 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250628_235448-d21a8n96/run-d21a8n96.wandb ADDED
Binary file (91.6 kB). View file
 
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/files/output.log ADDED
File without changes
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:04:21.83840344+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:04:46.244498134+08:00","level":"INFO","msg":"created new stream","id":"y2lylvs5"}
3
+ {"time":"2025-06-29T00:04:46.245490243+08:00","level":"INFO","msg":"stream: started","id":"y2lylvs5"}
4
+ {"time":"2025-06-29T00:04:46.245519608+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"y2lylvs5"}
5
+ {"time":"2025-06-29T00:04:46.245575002+08:00","level":"INFO","msg":"handler: started","stream_id":"y2lylvs5"}
6
+ {"time":"2025-06-29T00:04:46.245579305+08:00","level":"INFO","msg":"sender: started","stream_id":"y2lylvs5"}
7
+ {"time":"2025-06-29T00:05:16.250479528+08:00","level":"ERROR","msg":"send: sendRun: failed to update run state: context deadline exceeded (Client.Timeout or context cancellation while reading body)"}
8
+ {"time":"2025-06-29T00:05:17.262869937+08:00","level":"INFO","msg":"stream: closing","id":"y2lylvs5"}
9
+ {"time":"2025-06-29T00:05:17.262985061+08:00","level":"ERROR","msg":"sender: upsertConfig: RunRecord is nil"}
10
+ {"time":"2025-06-29T00:05:22.061017174+08:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.bandw.top/graphql"}
11
+ {"time":"2025-06-29T00:05:22.061118361+08:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404: {\"data\":{\"createRunFiles\":null},\"errors\":[{\"message\":\"project /stage1_06282348_ddp not found during createRunFiles\",\"path\":[\"createRunFiles\"]}]}"}
12
+ {"time":"2025-06-29T00:05:22.066030873+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-06-29T00:05:22.067356057+08:00","level":"INFO","msg":"handler: closed","stream_id":"y2lylvs5"}
14
+ {"time":"2025-06-29T00:05:22.067390632+08:00","level":"INFO","msg":"sender: closed","stream_id":"y2lylvs5"}
15
+ {"time":"2025-06-29T00:05:22.067390315+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"y2lylvs5"}
16
+ {"time":"2025-06-29T00:05:22.070426682+08:00","level":"INFO","msg":"stream: closed","id":"y2lylvs5"}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Configure stats pid to 937487
3
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug.log
7
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/logs/debug-internal.log
8
+ 2025-06-29 00:04:21,827 INFO MainThread:937487 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:04:21,828 INFO MainThread:937487 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:04:21,828 INFO MainThread:937487 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:04:21,828 INFO MainThread:937487 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:04:21,829 INFO MainThread:937487 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:04:21,832 INFO MainThread:937487 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:04:21,838 INFO MainThread:937487 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:04:21,842 INFO MainThread:937487 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000421-y2lylvs5/run-y2lylvs5.wandb ADDED
Binary file (404 Bytes). View file
 
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/config.yaml ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": lr
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": trainer/global_step
12
+ "6":
13
+ - 3
14
+ "7": []
15
+ - "1": loader0/val_loss_ptm/dataloader_idx_0
16
+ "5": 2
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": loader2/val_loss/dataloader_idx_2
22
+ "5": 2
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": loader2/val_loss_lm/dataloader_idx_2
28
+ "5": 2
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": epoch
34
+ "5": 2
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": loader2/val_loss_ptc/dataloader_idx_2
40
+ "5": 2
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": loader0/val_loss_ptc/dataloader_idx_0
46
+ "5": 2
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": train_loss_lm
52
+ "5": 2
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": train_loss
58
+ "5": 2
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loader1/val_loss_ptc/dataloader_idx_1
64
+ "5": 2
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": loader1/val_loss_ptm/dataloader_idx_1
70
+ "5": 2
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": loader0/val_loss/dataloader_idx_0
76
+ "5": 2
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ - "1": loader0/val_loss_lm/dataloader_idx_0
82
+ "5": 2
83
+ "6":
84
+ - 1
85
+ - 3
86
+ "7": []
87
+ - "1": train_loss_ptm
88
+ "5": 2
89
+ "6":
90
+ - 1
91
+ - 3
92
+ "7": []
93
+ - "1": train_loss_ptc
94
+ "5": 2
95
+ "6":
96
+ - 1
97
+ - 3
98
+ "7": []
99
+ - "1": loader1/val_loss_lm/dataloader_idx_1
100
+ "5": 2
101
+ "6":
102
+ - 1
103
+ - 3
104
+ "7": []
105
+ - "1": loader1/val_loss/dataloader_idx_1
106
+ "5": 2
107
+ "6":
108
+ - 1
109
+ - 3
110
+ "7": []
111
+ - "1": loader2/val_loss_ptm/dataloader_idx_2
112
+ "5": 2
113
+ "6":
114
+ - 1
115
+ - 3
116
+ "7": []
117
+ python_version: 3.10.0
118
+ t:
119
+ "1":
120
+ - 1
121
+ - 5
122
+ - 9
123
+ - 11
124
+ - 33
125
+ - 41
126
+ - 49
127
+ - 53
128
+ - 55
129
+ - 63
130
+ - 103
131
+ "2":
132
+ - 1
133
+ - 5
134
+ - 9
135
+ - 11
136
+ - 33
137
+ - 41
138
+ - 49
139
+ - 53
140
+ - 55
141
+ - 63
142
+ - 103
143
+ "3":
144
+ - 7
145
+ - 23
146
+ - 33
147
+ - 55
148
+ - 66
149
+ "4": 3.10.0
150
+ "5": 0.19.11
151
+ "6": 4.52.3
152
+ "8":
153
+ - 5
154
+ "12": 0.19.11
155
+ "13": linux-x86_64
156
+ accelerator:
157
+ value: gpu
158
+ batch_size:
159
+ value: 96
160
+ bert_hidden_dim:
161
+ value: 768
162
+ bert_name:
163
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
164
+ check_val_every_n_epoch:
165
+ value: 1
166
+ cross_attention_freq:
167
+ value: 2
168
+ devices:
169
+ value: 0,1,2,3,4,5,6,7
170
+ filename:
171
+ value: stage1_06282348_ddp
172
+ init_checkpoint:
173
+ value: ""
174
+ init_lr:
175
+ value: 0.0001
176
+ lm:
177
+ value: true
178
+ load_4bit:
179
+ value: false
180
+ lr_decay_rate:
181
+ value: 0.9
182
+ match_batch_size:
183
+ value: 64
184
+ max_epochs:
185
+ value: 20
186
+ min_lr:
187
+ value: 1e-05
188
+ mix_dataset:
189
+ value: true
190
+ mode:
191
+ value: train
192
+ num_query_token:
193
+ value: 8
194
+ num_workers:
195
+ value: 8
196
+ plm_name:
197
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
198
+ plm_tune:
199
+ value: freeze
200
+ pool_size:
201
+ value: 0
202
+ precision:
203
+ value: bf16-mixed
204
+ projection_dim:
205
+ value: 256
206
+ prot_aug:
207
+ value: None
208
+ prot_max_len:
209
+ value: 1024
210
+ ptm:
211
+ value: true
212
+ rerank_cand_num:
213
+ value: 128
214
+ retrieval_eval_epoch:
215
+ value: 10
216
+ root:
217
+ value: data
218
+ save_every_n_epochs:
219
+ value: 5
220
+ scheduler:
221
+ value: linear_warmup_cosine_lr
222
+ seed:
223
+ value: 42
224
+ strategy:
225
+ value: ddp
226
+ temperature:
227
+ value: 0.1
228
+ text_max_len:
229
+ value: 128
230
+ use_wandb_logger:
231
+ value: true
232
+ warmup_lr:
233
+ value: 1e-06
234
+ warmup_steps:
235
+ value: 1000
236
+ weight_decay:
237
+ value: 0.05
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/output.log ADDED
File without changes
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stack-data==0.6.3
2
+ yarl==1.20.0
3
+ setuptools==78.1.1
4
+ cloudpathlib==0.21.1
5
+ pytz==2025.2
6
+ nvidia-cufile-cu12==1.11.1.6
7
+ shellingham==1.5.4
8
+ nvidia-cusolver-cu12==11.6.1.9
9
+ Jinja2==3.1.6
10
+ pycocotools==2.0.8
11
+ pandas==2.2.3
12
+ scipy==1.15.3
13
+ tenacity==9.1.2
14
+ lightning-utilities==0.14.3
15
+ cfgv==3.4.0
16
+ hf-xet==1.1.2
17
+ platformdirs==4.3.8
18
+ smart-open==7.1.0
19
+ text-unidecode==1.3
20
+ nvidia-cublas-cu12==12.4.5.8
21
+ omegaconf==2.3.0
22
+ referencing==0.36.2
23
+ mdurl==0.1.2
24
+ gitdb==4.0.12
25
+ identify==2.6.12
26
+ ipython==8.36.0
27
+ spacy-loggers==1.0.5
28
+ distlib==0.3.9
29
+ typing-inspection==0.4.1
30
+ antlr4-python3-runtime==4.9.3
31
+ multidict==6.4.4
32
+ nvidia-curand-cu12==10.3.5.147
33
+ prompt_toolkit==3.0.51
34
+ Pygments==2.19.1
35
+ numpy==2.2.6
36
+ decord==0.6.0
37
+ srsly==2.5.1
38
+ watchdog==6.0.0
39
+ pure_eval==0.2.3
40
+ virtualenv==20.31.2
41
+ altair==5.5.0
42
+ matplotlib-inline==0.1.7
43
+ bleach==6.2.0
44
+ exceptiongroup==1.3.0
45
+ fairscale==0.4.4
46
+ confection==0.1.5
47
+ fonttools==4.58.0
48
+ nvidia-cuda-nvrtc-cu12==12.4.127
49
+ ptyprocess==0.7.0
50
+ pytorch-lightning==2.5.1.post0
51
+ nodeenv==1.9.1
52
+ nvidia-cudnn-cu12==9.1.0.70
53
+ requests==2.32.3
54
+ marisa-trie==1.2.1
55
+ cachetools==5.5.2
56
+ matplotlib==3.10.3
57
+ typing_extensions==4.13.2
58
+ asttokens==3.0.0
59
+ torch==2.6.0
60
+ PyYAML==6.0.2
61
+ tifffile==2025.5.10
62
+ spacy==3.8.7
63
+ braceexpand==0.1.7
64
+ plotly==6.1.1
65
+ attrs==25.3.0
66
+ py-cpuinfo==9.0.0
67
+ frozenlist==1.6.0
68
+ catalogue==2.0.10
69
+ nvidia-cusparselt-cu12==0.6.2
70
+ traitlets==5.14.3
71
+ annotated-types==0.7.0
72
+ language_data==1.3.0
73
+ thinc==8.3.6
74
+ imageio==2.37.0
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ certifi==2025.4.26
77
+ smmap==5.0.2
78
+ python-magic==0.4.27
79
+ triton==3.2.0
80
+ weasel==0.4.1
81
+ async-timeout==5.0.1
82
+ wcwidth==0.2.13
83
+ pillow==11.2.1
84
+ torchmetrics==1.7.1
85
+ kaggle==1.7.4.5
86
+ regex==2024.11.6
87
+ aiosignal==1.3.2
88
+ nvidia-cusparse-cu12==12.3.1.170
89
+ scikit-image==0.25.2
90
+ nvidia-nvtx-cu12==12.4.127
91
+ opendatasets==0.1.22
92
+ iopath==0.1.10
93
+ pyparsing==3.2.3
94
+ portalocker==3.1.1
95
+ executing==2.2.0
96
+ contexttimer==0.3.3
97
+ lazy_loader==0.4
98
+ wrapt==1.17.2
99
+ webdataset==0.2.111
100
+ blis==1.3.0
101
+ idna==3.10
102
+ timm==0.4.12
103
+ einops==0.8.1
104
+ packaging==24.2
105
+ decorator==5.2.1
106
+ filelock==3.18.0
107
+ python-slugify==8.0.4
108
+ cycler==0.12.1
109
+ charset-normalizer==3.4.2
110
+ pydantic==2.11.5
111
+ pydeck==0.9.1
112
+ tzdata==2025.2
113
+ jedi==0.19.2
114
+ aiohappyeyeballs==2.6.1
115
+ nvidia-nvjitlink-cu12==12.4.127
116
+ salesforce-lavis==1.0.2
117
+ parso==0.8.4
118
+ nvidia-nccl-cu12==2.21.5
119
+ toml==0.10.2
120
+ python-dateutil==2.9.0.post0
121
+ rich==14.0.0
122
+ tqdm==4.67.1
123
+ rpds-py==0.25.1
124
+ opencv-python-headless==4.5.5.64
125
+ tornado==6.5.1
126
+ propcache==0.3.1
127
+ webencodings==0.5.1
128
+ murmurhash==1.0.13
129
+ contourpy==1.3.2
130
+ joblib==1.5.1
131
+ networkx==3.4.2
132
+ six==1.17.0
133
+ markdown-it-py==3.0.0
134
+ nvidia-cuda-cupti-cu12==12.4.127
135
+ msgpack==1.1.0
136
+ sentencepiece==0.2.0
137
+ cymem==2.0.11
138
+ nvidia-cufft-cu12==11.2.1.3
139
+ absl-py==2.2.2
140
+ hjson==3.1.0
141
+ mpmath==1.3.0
142
+ pydantic_core==2.33.2
143
+ psutil==7.0.0
144
+ nvidia-ml-py==12.575.51
145
+ pyarrow==20.0.0
146
+ kiwisolver==1.4.8
147
+ sympy==1.13.1
148
+ ninja==1.11.1.4
149
+ rouge_score==0.1.2
150
+ deepspeed==0.16.10+b666844f
151
+ spacy-legacy==3.0.12
152
+ pycocoevalcap==1.2
153
+ pexpect==4.9.0
154
+ ftfy==6.3.1
155
+ protobuf==6.31.0
156
+ urllib3==2.4.0
157
+ wheel==0.45.1
158
+ nltk==3.9.1
159
+ streamlit==1.45.1
160
+ wasabi==1.1.3
161
+ pre_commit==4.2.0
162
+ safetensors==0.5.3
163
+ jsonschema-specifications==2025.4.1
164
+ langcodes==3.5.0
165
+ GitPython==3.1.44
166
+ blinker==1.9.0
167
+ torchvision==0.21.0
168
+ MarkupSafe==3.0.2
169
+ dill==0.3.8
170
+ yacs==0.1.8
171
+ pathlib==1.0.1
172
+ scikit-learn==1.6.1
173
+ cffi==1.17.1
174
+ pycparser==2.22
175
+ flash-attn==2.7.1.post1
176
+ cryptography==45.0.3
177
+ pycryptodome==3.23.0
178
+ cheroot==10.0.1
179
+ more-itertools==10.7.0
180
+ setproctitle==1.3.6
181
+ delta-center-client==0.0.4
182
+ jmespath==0.10.0
183
+ xxhash==3.5.0
184
+ pip==25.1.1
185
+ aliyun-python-sdk-core==2.16.0
186
+ jaraco.functools==4.1.0
187
+ bigmodelvis==0.0.1
188
+ aiohttp==3.12.2
189
+ multiprocess==0.70.16
190
+ opendelta==0.3.2
191
+ docker-pycreds==0.4.0
192
+ threadpoolctl==3.6.0
193
+ click==8.2.1
194
+ oss2==2.15.0
195
+ crcmod==1.7
196
+ transformers==4.52.3
197
+ datasets==3.6.0
198
+ jsonschema==4.24.0
199
+ opencv-python==4.11.0.86
200
+ wandb==0.19.11
201
+ fsspec==2025.3.0
202
+ tokenizers==0.21.1
203
+ sentry-sdk==2.29.1
204
+ preshed==3.0.10
205
+ aliyun-python-sdk-kms==2.16.5
206
+ huggingface-hub==0.32.1
207
+ typer==0.16.0
208
+ narwhals==1.41.0
209
+ web.py==0.62
210
+ autocommand==2.2.2
211
+ importlib_metadata==8.0.0
212
+ zipp==3.19.2
213
+ jaraco.context==5.3.0
214
+ typeguard==4.3.0
215
+ jaraco.collections==5.1.0
216
+ typing_extensions==4.12.2
217
+ backports.tarfile==1.2.0
218
+ jaraco.functools==4.0.1
219
+ more-itertools==10.3.0
220
+ platformdirs==4.2.2
221
+ packaging==24.2
222
+ tomli==2.0.1
223
+ jaraco.text==3.12.1
224
+ wheel==0.45.1
225
+ inflect==7.3.1
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-metadata.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T16:07:02.125247Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06282348_ddp",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "96",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger",
30
+ "--strategy",
31
+ "ddp"
32
+ ],
33
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
34
+ "codePath": "stage1.py",
35
+ "root": "./all_checkpoints/stage1_06282348_ddp/",
36
+ "host": "dsw-265304-57b7b77cbc-vwbwc",
37
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
38
+ "codePathLocal": "stage1.py",
39
+ "cpu_count": 64,
40
+ "cpu_count_logical": 64,
41
+ "gpu": "NVIDIA A800-SXM4-80GB",
42
+ "gpu_count": 8,
43
+ "disk": {
44
+ "/": {
45
+ "total": "1623302262784",
46
+ "used": "1285242880"
47
+ }
48
+ },
49
+ "memory": {
50
+ "total": "549755813888"
51
+ },
52
+ "cpu": {
53
+ "count": 64,
54
+ "countLogical": 64
55
+ },
56
+ "gpu_nvidia": [
57
+ {
58
+ "name": "NVIDIA A800-SXM4-80GB",
59
+ "memoryTotal": "85198045184",
60
+ "architecture": "Ampere"
61
+ },
62
+ {
63
+ "name": "NVIDIA A800-SXM4-80GB",
64
+ "memoryTotal": "85198045184",
65
+ "architecture": "Ampere"
66
+ },
67
+ {
68
+ "name": "NVIDIA A800-SXM4-80GB",
69
+ "memoryTotal": "85198045184",
70
+ "architecture": "Ampere"
71
+ },
72
+ {
73
+ "name": "NVIDIA A800-SXM4-80GB",
74
+ "memoryTotal": "85198045184",
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA A800-SXM4-80GB",
79
+ "memoryTotal": "85198045184",
80
+ "architecture": "Ampere"
81
+ },
82
+ {
83
+ "name": "NVIDIA A800-SXM4-80GB",
84
+ "memoryTotal": "85198045184",
85
+ "architecture": "Ampere"
86
+ },
87
+ {
88
+ "name": "NVIDIA A800-SXM4-80GB",
89
+ "memoryTotal": "85198045184",
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A800-SXM4-80GB",
94
+ "memoryTotal": "85198045184",
95
+ "architecture": "Ampere"
96
+ }
97
+ ],
98
+ "cudaVersion": "12.1"
99
+ }
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"loader1/val_loss_lm/dataloader_idx_1":6.895899772644043,"loader2/val_loss_lm/dataloader_idx_2":7.282508373260498,"loader0/val_loss_ptc/dataloader_idx_0":0.645770788192749,"loader0/val_loss_lm/dataloader_idx_0":2.0093350410461426,"loader2/val_loss_ptc/dataloader_idx_2":4.874267101287842,"_step":243,"lr":1.0554024811426643e-05,"loader0/val_loss/dataloader_idx_0":3.0574357509613037,"train_loss_lm":1.5327174663543701,"train_loss":2.090651035308838,"loader0/val_loss_ptm/dataloader_idx_0":0.4023294746875763,"_runtime":36514.777432942,"loader2/val_loss_ptm/dataloader_idx_2":2.5906364917755127,"trainer/global_step":11199,"loader1/val_loss/dataloader_idx_1":14.83725357055664,"loader1/val_loss_ptm/dataloader_idx_1":2.7793824672698975,"_wandb":{"runtime":36546},"loader2/val_loss/dataloader_idx_2":14.74741268157959,"train_loss_ptm":0.22319991886615753,"train_loss_ptc":0.33473363518714905,"_timestamp":1.751163336902292e+09,"loader1/val_loss_ptc/dataloader_idx_1":5.1619720458984375,"epoch":19}
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug-internal.log ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
3
+ {"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
4
+ {"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
5
+ {"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
6
+ {"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
7
+ {"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
8
+ {"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
10
+ {"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
11
+ {"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
12
+ {"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
13
+ {"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
14
+ {"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
16
+ {"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
17
+ {"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
18
+ {"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
20
+ {"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
21
+ {"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
22
+ {"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
23
+ {"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
24
+ {"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
25
+ {"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
26
+ {"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
27
+ {"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
28
+ {"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
29
+ {"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
30
+ {"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
32
+ {"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
33
+ {"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
34
+ {"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
35
+ {"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
36
+ {"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
37
+ {"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
38
+ {"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
39
+ {"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
40
+ {"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
41
+ {"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
42
+ {"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
3
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
7
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
8
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/config.yaml ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ - "1": train_loss
10
+ "5": 1
11
+ "6":
12
+ - 1
13
+ - 3
14
+ "7": []
15
+ - "1": loader1/val_loss/dataloader_idx_1
16
+ "5": 1
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": loader0/val_loss_lm/dataloader_idx_0
22
+ "5": 1
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": loader1/val_loss_ptm/dataloader_idx_1
28
+ "5": 1
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": loader2/val_loss_lm/dataloader_idx_2
34
+ "5": 1
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": loader1/val_loss_ptc/dataloader_idx_1
40
+ "5": 1
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": epoch
46
+ "5": 1
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": lr
52
+ "5": 1
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": loader2/val_loss_ptc/dataloader_idx_2
58
+ "5": 1
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loader0/val_loss_ptm/dataloader_idx_0
64
+ "5": 1
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": train_loss_ptc
70
+ "5": 1
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": train_loss_ptm
76
+ "5": 1
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ - "1": train_loss_lm
82
+ "5": 1
83
+ "6":
84
+ - 1
85
+ - 3
86
+ "7": []
87
+ - "1": loader2/val_loss_ptm/dataloader_idx_2
88
+ "5": 1
89
+ "6":
90
+ - 1
91
+ - 3
92
+ "7": []
93
+ - "1": loader0/val_loss/dataloader_idx_0
94
+ "5": 1
95
+ "6":
96
+ - 1
97
+ - 3
98
+ "7": []
99
+ - "1": loader2/val_loss/dataloader_idx_2
100
+ "5": 1
101
+ "6":
102
+ - 1
103
+ - 3
104
+ "7": []
105
+ - "1": loader1/val_loss_lm/dataloader_idx_1
106
+ "5": 1
107
+ "6":
108
+ - 1
109
+ - 3
110
+ "7": []
111
+ - "1": loader0/val_loss_ptc/dataloader_idx_0
112
+ "5": 1
113
+ "6":
114
+ - 1
115
+ - 3
116
+ "7": []
117
+ python_version: 3.10.0
118
+ t:
119
+ "1":
120
+ - 1
121
+ - 5
122
+ - 9
123
+ - 11
124
+ - 33
125
+ - 41
126
+ - 49
127
+ - 53
128
+ - 55
129
+ - 63
130
+ - 103
131
+ "2":
132
+ - 1
133
+ - 5
134
+ - 9
135
+ - 11
136
+ - 33
137
+ - 41
138
+ - 49
139
+ - 53
140
+ - 55
141
+ - 63
142
+ - 103
143
+ "3":
144
+ - 7
145
+ - 23
146
+ - 55
147
+ - 66
148
+ "4": 3.10.0
149
+ "5": 0.19.11
150
+ "6": 4.52.3
151
+ "8":
152
+ - 5
153
+ "12": 0.19.11
154
+ "13": linux-x86_64
155
+ accelerator:
156
+ value: gpu
157
+ batch_size:
158
+ value: 168
159
+ bert_hidden_dim:
160
+ value: 768
161
+ bert_name:
162
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
163
+ check_val_every_n_epoch:
164
+ value: 1
165
+ cross_attention_freq:
166
+ value: 2
167
+ devices:
168
+ value: 0,1,2,3,4,5,6,7
169
+ filename:
170
+ value: stage1_06290009_deepspeed
171
+ init_checkpoint:
172
+ value: ""
173
+ init_lr:
174
+ value: 0.0001
175
+ lm:
176
+ value: true
177
+ load_4bit:
178
+ value: false
179
+ lr_decay_rate:
180
+ value: 0.9
181
+ match_batch_size:
182
+ value: 64
183
+ max_epochs:
184
+ value: 20
185
+ min_lr:
186
+ value: 1e-05
187
+ mix_dataset:
188
+ value: true
189
+ mode:
190
+ value: train
191
+ num_query_token:
192
+ value: 8
193
+ num_workers:
194
+ value: 8
195
+ plm_name:
196
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
197
+ plm_tune:
198
+ value: freeze
199
+ pool_size:
200
+ value: 0
201
+ precision:
202
+ value: bf16-mixed
203
+ projection_dim:
204
+ value: 256
205
+ prot_aug:
206
+ value: None
207
+ prot_max_len:
208
+ value: 1024
209
+ ptm:
210
+ value: true
211
+ rerank_cand_num:
212
+ value: 128
213
+ retrieval_eval_epoch:
214
+ value: 10
215
+ root:
216
+ value: data
217
+ save_every_n_epochs:
218
+ value: 5
219
+ scheduler:
220
+ value: linear_warmup_cosine_lr
221
+ seed:
222
+ value: 42
223
+ strategy:
224
+ value: deepspeed
225
+ temperature:
226
+ value: 0.1
227
+ text_max_len:
228
+ value: 128
229
+ use_wandb_logger:
230
+ value: true
231
+ warmup_lr:
232
+ value: 1e-06
233
+ warmup_steps:
234
+ value: 1000
235
+ weight_decay:
236
+ value: 0.05
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/output.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Epoch 19: 100%|███████████████████████████████████████████| 320/320 [17:07<00:00, 0.31it/s, v_num=xzqc]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+
19
+ /nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
20
+ sd = self.module.state_dict(destination, prefix, keep_vars)
21
+ `Trainer.fit` stopped: `max_epochs=20` reached.
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ opendatasets==0.1.22
2
+ salesforce-lavis==1.0.2
3
+ Pygments==2.19.1
4
+ nvidia-nccl-cu12==2.21.5
5
+ tornado==6.5.1
6
+ nvidia-cuda-runtime-cu12==12.4.127
7
+ requests==2.32.3
8
+ nvidia-cuda-cupti-cu12==12.4.127
9
+ decord==0.6.0
10
+ braceexpand==0.1.7
11
+ frozenlist==1.6.0
12
+ markdown-it-py==3.0.0
13
+ shellingham==1.5.4
14
+ absl-py==2.2.2
15
+ pycocoevalcap==1.2
16
+ contexttimer==0.3.3
17
+ bleach==6.2.0
18
+ jsonschema-specifications==2025.4.1
19
+ pycocotools==2.0.8
20
+ python-slugify==8.0.4
21
+ tqdm==4.67.1
22
+ numpy==2.2.6
23
+ urllib3==2.4.0
24
+ deepspeed==0.16.10+b666844f
25
+ watchdog==6.0.0
26
+ wrapt==1.17.2
27
+ setuptools==78.1.1
28
+ matplotlib==3.10.3
29
+ pydeck==0.9.1
30
+ aiosignal==1.3.2
31
+ gitdb==4.0.12
32
+ hjson==3.1.0
33
+ timm==0.4.12
34
+ blis==1.3.0
35
+ PyYAML==6.0.2
36
+ referencing==0.36.2
37
+ contourpy==1.3.2
38
+ kaggle==1.7.4.5
39
+ triton==3.2.0
40
+ catalogue==2.0.10
41
+ idna==3.10
42
+ torch==2.6.0
43
+ text-unidecode==1.3
44
+ altair==5.5.0
45
+ cloudpathlib==0.21.1
46
+ protobuf==6.31.0
47
+ nvidia-cusolver-cu12==11.6.1.9
48
+ pytz==2025.2
49
+ sympy==1.13.1
50
+ spacy==3.8.7
51
+ MarkupSafe==3.0.2
52
+ thinc==8.3.6
53
+ nvidia-cudnn-cu12==9.1.0.70
54
+ wasabi==1.1.3
55
+ aiohappyeyeballs==2.6.1
56
+ nvidia-nvtx-cu12==12.4.127
57
+ rich==14.0.0
58
+ ipython==8.36.0
59
+ yarl==1.20.0
60
+ torchmetrics==1.7.1
61
+ multidict==6.4.4
62
+ cfgv==3.4.0
63
+ smmap==5.0.2
64
+ srsly==2.5.1
65
+ scikit-image==0.25.2
66
+ matplotlib-inline==0.1.7
67
+ annotated-types==0.7.0
68
+ lazy_loader==0.4
69
+ tenacity==9.1.2
70
+ GitPython==3.1.44
71
+ language_data==1.3.0
72
+ pydantic_core==2.33.2
73
+ sentencepiece==0.2.0
74
+ platformdirs==4.3.8
75
+ distlib==0.3.9
76
+ nvidia-cusparselt-cu12==0.6.2
77
+ blinker==1.9.0
78
+ regex==2024.11.6
79
+ tifffile==2025.5.10
80
+ py-cpuinfo==9.0.0
81
+ attrs==25.3.0
82
+ mdurl==0.1.2
83
+ prompt_toolkit==3.0.51
84
+ packaging==24.2
85
+ async-timeout==5.0.1
86
+ six==1.17.0
87
+ executing==2.2.0
88
+ parso==0.8.4
89
+ omegaconf==2.3.0
90
+ wcwidth==0.2.13
91
+ murmurhash==1.0.13
92
+ stack-data==0.6.3
93
+ nvidia-cufft-cu12==11.2.1.3
94
+ virtualenv==20.31.2
95
+ langcodes==3.5.0
96
+ fonttools==4.58.0
97
+ opencv-python-headless==4.5.5.64
98
+ jedi==0.19.2
99
+ torchvision==0.21.0
100
+ plotly==6.1.1
101
+ nodeenv==1.9.1
102
+ smart-open==7.1.0
103
+ toml==0.10.2
104
+ pytorch-lightning==2.5.1.post0
105
+ typing_extensions==4.13.2
106
+ safetensors==0.5.3
107
+ psutil==7.0.0
108
+ pillow==11.2.1
109
+ python-dateutil==2.9.0.post0
110
+ ftfy==6.3.1
111
+ scipy==1.15.3
112
+ webdataset==0.2.111
113
+ charset-normalizer==3.4.2
114
+ nvidia-nvjitlink-cu12==12.4.127
115
+ kiwisolver==1.4.8
116
+ nvidia-ml-py==12.575.51
117
+ confection==0.1.5
118
+ nvidia-curand-cu12==10.3.5.147
119
+ pandas==2.2.3
120
+ nltk==3.9.1
121
+ webencodings==0.5.1
122
+ pyarrow==20.0.0
123
+ asttokens==3.0.0
124
+ exceptiongroup==1.3.0
125
+ pre_commit==4.2.0
126
+ ninja==1.11.1.4
127
+ spacy-loggers==1.0.5
128
+ msgpack==1.1.0
129
+ lightning-utilities==0.14.3
130
+ nvidia-cublas-cu12==12.4.5.8
131
+ tzdata==2025.2
132
+ cycler==0.12.1
133
+ hf-xet==1.1.2
134
+ antlr4-python3-runtime==4.9.3
135
+ iopath==0.1.10
136
+ pexpect==4.9.0
137
+ imageio==2.37.0
138
+ streamlit==1.45.1
139
+ python-magic==0.4.27
140
+ networkx==3.4.2
141
+ portalocker==3.1.1
142
+ nvidia-cusparse-cu12==12.3.1.170
143
+ propcache==0.3.1
144
+ ptyprocess==0.7.0
145
+ fairscale==0.4.4
146
+ rpds-py==0.25.1
147
+ certifi==2025.4.26
148
+ rouge_score==0.1.2
149
+ traitlets==5.14.3
150
+ identify==2.6.12
151
+ spacy-legacy==3.0.12
152
+ weasel==0.4.1
153
+ mpmath==1.3.0
154
+ cymem==2.0.11
155
+ typing-inspection==0.4.1
156
+ nvidia-cuda-nvrtc-cu12==12.4.127
157
+ marisa-trie==1.2.1
158
+ einops==0.8.1
159
+ nvidia-cufile-cu12==1.11.1.6
160
+ pydantic==2.11.5
161
+ cachetools==5.5.2
162
+ joblib==1.5.1
163
+ Jinja2==3.1.6
164
+ filelock==3.18.0
165
+ pyparsing==3.2.3
166
+ pure_eval==0.2.3
167
+ decorator==5.2.1
168
+ wheel==0.45.1
169
+ pycryptodome==3.23.0
170
+ cheroot==10.0.1
171
+ multiprocess==0.70.16
172
+ aiohttp==3.12.2
173
+ crcmod==1.7
174
+ fsspec==2025.3.0
175
+ jmespath==0.10.0
176
+ preshed==3.0.10
177
+ jaraco.functools==4.1.0
178
+ cryptography==45.0.3
179
+ sentry-sdk==2.29.1
180
+ tokenizers==0.21.1
181
+ opendelta==0.3.2
182
+ pycparser==2.22
183
+ narwhals==1.41.0
184
+ scikit-learn==1.6.1
185
+ dill==0.3.8
186
+ oss2==2.15.0
187
+ yacs==0.1.8
188
+ more-itertools==10.7.0
189
+ pip==25.1.1
190
+ threadpoolctl==3.6.0
191
+ flash-attn==2.7.1.post1
192
+ bigmodelvis==0.0.1
193
+ pathlib==1.0.1
194
+ delta-center-client==0.0.4
195
+ xxhash==3.5.0
196
+ wandb==0.19.11
197
+ setproctitle==1.3.6
198
+ aliyun-python-sdk-core==2.16.0
199
+ transformers==4.52.3
200
+ aliyun-python-sdk-kms==2.16.5
201
+ datasets==3.6.0
202
+ typer==0.16.0
203
+ docker-pycreds==0.4.0
204
+ click==8.2.1
205
+ huggingface-hub==0.32.1
206
+ web.py==0.62
207
+ cffi==1.17.1
208
+ opencv-python==4.11.0.86
209
+ jsonschema==4.24.0
210
+ typing_extensions==4.12.2
211
+ jaraco.functools==4.0.1
212
+ jaraco.text==3.12.1
213
+ jaraco.collections==5.1.0
214
+ inflect==7.3.1
215
+ more-itertools==10.3.0
216
+ packaging==24.2
217
+ importlib_metadata==8.0.0
218
+ backports.tarfile==1.2.0
219
+ typeguard==4.3.0
220
+ zipp==3.19.2
221
+ platformdirs==4.2.2
222
+ autocommand==2.2.2
223
+ jaraco.context==5.3.0
224
+ tomli==2.0.1
225
+ wheel==0.45.1
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T16:12:00.926076Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_06290009_deepspeed",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "168",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "root": "./all_checkpoints/stage1_06290009_deepspeed/",
34
+ "host": "dsw-251511-c65bb988c-9g24f",
35
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
36
+ "codePathLocal": "stage1.py",
37
+ "cpu_count": 64,
38
+ "cpu_count_logical": 64,
39
+ "gpu": "NVIDIA A800-SXM4-80GB",
40
+ "gpu_count": 8,
41
+ "disk": {
42
+ "/": {
43
+ "total": "1623302262784",
44
+ "used": "987680768"
45
+ }
46
+ },
47
+ "memory": {
48
+ "total": "549755813888"
49
+ },
50
+ "cpu": {
51
+ "count": 64,
52
+ "countLogical": 64
53
+ },
54
+ "gpu_nvidia": [
55
+ {
56
+ "name": "NVIDIA A800-SXM4-80GB",
57
+ "memoryTotal": "85198045184",
58
+ "architecture": "Ampere"
59
+ },
60
+ {
61
+ "name": "NVIDIA A800-SXM4-80GB",
62
+ "memoryTotal": "85198045184",
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA A800-SXM4-80GB",
67
+ "memoryTotal": "85198045184",
68
+ "architecture": "Ampere"
69
+ },
70
+ {
71
+ "name": "NVIDIA A800-SXM4-80GB",
72
+ "memoryTotal": "85198045184",
73
+ "architecture": "Ampere"
74
+ },
75
+ {
76
+ "name": "NVIDIA A800-SXM4-80GB",
77
+ "memoryTotal": "85198045184",
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A800-SXM4-80GB",
82
+ "memoryTotal": "85198045184",
83
+ "architecture": "Ampere"
84
+ },
85
+ {
86
+ "name": "NVIDIA A800-SXM4-80GB",
87
+ "memoryTotal": "85198045184",
88
+ "architecture": "Ampere"
89
+ },
90
+ {
91
+ "name": "NVIDIA A800-SXM4-80GB",
92
+ "memoryTotal": "85198045184",
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "12.1"
97
+ }
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"loader1/val_loss_lm/dataloader_idx_1":5.460818767547607,"lr":1.0554024811426643e-05,"_timestamp":1.7511476307831557e+09,"loader1/val_loss/dataloader_idx_1":14.773687362670898,"train_loss":2.630859375,"loader2/val_loss_ptm/dataloader_idx_2":2.988206148147583,"loader1/val_loss_ptm/dataloader_idx_1":3.491912603378296,"loader2/val_loss_lm/dataloader_idx_2":6.094737529754639,"_runtime":20509.857471191,"loader2/val_loss_ptc/dataloader_idx_2":5.59119987487793,"_wandb":{"runtime":20517},"train_loss_ptc":0.64306640625,"train_loss_ptm":0.291015625,"loader0/val_loss_ptc/dataloader_idx_0":1.0085703134536743,"loader1/val_loss_ptc/dataloader_idx_1":5.825062274932861,"loader0/val_loss_lm/dataloader_idx_0":2.1440062522888184,"loader2/val_loss/dataloader_idx_2":14.674175262451172,"epoch":19,"train_loss_lm":1.697265625,"_step":147,"loader0/val_loss/dataloader_idx_0":3.6699562072753906,"loader0/val_loss_ptm/dataloader_idx_0":0.5172882676124573,"trainer/global_step":6399}
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:12:00.930911849+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:12:31.03707486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
3
+ {"time":"2025-06-29T00:12:36.876219526+08:00","level":"INFO","msg":"created new stream","id":"vgvxxzqc"}
4
+ {"time":"2025-06-29T00:12:36.876272436+08:00","level":"INFO","msg":"stream: started","id":"vgvxxzqc"}
5
+ {"time":"2025-06-29T00:12:36.876317878+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vgvxxzqc"}
6
+ {"time":"2025-06-29T00:12:36.876360145+08:00","level":"INFO","msg":"handler: started","stream_id":"vgvxxzqc"}
7
+ {"time":"2025-06-29T00:12:36.876401621+08:00","level":"INFO","msg":"sender: started","stream_id":"vgvxxzqc"}
8
+ {"time":"2025-06-29T00:12:39.839397838+08:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2025-06-29T00:13:20.87652364+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58014->104.21.20.172:443: read: connection reset by peer"}
10
+ {"time":"2025-06-29T00:16:17.426969211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
11
+ {"time":"2025-06-29T00:17:58.51721346+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
12
+ {"time":"2025-06-29T00:26:10.222503445+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51478->172.67.193.61:443: read: connection timed out"}
13
+ {"time":"2025-06-29T00:28:34.807681677+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46066->104.21.20.172:443: read: connection reset by peer"}
14
+ {"time":"2025-06-29T00:33:01.870533298+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55734->104.21.20.172:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:33:23.754842161+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:46042->172.67.193.61:443: read: connection reset by peer"}
16
+ {"time":"2025-06-29T00:36:47.149592254+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:57176->172.67.193.61:443: read: connection timed out"}
17
+ {"time":"2025-06-29T00:41:05.645712211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
18
+ {"time":"2025-06-29T00:47:20.493525378+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:58300->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-06-29T00:47:45.629981285+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
20
+ {"time":"2025-06-29T00:51:19.597480154+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:60802->104.21.20.172:443: read: connection timed out"}
21
+ {"time":"2025-06-29T00:54:09.777365701+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:51532->104.21.20.172:443: read: connection reset by peer"}
22
+ {"time":"2025-06-29T01:00:20.78154967+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:34934->172.67.193.61:443: read: connection timed out"}
23
+ {"time":"2025-06-29T01:04:21.421531776+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:55938->104.21.20.172:443: read: connection timed out"}
24
+ {"time":"2025-06-29T01:05:41.05509194+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45678->172.67.193.61:443: read: connection reset by peer"}
25
+ {"time":"2025-06-29T01:08:44.130266043+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
26
+ {"time":"2025-06-29T01:10:59.724692621+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
27
+ {"time":"2025-06-29T01:14:35.821570745+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:35232->104.21.20.172:443: read: connection timed out"}
28
+ {"time":"2025-06-29T01:17:35.533530754+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:44258->104.21.20.172:443: read: connection timed out"}
29
+ {"time":"2025-06-29T01:23:12.630381225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:48108->172.67.193.61:443: read: connection reset by peer"}
30
+ {"time":"2025-06-29T01:24:55.067569821+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T01:25:27.188065501+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
32
+ {"time":"2025-06-29T01:25:56.782489942+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:45024->104.21.20.172:443: read: connection timed out"}
33
+ {"time":"2025-06-29T01:26:49.538892815+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": unexpected EOF"}
34
+ {"time":"2025-06-29T01:29:46.157546022+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06290009_deepspeed/vgvxxzqc/file_stream\": read tcp 10.1.7.100:42508->172.67.193.61:443: read: connection timed out"}
35
+ {"time":"2025-06-29T05:53:57.9412544+08:00","level":"INFO","msg":"stream: closing","id":"vgvxxzqc"}
36
+ {"time":"2025-06-29T05:53:57.941286983+08:00","level":"INFO","msg":"Stopping system monitor"}
37
+ {"time":"2025-06-29T05:53:57.942366437+08:00","level":"INFO","msg":"Stopped system monitor"}
38
+ {"time":"2025-06-29T05:54:00.869660002+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
39
+ {"time":"2025-06-29T05:54:03.731237694+08:00","level":"INFO","msg":"handler: closed","stream_id":"vgvxxzqc"}
40
+ {"time":"2025-06-29T05:54:03.731282348+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"vgvxxzqc"}
41
+ {"time":"2025-06-29T05:54:03.731313818+08:00","level":"INFO","msg":"sender: closed","stream_id":"vgvxxzqc"}
42
+ {"time":"2025-06-29T05:54:03.735031072+08:00","level":"INFO","msg":"stream: closed","id":"vgvxxzqc"}
ProtT3/all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Configure stats pid to 2351
3
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug.log
7
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06290009_deepspeed/wandb/run-20250629_001200-vgvxxzqc/logs/debug-internal.log
8
+ 2025-06-29 00:12:00,920 INFO MainThread:2351 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:12:00,921 INFO MainThread:2351 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:12:00,923 INFO MainThread:2351 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:12:00,924 INFO MainThread:2351 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:12:00,926 INFO MainThread:2351 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:12:00,929 INFO MainThread:2351 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-29 00:12:39,788 INFO MainThread:2351 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-29 00:12:40,030 INFO MainThread:2351 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-29 00:12:40,034 INFO MainThread:2351 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-29 00:12:40,036 INFO MainThread:2351 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 00:12:46,669 INFO MainThread:2351 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_06290009_deepspeed', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 20, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 168, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-06-29 05:53:57,929 INFO MsgRouterThr:2351 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug-internal.log ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-04T17:28:55.160594539+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-core.log"}
2
+ {"time":"2025-07-04T17:28:56.249328974+08:00","level":"INFO","msg":"created new stream","id":"6bkqzmou"}
3
+ {"time":"2025-07-04T17:28:56.249372351+08:00","level":"INFO","msg":"stream: started","id":"6bkqzmou"}
4
+ {"time":"2025-07-04T17:28:56.249400451+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6bkqzmou"}
5
+ {"time":"2025-07-04T17:28:56.249431272+08:00","level":"INFO","msg":"sender: started","stream_id":"6bkqzmou"}
6
+ {"time":"2025-07-04T17:28:56.249469216+08:00","level":"INFO","msg":"handler: started","stream_id":"6bkqzmou"}
7
+ {"time":"2025-07-04T17:28:57.491653525+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-07-04T22:06:52.200518707+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
9
+ {"time":"2025-07-04T22:09:45.338273816+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
10
+ {"time":"2025-07-04T22:19:20.574743081+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59078->172.67.193.61:443: read: connection timed out"}
11
+ {"time":"2025-07-04T22:25:54.288016702+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42132->172.67.193.61:443: read: connection timed out"}
12
+ {"time":"2025-07-04T22:29:40.591991523+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37544->172.67.193.61:443: read: connection timed out"}
13
+ {"time":"2025-07-04T22:36:54.256091094+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40968->172.67.193.61:443: read: connection timed out"}
14
+ {"time":"2025-07-04T22:37:22.364944108+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
15
+ {"time":"2025-07-04T22:40:24.499117928+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
16
+ {"time":"2025-07-04T22:40:51.249223858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
17
+ {"time":"2025-07-04T22:44:05.872015851+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59778->172.67.193.61:443: read: connection timed out"}
18
+ {"time":"2025-07-04T22:49:18.192032141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:47358->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-07-04T22:52:13.295997002+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46182->172.67.193.61:443: read: connection timed out"}
20
+ {"time":"2025-07-04T22:53:26.345699486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
21
+ {"time":"2025-07-04T22:55:37.691524069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
22
+ {"time":"2025-07-04T22:59:01.477384402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
23
+ {"time":"2025-07-04T23:01:22.224282887+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:45222->172.67.193.61:443: read: connection reset by peer"}
24
+ {"time":"2025-07-04T23:06:44.720013857+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46280->104.21.20.172:443: read: connection timed out"}
25
+ {"time":"2025-07-04T23:08:42.894770628+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
26
+ {"time":"2025-07-04T23:10:13.616061547+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57034->172.67.193.61:443: read: connection timed out"}
27
+ {"time":"2025-07-04T23:11:27.896127402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
28
+ {"time":"2025-07-04T23:15:47.19805854+08:00","level":"ERROR","msg":"filestream: json decode error: net/http: request canceled (Client.Timeout or context cancellation while reading body)"}
29
+ {"time":"2025-07-04T23:15:47.222866077+08:00","level":"ERROR","msg":"filestream: error closing response body: net/http: request canceled"}
30
+ {"time":"2025-07-04T23:19:26.063989295+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:35504->172.67.193.61:443: read: connection timed out"}
31
+ {"time":"2025-07-04T23:21:57.905369451+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
32
+ {"time":"2025-07-04T23:23:00.080992848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38378->172.67.193.61:443: read: connection timed out"}
33
+ {"time":"2025-07-04T23:26:54.577250259+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:41928->104.21.20.172:443: read: connection reset by peer"}
34
+ {"time":"2025-07-04T23:28:47.703904029+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38040->172.67.193.61:443: read: connection reset by peer"}
35
+ {"time":"2025-07-04T23:30:12.910139882+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
36
+ {"time":"2025-07-04T23:30:45.313312591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
37
+ {"time":"2025-07-04T23:35:25.039973358+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38402->104.21.20.172:443: read: connection timed out"}
38
+ {"time":"2025-07-04T23:39:27.49206097+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39926->172.67.193.61:443: read: connection reset by peer"}
39
+ {"time":"2025-07-04T23:43:09.424012888+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59448->172.67.193.61:443: read: connection timed out"}
40
+ {"time":"2025-07-04T23:46:07.600020006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33610->172.67.193.61:443: read: connection timed out"}
41
+ {"time":"2025-07-04T23:46:28.951111977+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
42
+ {"time":"2025-07-04T23:48:12.919414088+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
43
+ {"time":"2025-07-04T23:48:45.403207458+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
44
+ {"time":"2025-07-04T23:49:16.527984782+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:49472->172.67.193.61:443: read: connection timed out"}
45
+ {"time":"2025-07-04T23:50:27.921623046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
46
+ {"time":"2025-07-04T23:52:57.899934024+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40636->172.67.193.61:443: read: connection reset by peer"}
47
+ {"time":"2025-07-04T23:56:18.928962652+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53056->172.67.193.61:443: read: connection timed out"}
48
+ {"time":"2025-07-04T23:56:57.924908638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
49
+ {"time":"2025-07-04T23:57:30.375318804+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
50
+ {"time":"2025-07-04T23:58:04.586629939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
51
+ {"time":"2025-07-05T00:03:09.552010125+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39630->172.67.193.61:443: read: connection timed out"}
52
+ {"time":"2025-07-05T00:03:42.930344983+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
53
+ {"time":"2025-07-05T00:04:15.375941679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
54
+ {"time":"2025-07-05T00:04:21.655905995+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:48150->172.67.193.61:443: read: connection reset by peer"}
55
+ {"time":"2025-07-05T00:04:50.22664016+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
56
+ {"time":"2025-07-05T00:08:01.391966638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53260->104.21.20.172:443: read: connection timed out"}
57
+ {"time":"2025-07-05T00:12:23.023992865+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:44698->172.67.193.61:443: read: connection timed out"}
58
+ {"time":"2025-07-05T00:15:42.93731147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
59
+ {"time":"2025-07-05T00:16:15.214992648+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
60
+ {"time":"2025-07-05T00:16:49.667525584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
61
+ {"time":"2025-07-05T00:17:01.040050871+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42760->172.67.193.61:443: read: connection timed out"}
62
+ {"time":"2025-07-05T00:20:02.288062562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37934->104.21.20.172:443: read: connection timed out"}
63
+ {"time":"2025-07-05T00:23:13.264033499+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:51502->172.67.193.61:443: read: connection timed out"}
64
+ {"time":"2025-07-05T00:26:32.944001316+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:50460->172.67.193.61:443: read: connection timed out"}
65
+ {"time":"2025-07-05T00:30:17.199971228+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:43756->104.21.20.172:443: read: connection timed out"}
66
+ {"time":"2025-07-05T00:35:42.94748626+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
67
+ {"time":"2025-07-05T00:35:44.881161178+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59610->172.67.193.61:443: read: connection timed out"}
68
+ {"time":"2025-07-05T00:39:14.287974585+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46688->104.21.20.172:443: read: connection timed out"}
69
+ {"time":"2025-07-05T00:39:33.427697791+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40430->104.21.20.172:443: read: connection reset by peer"}
70
+ {"time":"2025-07-05T00:40:47.648388331+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:60164->172.67.193.61:443: read: connection reset by peer"}
71
+ {"time":"2025-07-05T00:42:34.088456552+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52978->104.21.20.172:443: read: connection reset by peer"}
72
+ {"time":"2025-07-05T00:47:13.006425282+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33354->172.67.193.61:443: read: connection reset by peer"}
73
+ {"time":"2025-07-05T00:49:55.823998082+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33330->172.67.193.61:443: read: connection timed out"}
74
+ {"time":"2025-07-05T00:52:51.439993456+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52416->172.67.193.61:443: read: connection timed out"}
75
+ {"time":"2025-07-05T00:57:07.440983899+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39188->172.67.193.61:443: read: connection timed out"}
76
+ {"time":"2025-07-05T01:04:51.312039238+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59436->172.67.193.61:443: read: connection timed out"}
77
+ {"time":"2025-07-05T01:09:40.080000713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:34348->172.67.193.61:443: read: connection timed out"}
78
+ {"time":"2025-07-05T01:18:31.535996696+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57184->104.21.20.172:443: read: connection timed out"}
79
+ {"time":"2025-07-05T01:25:12.431983593+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59852->172.67.193.61:443: read: connection timed out"}
80
+ {"time":"2025-07-05T11:36:49.210079644+08:00","level":"INFO","msg":"stream: closing","id":"6bkqzmou"}
81
+ {"time":"2025-07-05T11:36:49.210163239+08:00","level":"INFO","msg":"Stopping system monitor"}
82
+ {"time":"2025-07-05T11:36:49.211103046+08:00","level":"INFO","msg":"Stopped system monitor"}
83
+ {"time":"2025-07-05T11:36:51.804545543+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
84
+ {"time":"2025-07-05T11:36:53.755788884+08:00","level":"INFO","msg":"handler: closed","stream_id":"6bkqzmou"}
85
+ {"time":"2025-07-05T11:36:53.755828602+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"6bkqzmou"}
86
+ {"time":"2025-07-05T11:36:53.75584333+08:00","level":"INFO","msg":"sender: closed","stream_id":"6bkqzmou"}
87
+ {"time":"2025-07-05T11:36:53.759902053+08:00","level":"INFO","msg":"stream: closed","id":"6bkqzmou"}
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Configure stats pid to 29356
3
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log
7
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log
8
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():893] starting backend
12
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-04 17:28:55,102 INFO MainThread:29356 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-04 17:28:55,102 INFO MainThread:29356 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-04 17:28:57,453 INFO MainThread:29356 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-04 17:28:57,668 INFO MainThread:29356 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-04 17:28:57,669 INFO MainThread:29356 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-04 17:28:57,678 INFO MainThread:29356 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-04 17:28:57,684 INFO MainThread:29356 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-04 17:28:57,686 INFO MainThread:29356 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-04 17:29:03,015 INFO MainThread:29356 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_07041727_2dataset', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 30, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-07-05 11:36:49,208 INFO MsgRouterThr:29356 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/config.yaml ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": loader1/val_loss_lm/dataloader_idx_1
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": trainer/global_step
12
+ "6":
13
+ - 3
14
+ "7": []
15
+ - "1": loader2/val_loss/dataloader_idx_2
16
+ "5": 2
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": lr
22
+ "5": 2
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": loader1/val_loss/dataloader_idx_1
28
+ "5": 2
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": loader2/val_loss_ptc/dataloader_idx_2
34
+ "5": 2
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": loader1/val_loss_ptm/dataloader_idx_1
40
+ "5": 2
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": loader2/val_loss_ptm/dataloader_idx_2
46
+ "5": 2
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": train_loss
52
+ "5": 2
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": loader0/val_loss_ptc/dataloader_idx_0
58
+ "5": 2
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loader0/val_loss_ptm/dataloader_idx_0
64
+ "5": 2
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": loader1/val_loss_ptc/dataloader_idx_1
70
+ "5": 2
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": loader0/val_loss_lm/dataloader_idx_0
76
+ "5": 2
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ - "1": train_loss_ptc
82
+ "5": 2
83
+ "6":
84
+ - 1
85
+ - 3
86
+ "7": []
87
+ - "1": train_loss_ptm
88
+ "5": 2
89
+ "6":
90
+ - 1
91
+ - 3
92
+ "7": []
93
+ - "1": train_loss_lm
94
+ "5": 2
95
+ "6":
96
+ - 1
97
+ - 3
98
+ "7": []
99
+ - "1": epoch
100
+ "5": 2
101
+ "6":
102
+ - 1
103
+ - 3
104
+ "7": []
105
+ - "1": loader0/val_loss/dataloader_idx_0
106
+ "5": 2
107
+ "6":
108
+ - 1
109
+ - 3
110
+ "7": []
111
+ - "1": loader2/val_loss_lm/dataloader_idx_2
112
+ "5": 2
113
+ "6":
114
+ - 1
115
+ - 3
116
+ "7": []
117
+ python_version: 3.10.0
118
+ t:
119
+ "1":
120
+ - 1
121
+ - 5
122
+ - 9
123
+ - 11
124
+ - 33
125
+ - 41
126
+ - 49
127
+ - 53
128
+ - 55
129
+ - 63
130
+ - 103
131
+ "2":
132
+ - 1
133
+ - 5
134
+ - 9
135
+ - 11
136
+ - 33
137
+ - 41
138
+ - 49
139
+ - 53
140
+ - 55
141
+ - 63
142
+ - 103
143
+ "3":
144
+ - 7
145
+ - 23
146
+ - 55
147
+ - 66
148
+ "4": 3.10.0
149
+ "5": 0.19.11
150
+ "6": 4.52.3
151
+ "8":
152
+ - 5
153
+ "12": 0.19.11
154
+ "13": linux-x86_64
155
+ accelerator:
156
+ value: gpu
157
+ batch_size:
158
+ value: 32
159
+ bert_hidden_dim:
160
+ value: 768
161
+ bert_name:
162
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
163
+ check_val_every_n_epoch:
164
+ value: 1
165
+ cross_attention_freq:
166
+ value: 2
167
+ devices:
168
+ value: 0,1,2,3,4,5,6,7
169
+ filename:
170
+ value: stage1_07041727_2dataset
171
+ init_checkpoint:
172
+ value: ""
173
+ init_lr:
174
+ value: 0.0001
175
+ lm:
176
+ value: true
177
+ load_4bit:
178
+ value: false
179
+ lr_decay_rate:
180
+ value: 0.9
181
+ match_batch_size:
182
+ value: 64
183
+ max_epochs:
184
+ value: 30
185
+ min_lr:
186
+ value: 1e-05
187
+ mix_dataset:
188
+ value: true
189
+ mode:
190
+ value: train
191
+ num_query_token:
192
+ value: 8
193
+ num_workers:
194
+ value: 8
195
+ plm_name:
196
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
197
+ plm_tune:
198
+ value: freeze
199
+ pool_size:
200
+ value: 0
201
+ precision:
202
+ value: bf16-mixed
203
+ projection_dim:
204
+ value: 256
205
+ prot_aug:
206
+ value: None
207
+ prot_max_len:
208
+ value: 1024
209
+ ptm:
210
+ value: true
211
+ rerank_cand_num:
212
+ value: 128
213
+ retrieval_eval_epoch:
214
+ value: 10
215
+ root:
216
+ value: data
217
+ save_every_n_epochs:
218
+ value: 5
219
+ scheduler:
220
+ value: linear_warmup_cosine_lr
221
+ seed:
222
+ value: 42
223
+ strategy:
224
+ value: deepspeed
225
+ temperature:
226
+ value: 0.1
227
+ text_max_len:
228
+ value: 128
229
+ use_wandb_logger:
230
+ value: true
231
+ warmup_lr:
232
+ value: 1e-06
233
+ warmup_steps:
234
+ value: 1000
235
+ weight_decay:
236
+ value: 0.05
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/output.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_07041727_2dataset exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ ------------------------------------------------------
7
+ 0 | blip2qformer | Blip2Qformer | 327 M | train
8
+ ------------------------------------------------------
9
+ 179 M Trainable params
10
+ 147 M Non-trainable params
11
+ 327 M Total params
12
+ 1,309.467 Total estimated model params size (MB)
13
+ 5 Modules in train mode
14
+ 926 Modules in eval mode
15
+ Epoch 29: 100%|█████████████████████████████████████████| 3331/3331 [36:16<00:00, 1.53it/s, v_num=zmou]
16
+ /nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py:220: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
17
+ with torch.cuda.amp.autocast(enable_autocast, dtype=torch.float32):
18
+
19
+ /nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
20
+ sd = self.module.state_dict(destination, prefix, keep_vars)
21
+ `Trainer.fit` stopped: `max_epochs=30` reached.
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ attrs==25.3.0
2
+ tqdm==4.67.1
3
+ langcodes==3.5.0
4
+ nvidia-cublas-cu12==12.4.5.8
5
+ tifffile==2025.5.10
6
+ nvidia-cufile-cu12==1.11.1.6
7
+ nltk==3.9.1
8
+ salesforce-lavis==1.0.2
9
+ tzdata==2025.2
10
+ pyparsing==3.2.3
11
+ six==1.17.0
12
+ python-dateutil==2.9.0.post0
13
+ pandas==2.2.3
14
+ pytorch-lightning==2.5.1.post0
15
+ blinker==1.9.0
16
+ opencv-python-headless==4.5.5.64
17
+ nvidia-cusparse-cu12==12.3.1.170
18
+ pytz==2025.2
19
+ async-timeout==5.0.1
20
+ pillow==11.2.1
21
+ parso==0.8.4
22
+ joblib==1.5.1
23
+ contourpy==1.3.2
24
+ triton==3.2.0
25
+ marisa-trie==1.2.1
26
+ PyYAML==6.0.2
27
+ regex==2024.11.6
28
+ idna==3.10
29
+ nvidia-curand-cu12==10.3.5.147
30
+ rpds-py==0.25.1
31
+ aiosignal==1.3.2
32
+ srsly==2.5.1
33
+ confection==0.1.5
34
+ typing-inspection==0.4.1
35
+ packaging==24.2
36
+ distlib==0.3.9
37
+ networkx==3.4.2
38
+ absl-py==2.2.2
39
+ yarl==1.20.0
40
+ lightning-utilities==0.14.3
41
+ executing==2.2.0
42
+ pycocoevalcap==1.2
43
+ wheel==0.45.1
44
+ nvidia-ml-py==12.575.51
45
+ cycler==0.12.1
46
+ wrapt==1.17.2
47
+ jsonschema-specifications==2025.4.1
48
+ protobuf==6.31.0
49
+ mpmath==1.3.0
50
+ certifi==2025.4.26
51
+ py-cpuinfo==9.0.0
52
+ contexttimer==0.3.3
53
+ watchdog==6.0.0
54
+ pexpect==4.9.0
55
+ webencodings==0.5.1
56
+ hf-xet==1.1.2
57
+ cymem==2.0.11
58
+ requests==2.32.3
59
+ timm==0.4.12
60
+ omegaconf==2.3.0
61
+ nvidia-nvjitlink-cu12==12.4.127
62
+ webdataset==0.2.111
63
+ nodeenv==1.9.1
64
+ frozenlist==1.6.0
65
+ annotated-types==0.7.0
66
+ matplotlib-inline==0.1.7
67
+ urllib3==2.4.0
68
+ rich==14.0.0
69
+ GitPython==3.1.44
70
+ lazy_loader==0.4
71
+ msgpack==1.1.0
72
+ prompt_toolkit==3.0.51
73
+ fonttools==4.58.0
74
+ multidict==6.4.4
75
+ blis==1.3.0
76
+ thinc==8.3.6
77
+ nvidia-nvtx-cu12==12.4.127
78
+ torchmetrics==1.7.1
79
+ weasel==0.4.1
80
+ numpy==2.2.6
81
+ cachetools==5.5.2
82
+ Jinja2==3.1.6
83
+ matplotlib==3.10.3
84
+ nvidia-cudnn-cu12==9.1.0.70
85
+ Pygments==2.19.1
86
+ tornado==6.5.1
87
+ scipy==1.15.3
88
+ rouge_score==0.1.2
89
+ cloudpathlib==0.21.1
90
+ jedi==0.19.2
91
+ referencing==0.36.2
92
+ decord==0.6.0
93
+ setuptools==78.1.1
94
+ mdurl==0.1.2
95
+ identify==2.6.12
96
+ python-slugify==8.0.4
97
+ portalocker==3.1.1
98
+ catalogue==2.0.10
99
+ platformdirs==4.3.8
100
+ antlr4-python3-runtime==4.9.3
101
+ nvidia-cusolver-cu12==11.6.1.9
102
+ kaggle==1.7.4.5
103
+ pydeck==0.9.1
104
+ pydantic==2.11.5
105
+ nvidia-cufft-cu12==11.2.1.3
106
+ pyarrow==20.0.0
107
+ nvidia-nccl-cu12==2.21.5
108
+ markdown-it-py==3.0.0
109
+ gitdb==4.0.12
110
+ altair==5.5.0
111
+ torchvision==0.21.0
112
+ python-magic==0.4.27
113
+ iopath==0.1.10
114
+ smart-open==7.1.0
115
+ torch==2.6.0
116
+ pycocotools==2.0.8
117
+ fairscale==0.4.4
118
+ traitlets==5.14.3
119
+ pure_eval==0.2.3
120
+ sympy==1.13.1
121
+ nvidia-cusparselt-cu12==0.6.2
122
+ imageio==2.37.0
123
+ stack-data==0.6.3
124
+ shellingham==1.5.4
125
+ nvidia-cuda-runtime-cu12==12.4.127
126
+ einops==0.8.1
127
+ tenacity==9.1.2
128
+ virtualenv==20.31.2
129
+ ptyprocess==0.7.0
130
+ cfgv==3.4.0
131
+ pre_commit==4.2.0
132
+ language_data==1.3.0
133
+ typing_extensions==4.13.2
134
+ propcache==0.3.1
135
+ nvidia-cuda-cupti-cu12==12.4.127
136
+ safetensors==0.5.3
137
+ text-unidecode==1.3
138
+ wcwidth==0.2.13
139
+ charset-normalizer==3.4.2
140
+ aiohappyeyeballs==2.6.1
141
+ ipython==8.36.0
142
+ streamlit==1.45.1
143
+ asttokens==3.0.0
144
+ psutil==7.0.0
145
+ smmap==5.0.2
146
+ exceptiongroup==1.3.0
147
+ murmurhash==1.0.13
148
+ filelock==3.18.0
149
+ plotly==6.1.1
150
+ hjson==3.1.0
151
+ pydantic_core==2.33.2
152
+ ninja==1.11.1.4
153
+ kiwisolver==1.4.8
154
+ spacy-legacy==3.0.12
155
+ opendatasets==0.1.22
156
+ decorator==5.2.1
157
+ spacy==3.8.7
158
+ wasabi==1.1.3
159
+ sentencepiece==0.2.0
160
+ toml==0.10.2
161
+ scikit-image==0.25.2
162
+ deepspeed==0.16.10+b666844f
163
+ ftfy==6.3.1
164
+ bleach==6.2.0
165
+ nvidia-cuda-nvrtc-cu12==12.4.127
166
+ spacy-loggers==1.0.5
167
+ MarkupSafe==3.0.2
168
+ braceexpand==0.1.7
169
+ oss2==2.15.0
170
+ preshed==3.0.10
171
+ transformers==4.52.3
172
+ aiohttp==3.12.2
173
+ web.py==0.62
174
+ threadpoolctl==3.6.0
175
+ jaraco.functools==4.1.0
176
+ wandb==0.19.11
177
+ sentry-sdk==2.29.1
178
+ tokenizers==0.21.1
179
+ fsspec==2025.3.0
180
+ flash-attn==2.7.1.post1
181
+ opendelta==0.3.2
182
+ opencv-python==4.11.0.86
183
+ click==8.2.1
184
+ docker-pycreds==0.4.0
185
+ typer==0.16.0
186
+ xxhash==3.5.0
187
+ pathlib==1.0.1
188
+ dill==0.3.8
189
+ crcmod==1.7
190
+ bigmodelvis==0.0.1
191
+ datasets==3.6.0
192
+ pycryptodome==3.23.0
193
+ jsonschema==4.24.0
194
+ aliyun-python-sdk-core==2.16.0
195
+ jmespath==0.10.0
196
+ more-itertools==10.7.0
197
+ scikit-learn==1.6.1
198
+ huggingface-hub==0.32.1
199
+ cryptography==45.0.3
200
+ pycparser==2.22
201
+ yacs==0.1.8
202
+ aliyun-python-sdk-kms==2.16.5
203
+ cffi==1.17.1
204
+ delta-center-client==0.0.4
205
+ multiprocess==0.70.16
206
+ setproctitle==1.3.6
207
+ narwhals==1.41.0
208
+ pip==25.1.1
209
+ cheroot==10.0.1
210
+ jaraco.context==5.3.0
211
+ more-itertools==10.3.0
212
+ jaraco.functools==4.0.1
213
+ jaraco.text==3.12.1
214
+ platformdirs==4.2.2
215
+ packaging==24.2
216
+ wheel==0.45.1
217
+ zipp==3.19.2
218
+ inflect==7.3.1
219
+ autocommand==2.2.2
220
+ typeguard==4.3.0
221
+ jaraco.collections==5.1.0
222
+ backports.tarfile==1.2.0
223
+ tomli==2.0.1
224
+ importlib_metadata==8.0.0
225
+ typing_extensions==4.12.2
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-metadata.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-07-04T09:28:55.102499Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_07041727_2dataset",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "30",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger"
30
+ ],
31
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
32
+ "codePath": "stage1.py",
33
+ "email": "gia0603yucca@gmail.com",
34
+ "root": "./all_checkpoints/stage1_07041727_2dataset/",
35
+ "host": "dsw-266702-5bd8569444-hrqd7",
36
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
37
+ "codePathLocal": "stage1.py",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 64,
40
+ "gpu": "NVIDIA A800-SXM4-80GB",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "1623302262784",
45
+ "used": "1260302336"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "549755813888"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 64
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A800-SXM4-80GB",
58
+ "memoryTotal": "85198045184",
59
+ "architecture": "Ampere"
60
+ },
61
+ {
62
+ "name": "NVIDIA A800-SXM4-80GB",
63
+ "memoryTotal": "85198045184",
64
+ "architecture": "Ampere"
65
+ },
66
+ {
67
+ "name": "NVIDIA A800-SXM4-80GB",
68
+ "memoryTotal": "85198045184",
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA A800-SXM4-80GB",
73
+ "memoryTotal": "85198045184",
74
+ "architecture": "Ampere"
75
+ },
76
+ {
77
+ "name": "NVIDIA A800-SXM4-80GB",
78
+ "memoryTotal": "85198045184",
79
+ "architecture": "Ampere"
80
+ },
81
+ {
82
+ "name": "NVIDIA A800-SXM4-80GB",
83
+ "memoryTotal": "85198045184",
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A800-SXM4-80GB",
88
+ "memoryTotal": "85198045184",
89
+ "architecture": "Ampere"
90
+ },
91
+ {
92
+ "name": "NVIDIA A800-SXM4-80GB",
93
+ "memoryTotal": "85198045184",
94
+ "architecture": "Ampere"
95
+ }
96
+ ],
97
+ "cudaVersion": "12.1"
98
+ }
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"loader2/val_loss/dataloader_idx_2":13.163299560546875,"loader2/val_loss_ptm/dataloader_idx_2":2.123929738998413,"_step":2027,"loader1/val_loss_ptc/dataloader_idx_1":0.5523239970207214,"lr":1.0246513738820795e-05,"epoch":29,"_runtime":65267.979623679,"loader0/val_loss_ptm/dataloader_idx_0":0.3546168804168701,"train_loss_lm":1.42578125,"train_loss_ptc":0.218505859375,"loader0/val_loss/dataloader_idx_0":2.6084561347961426,"_timestamp":1.7516866030817497e+09,"loader2/val_loss_lm/dataloader_idx_2":7.963825225830078,"_wandb":{"runtime":65274},"loader1/val_loss/dataloader_idx_1":2.692500114440918,"loader1/val_loss_lm/dataloader_idx_1":1.6366312503814697,"loader2/val_loss_ptc/dataloader_idx_2":3.075716018676758,"train_loss_ptm":0.2022705078125,"loader0/val_loss_lm/dataloader_idx_0":1.9205952882766724,"loader1/val_loss_ptm/dataloader_idx_1":0.50240159034729,"loader0/val_loss_ptc/dataloader_idx_0":0.3330143988132477,"trainer/global_step":99929,"train_loss":1.845703125}
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-04T17:28:55.160594539+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-core.log"}
2
+ {"time":"2025-07-04T17:28:56.249328974+08:00","level":"INFO","msg":"created new stream","id":"6bkqzmou"}
3
+ {"time":"2025-07-04T17:28:56.249372351+08:00","level":"INFO","msg":"stream: started","id":"6bkqzmou"}
4
+ {"time":"2025-07-04T17:28:56.249400451+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6bkqzmou"}
5
+ {"time":"2025-07-04T17:28:56.249431272+08:00","level":"INFO","msg":"sender: started","stream_id":"6bkqzmou"}
6
+ {"time":"2025-07-04T17:28:56.249469216+08:00","level":"INFO","msg":"handler: started","stream_id":"6bkqzmou"}
7
+ {"time":"2025-07-04T17:28:57.491653525+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-07-04T22:06:52.200518707+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
9
+ {"time":"2025-07-04T22:09:45.338273816+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
10
+ {"time":"2025-07-04T22:19:20.574743081+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59078->172.67.193.61:443: read: connection timed out"}
11
+ {"time":"2025-07-04T22:25:54.288016702+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42132->172.67.193.61:443: read: connection timed out"}
12
+ {"time":"2025-07-04T22:29:40.591991523+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37544->172.67.193.61:443: read: connection timed out"}
13
+ {"time":"2025-07-04T22:36:54.256091094+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40968->172.67.193.61:443: read: connection timed out"}
14
+ {"time":"2025-07-04T22:37:22.364944108+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
15
+ {"time":"2025-07-04T22:40:24.499117928+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
16
+ {"time":"2025-07-04T22:40:51.249223858+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
17
+ {"time":"2025-07-04T22:44:05.872015851+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59778->172.67.193.61:443: read: connection timed out"}
18
+ {"time":"2025-07-04T22:49:18.192032141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:47358->104.21.20.172:443: read: connection timed out"}
19
+ {"time":"2025-07-04T22:52:13.295997002+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46182->172.67.193.61:443: read: connection timed out"}
20
+ {"time":"2025-07-04T22:53:26.345699486+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
21
+ {"time":"2025-07-04T22:55:37.691524069+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
22
+ {"time":"2025-07-04T22:59:01.477384402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
23
+ {"time":"2025-07-04T23:01:22.224282887+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:45222->172.67.193.61:443: read: connection reset by peer"}
24
+ {"time":"2025-07-04T23:06:44.720013857+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46280->104.21.20.172:443: read: connection timed out"}
25
+ {"time":"2025-07-04T23:08:42.894770628+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
26
+ {"time":"2025-07-04T23:10:13.616061547+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57034->172.67.193.61:443: read: connection timed out"}
27
+ {"time":"2025-07-04T23:11:27.896127402+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
28
+ {"time":"2025-07-04T23:15:47.19805854+08:00","level":"ERROR","msg":"filestream: json decode error: net/http: request canceled (Client.Timeout or context cancellation while reading body)"}
29
+ {"time":"2025-07-04T23:15:47.222866077+08:00","level":"ERROR","msg":"filestream: error closing response body: net/http: request canceled"}
30
+ {"time":"2025-07-04T23:19:26.063989295+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:35504->172.67.193.61:443: read: connection timed out"}
31
+ {"time":"2025-07-04T23:21:57.905369451+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
32
+ {"time":"2025-07-04T23:23:00.080992848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38378->172.67.193.61:443: read: connection timed out"}
33
+ {"time":"2025-07-04T23:26:54.577250259+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:41928->104.21.20.172:443: read: connection reset by peer"}
34
+ {"time":"2025-07-04T23:28:47.703904029+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38040->172.67.193.61:443: read: connection reset by peer"}
35
+ {"time":"2025-07-04T23:30:12.910139882+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
36
+ {"time":"2025-07-04T23:30:45.313312591+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
37
+ {"time":"2025-07-04T23:35:25.039973358+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:38402->104.21.20.172:443: read: connection timed out"}
38
+ {"time":"2025-07-04T23:39:27.49206097+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39926->172.67.193.61:443: read: connection reset by peer"}
39
+ {"time":"2025-07-04T23:43:09.424012888+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59448->172.67.193.61:443: read: connection timed out"}
40
+ {"time":"2025-07-04T23:46:07.600020006+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33610->172.67.193.61:443: read: connection timed out"}
41
+ {"time":"2025-07-04T23:46:28.951111977+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": unexpected EOF"}
42
+ {"time":"2025-07-04T23:48:12.919414088+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
43
+ {"time":"2025-07-04T23:48:45.403207458+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
44
+ {"time":"2025-07-04T23:49:16.527984782+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:49472->172.67.193.61:443: read: connection timed out"}
45
+ {"time":"2025-07-04T23:50:27.921623046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
46
+ {"time":"2025-07-04T23:52:57.899934024+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40636->172.67.193.61:443: read: connection reset by peer"}
47
+ {"time":"2025-07-04T23:56:18.928962652+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53056->172.67.193.61:443: read: connection timed out"}
48
+ {"time":"2025-07-04T23:56:57.924908638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
49
+ {"time":"2025-07-04T23:57:30.375318804+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
50
+ {"time":"2025-07-04T23:58:04.586629939+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
51
+ {"time":"2025-07-05T00:03:09.552010125+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39630->172.67.193.61:443: read: connection timed out"}
52
+ {"time":"2025-07-05T00:03:42.930344983+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
53
+ {"time":"2025-07-05T00:04:15.375941679+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
54
+ {"time":"2025-07-05T00:04:21.655905995+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:48150->172.67.193.61:443: read: connection reset by peer"}
55
+ {"time":"2025-07-05T00:04:50.22664016+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
56
+ {"time":"2025-07-05T00:08:01.391966638+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:53260->104.21.20.172:443: read: connection timed out"}
57
+ {"time":"2025-07-05T00:12:23.023992865+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:44698->172.67.193.61:443: read: connection timed out"}
58
+ {"time":"2025-07-05T00:15:42.93731147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
59
+ {"time":"2025-07-05T00:16:15.214992648+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
60
+ {"time":"2025-07-05T00:16:49.667525584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
61
+ {"time":"2025-07-05T00:17:01.040050871+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:42760->172.67.193.61:443: read: connection timed out"}
62
+ {"time":"2025-07-05T00:20:02.288062562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:37934->104.21.20.172:443: read: connection timed out"}
63
+ {"time":"2025-07-05T00:23:13.264033499+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:51502->172.67.193.61:443: read: connection timed out"}
64
+ {"time":"2025-07-05T00:26:32.944001316+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:50460->172.67.193.61:443: read: connection timed out"}
65
+ {"time":"2025-07-05T00:30:17.199971228+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:43756->104.21.20.172:443: read: connection timed out"}
66
+ {"time":"2025-07-05T00:35:42.94748626+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
67
+ {"time":"2025-07-05T00:35:44.881161178+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59610->172.67.193.61:443: read: connection timed out"}
68
+ {"time":"2025-07-05T00:39:14.287974585+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:46688->104.21.20.172:443: read: connection timed out"}
69
+ {"time":"2025-07-05T00:39:33.427697791+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:40430->104.21.20.172:443: read: connection reset by peer"}
70
+ {"time":"2025-07-05T00:40:47.648388331+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:60164->172.67.193.61:443: read: connection reset by peer"}
71
+ {"time":"2025-07-05T00:42:34.088456552+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52978->104.21.20.172:443: read: connection reset by peer"}
72
+ {"time":"2025-07-05T00:47:13.006425282+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33354->172.67.193.61:443: read: connection reset by peer"}
73
+ {"time":"2025-07-05T00:49:55.823998082+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:33330->172.67.193.61:443: read: connection timed out"}
74
+ {"time":"2025-07-05T00:52:51.439993456+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:52416->172.67.193.61:443: read: connection timed out"}
75
+ {"time":"2025-07-05T00:57:07.440983899+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:39188->172.67.193.61:443: read: connection timed out"}
76
+ {"time":"2025-07-05T01:04:51.312039238+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59436->172.67.193.61:443: read: connection timed out"}
77
+ {"time":"2025-07-05T01:09:40.080000713+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:34348->172.67.193.61:443: read: connection timed out"}
78
+ {"time":"2025-07-05T01:18:31.535996696+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:57184->104.21.20.172:443: read: connection timed out"}
79
+ {"time":"2025-07-05T01:25:12.431983593+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_07041727_2dataset/6bkqzmou/file_stream\": read tcp 10.1.9.185:59852->172.67.193.61:443: read: connection timed out"}
80
+ {"time":"2025-07-05T11:36:49.210079644+08:00","level":"INFO","msg":"stream: closing","id":"6bkqzmou"}
81
+ {"time":"2025-07-05T11:36:49.210163239+08:00","level":"INFO","msg":"Stopping system monitor"}
82
+ {"time":"2025-07-05T11:36:49.211103046+08:00","level":"INFO","msg":"Stopped system monitor"}
83
+ {"time":"2025-07-05T11:36:51.804545543+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
84
+ {"time":"2025-07-05T11:36:53.755788884+08:00","level":"INFO","msg":"handler: closed","stream_id":"6bkqzmou"}
85
+ {"time":"2025-07-05T11:36:53.755828602+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"6bkqzmou"}
86
+ {"time":"2025-07-05T11:36:53.75584333+08:00","level":"INFO","msg":"sender: closed","stream_id":"6bkqzmou"}
87
+ {"time":"2025-07-05T11:36:53.759902053+08:00","level":"INFO","msg":"stream: closed","id":"6bkqzmou"}
ProtT3/all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Configure stats pid to 29356
3
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug.log
7
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_07041727_2dataset/wandb/run-20250704_172854-6bkqzmou/logs/debug-internal.log
8
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():893] starting backend
12
+ 2025-07-04 17:28:55,100 INFO MainThread:29356 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-04 17:28:55,102 INFO MainThread:29356 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-04 17:28:55,102 INFO MainThread:29356 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-04 17:28:55,103 INFO MainThread:29356 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-04 17:28:57,453 INFO MainThread:29356 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-04 17:28:57,668 INFO MainThread:29356 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-04 17:28:57,669 INFO MainThread:29356 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-04 17:28:57,678 INFO MainThread:29356 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-04 17:28:57,684 INFO MainThread:29356 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-04 17:28:57,686 INFO MainThread:29356 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-04 17:29:03,015 INFO MainThread:29356 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage1_07041727_2dataset', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 30, 'check_val_every_n_epoch': 1, 'use_wandb_logger': True, 'mix_dataset': True, 'temperature': 0.1, 'save_every_n_epochs': 5, 'ptm': True, 'lm': True, 'rerank_cand_num': 128, 'plm_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'load_4bit': False, 'pool_size': 0, 'bert_hidden_dim': 768, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'projection_dim': 256, 'cross_attention_freq': 2, 'num_query_token': 8, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'init_checkpoint': '', 'retrieval_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'match_batch_size': 64, 'root': 'data', 'text_max_len': 128, 'prot_max_len': 1024, 'prot_aug': 'None'}
24
+ 2025-07-05 11:36:49,208 INFO MsgRouterThr:29356 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_ckpt/wandb/debug-internal.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T22:23:55.245053432+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-core.log"}
2
+ {"time":"2025-06-28T22:23:56.442748363+08:00","level":"INFO","msg":"created new stream","id":"e9wtzwz1"}
3
+ {"time":"2025-06-28T22:23:56.442786823+08:00","level":"INFO","msg":"stream: started","id":"e9wtzwz1"}
4
+ {"time":"2025-06-28T22:23:56.44283909+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"e9wtzwz1"}
5
+ {"time":"2025-06-28T22:23:56.44287811+08:00","level":"INFO","msg":"sender: started","stream_id":"e9wtzwz1"}
6
+ {"time":"2025-06-28T22:23:56.442850569+08:00","level":"INFO","msg":"handler: started","stream_id":"e9wtzwz1"}
7
+ {"time":"2025-06-28T22:23:57.657067842+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-28T23:13:08.786733475+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:41088->172.67.193.61:443: read: connection timed out"}
9
+ {"time":"2025-06-28T23:16:23.858735046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:58168->104.21.20.172:443: read: connection timed out"}
10
+ {"time":"2025-06-28T23:20:12.333412842+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:50886->172.67.193.61:443: read: connection reset by peer"}
11
+ {"time":"2025-06-28T23:28:29.895934993+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": unexpected EOF"}
12
+ {"time":"2025-06-28T23:32:39.731699923+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:54496->104.21.20.172:443: read: connection timed out"}
13
+ {"time":"2025-06-28T23:35:17.938724051+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:33406->172.67.193.61:443: read: connection timed out"}
14
+ {"time":"2025-06-28T23:38:54.515701632+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:59930->172.67.193.61:443: read: connection timed out"}
15
+ {"time":"2025-06-28T23:41:28.010949965+08:00","level":"INFO","msg":"stream: closing","id":"e9wtzwz1"}
16
+ {"time":"2025-06-28T23:41:28.011132748+08:00","level":"INFO","msg":"Stopping system monitor"}
17
+ {"time":"2025-06-28T23:41:28.066664522+08:00","level":"INFO","msg":"Stopped system monitor"}
18
+ {"time":"2025-06-28T23:41:37.996261564+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
19
+ {"time":"2025-06-28T23:42:36.21077519+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:36806->172.67.193.61:443: read: connection timed out"}
ProtT3/all_checkpoints/stage1_ckpt/wandb/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Configure stats pid to 3589
3
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log
7
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log
8
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 22:23:55,237 INFO MainThread:3589 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 22:23:55,239 INFO MainThread:3589 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 22:23:55,240 INFO MainThread:3589 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 22:23:55,249 INFO MainThread:3589 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 22:23:57,648 INFO MainThread:3589 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-28 22:23:57,829 INFO MainThread:3589 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-28 23:41:27,993 INFO MsgRouterThr:3589 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/config.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 33
39
+ - 55
40
+ - 66
41
+ "4": 3.10.0
42
+ "5": 0.19.11
43
+ "6": 4.52.3
44
+ "8":
45
+ - 5
46
+ "12": 0.19.11
47
+ "13": linux-x86_64
48
+ accelerator:
49
+ value: gpu
50
+ batch_size:
51
+ value: 32
52
+ bert_hidden_dim:
53
+ value: 768
54
+ bert_name:
55
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
56
+ check_val_every_n_epoch:
57
+ value: 1
58
+ cross_attention_freq:
59
+ value: 2
60
+ devices:
61
+ value: 0,1,2,3,4,5,6,7
62
+ filename:
63
+ value: stage1_ckpt
64
+ init_checkpoint:
65
+ value: ""
66
+ init_lr:
67
+ value: 0.0001
68
+ lm:
69
+ value: true
70
+ load_4bit:
71
+ value: false
72
+ lr_decay_rate:
73
+ value: 0.9
74
+ match_batch_size:
75
+ value: 64
76
+ max_epochs:
77
+ value: 20
78
+ min_lr:
79
+ value: 1e-05
80
+ mix_dataset:
81
+ value: true
82
+ mode:
83
+ value: train
84
+ num_query_token:
85
+ value: 8
86
+ num_workers:
87
+ value: 8
88
+ plm_name:
89
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
90
+ plm_tune:
91
+ value: freeze
92
+ pool_size:
93
+ value: 0
94
+ precision:
95
+ value: bf16-mixed
96
+ projection_dim:
97
+ value: 256
98
+ prot_aug:
99
+ value: None
100
+ prot_max_len:
101
+ value: 1024
102
+ ptm:
103
+ value: true
104
+ rerank_cand_num:
105
+ value: 128
106
+ retrieval_eval_epoch:
107
+ value: 10
108
+ root:
109
+ value: data_small
110
+ save_every_n_epochs:
111
+ value: 5
112
+ scheduler:
113
+ value: linear_warmup_cosine_lr
114
+ seed:
115
+ value: 42
116
+ strategy:
117
+ value: ddp
118
+ temperature:
119
+ value: 0.1
120
+ text_max_len:
121
+ value: 128
122
+ use_wandb_logger:
123
+ value: true
124
+ warmup_lr:
125
+ value: 1e-06
126
+ warmup_steps:
127
+ value: 1000
128
+ weight_decay:
129
+ value: 0.05
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/output.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ Detected KeyboardInterrupt, attempting graceful shutdown ...
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ marisa-trie==1.2.1
2
+ pydantic==2.11.5
3
+ mdurl==0.1.2
4
+ gitdb==4.0.12
5
+ scikit-image==0.25.2
6
+ async-timeout==5.0.1
7
+ blis==1.3.0
8
+ urllib3==2.4.0
9
+ spacy==3.8.7
10
+ nvidia-ml-py==12.575.51
11
+ braceexpand==0.1.7
12
+ nvidia-cufft-cu12==11.2.1.3
13
+ rich==14.0.0
14
+ setuptools==78.1.1
15
+ matplotlib==3.10.3
16
+ catalogue==2.0.10
17
+ decord==0.6.0
18
+ numpy==2.2.6
19
+ charset-normalizer==3.4.2
20
+ langcodes==3.5.0
21
+ pexpect==4.9.0
22
+ nltk==3.9.1
23
+ cachetools==5.5.2
24
+ cfgv==3.4.0
25
+ prompt_toolkit==3.0.51
26
+ srsly==2.5.1
27
+ einops==0.8.1
28
+ Jinja2==3.1.6
29
+ cloudpathlib==0.21.1
30
+ streamlit==1.45.1
31
+ pydantic_core==2.33.2
32
+ tornado==6.5.1
33
+ nvidia-curand-cu12==10.3.5.147
34
+ deepspeed==0.16.10+b666844f
35
+ networkx==3.4.2
36
+ omegaconf==2.3.0
37
+ msgpack==1.1.0
38
+ pandas==2.2.3
39
+ rouge_score==0.1.2
40
+ six==1.17.0
41
+ language_data==1.3.0
42
+ referencing==0.36.2
43
+ rpds-py==0.25.1
44
+ lazy_loader==0.4
45
+ pydeck==0.9.1
46
+ markdown-it-py==3.0.0
47
+ fonttools==4.58.0
48
+ nvidia-cuda-runtime-cu12==12.4.127
49
+ smart-open==7.1.0
50
+ identify==2.6.12
51
+ pure_eval==0.2.3
52
+ confection==0.1.5
53
+ nvidia-cublas-cu12==12.4.5.8
54
+ nvidia-cusparselt-cu12==0.6.2
55
+ decorator==5.2.1
56
+ nvidia-nccl-cu12==2.21.5
57
+ pytz==2025.2
58
+ nvidia-cudnn-cu12==9.1.0.70
59
+ plotly==6.1.1
60
+ safetensors==0.5.3
61
+ portalocker==3.1.1
62
+ toml==0.10.2
63
+ triton==3.2.0
64
+ cycler==0.12.1
65
+ torch==2.6.0
66
+ python-magic==0.4.27
67
+ ptyprocess==0.7.0
68
+ regex==2024.11.6
69
+ absl-py==2.2.2
70
+ psutil==7.0.0
71
+ murmurhash==1.0.13
72
+ wrapt==1.17.2
73
+ pycocoevalcap==1.2
74
+ python-slugify==8.0.4
75
+ stack-data==0.6.3
76
+ python-dateutil==2.9.0.post0
77
+ scipy==1.15.3
78
+ annotated-types==0.7.0
79
+ mpmath==1.3.0
80
+ ipython==8.36.0
81
+ pyparsing==3.2.3
82
+ nvidia-nvtx-cu12==12.4.127
83
+ fairscale==0.4.4
84
+ jsonschema-specifications==2025.4.1
85
+ matplotlib-inline==0.1.7
86
+ watchdog==6.0.0
87
+ thinc==8.3.6
88
+ antlr4-python3-runtime==4.9.3
89
+ webencodings==0.5.1
90
+ hjson==3.1.0
91
+ propcache==0.3.1
92
+ virtualenv==20.31.2
93
+ pytorch-lightning==2.5.1.post0
94
+ Pygments==2.19.1
95
+ pillow==11.2.1
96
+ joblib==1.5.1
97
+ tqdm==4.67.1
98
+ timm==0.4.12
99
+ nvidia-nvjitlink-cu12==12.4.127
100
+ aiosignal==1.3.2
101
+ kaggle==1.7.4.5
102
+ idna==3.10
103
+ pycocotools==2.0.8
104
+ MarkupSafe==3.0.2
105
+ traitlets==5.14.3
106
+ multidict==6.4.4
107
+ distlib==0.3.9
108
+ torchmetrics==1.7.1
109
+ pyarrow==20.0.0
110
+ tzdata==2025.2
111
+ platformdirs==4.3.8
112
+ yarl==1.20.0
113
+ tenacity==9.1.2
114
+ altair==5.5.0
115
+ wasabi==1.1.3
116
+ attrs==25.3.0
117
+ contourpy==1.3.2
118
+ kiwisolver==1.4.8
119
+ PyYAML==6.0.2
120
+ exceptiongroup==1.3.0
121
+ jedi==0.19.2
122
+ sentencepiece==0.2.0
123
+ nvidia-cusolver-cu12==11.6.1.9
124
+ requests==2.32.3
125
+ opendatasets==0.1.22
126
+ GitPython==3.1.44
127
+ bleach==6.2.0
128
+ protobuf==6.31.0
129
+ sympy==1.13.1
130
+ filelock==3.18.0
131
+ pre_commit==4.2.0
132
+ text-unidecode==1.3
133
+ wheel==0.45.1
134
+ contexttimer==0.3.3
135
+ wcwidth==0.2.13
136
+ spacy-legacy==3.0.12
137
+ aiohappyeyeballs==2.6.1
138
+ imageio==2.37.0
139
+ nodeenv==1.9.1
140
+ py-cpuinfo==9.0.0
141
+ hf-xet==1.1.2
142
+ nvidia-cuda-cupti-cu12==12.4.127
143
+ weasel==0.4.1
144
+ certifi==2025.4.26
145
+ lightning-utilities==0.14.3
146
+ typing_extensions==4.13.2
147
+ typing-inspection==0.4.1
148
+ webdataset==0.2.111
149
+ nvidia-cusparse-cu12==12.3.1.170
150
+ asttokens==3.0.0
151
+ nvidia-cufile-cu12==1.11.1.6
152
+ opencv-python-headless==4.5.5.64
153
+ smmap==5.0.2
154
+ tifffile==2025.5.10
155
+ iopath==0.1.10
156
+ packaging==24.2
157
+ cymem==2.0.11
158
+ spacy-loggers==1.0.5
159
+ ninja==1.11.1.4
160
+ ftfy==6.3.1
161
+ executing==2.2.0
162
+ nvidia-cuda-nvrtc-cu12==12.4.127
163
+ blinker==1.9.0
164
+ torchvision==0.21.0
165
+ parso==0.8.4
166
+ salesforce-lavis==1.0.2
167
+ frozenlist==1.6.0
168
+ shellingham==1.5.4
169
+ flash-attn==2.7.1.post1
170
+ pycparser==2.22
171
+ threadpoolctl==3.6.0
172
+ opencv-python==4.11.0.86
173
+ fsspec==2025.3.0
174
+ aiohttp==3.12.2
175
+ narwhals==1.41.0
176
+ opendelta==0.3.2
177
+ pycryptodome==3.23.0
178
+ crcmod==1.7
179
+ delta-center-client==0.0.4
180
+ tokenizers==0.21.1
181
+ aliyun-python-sdk-kms==2.16.5
182
+ more-itertools==10.7.0
183
+ yacs==0.1.8
184
+ bigmodelvis==0.0.1
185
+ jmespath==0.10.0
186
+ docker-pycreds==0.4.0
187
+ web.py==0.62
188
+ scikit-learn==1.6.1
189
+ pip==25.1.1
190
+ cheroot==10.0.1
191
+ setproctitle==1.3.6
192
+ huggingface-hub==0.32.1
193
+ oss2==2.15.0
194
+ cryptography==45.0.3
195
+ typer==0.16.0
196
+ xxhash==3.5.0
197
+ jsonschema==4.24.0
198
+ click==8.2.1
199
+ preshed==3.0.10
200
+ sentry-sdk==2.29.1
201
+ wandb==0.19.11
202
+ dill==0.3.8
203
+ aliyun-python-sdk-core==2.16.0
204
+ transformers==4.52.3
205
+ cffi==1.17.1
206
+ pathlib==1.0.1
207
+ jaraco.functools==4.1.0
208
+ datasets==3.6.0
209
+ multiprocess==0.70.16
210
+ backports.tarfile==1.2.0
211
+ tomli==2.0.1
212
+ autocommand==2.2.2
213
+ zipp==3.19.2
214
+ jaraco.text==3.12.1
215
+ jaraco.collections==5.1.0
216
+ platformdirs==4.2.2
217
+ typeguard==4.3.0
218
+ typing_extensions==4.12.2
219
+ jaraco.functools==4.0.1
220
+ inflect==7.3.1
221
+ wheel==0.45.1
222
+ more-itertools==10.3.0
223
+ importlib_metadata==8.0.0
224
+ jaraco.context==5.3.0
225
+ packaging==24.2