yuccaaa commited on
Commit
bf3c957
·
verified ·
1 Parent(s): 0ecd035

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_token_acc.png +0 -0
  2. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_total_flos.png +0 -0
  3. BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_train_loss.png +0 -0
  4. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log +63 -0
  5. ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log +23 -0
  6. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-metadata.json +100 -0
  7. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-summary.json +1 -0
  8. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log +15 -0
  9. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log +23 -0
  10. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/config.yaml +129 -0
  11. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/output.log +145 -0
  12. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/requirements.txt +225 -0
  13. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-metadata.json +100 -0
  14. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-summary.json +1 -0
  15. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log +15 -0
  16. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log +23 -0
  17. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/run-rrhzb5iq.wandb +0 -0
  18. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/output.log +0 -0
  19. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/requirements.txt +225 -0
  20. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/wandb-metadata.json +100 -0
  21. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log +7 -0
  22. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log +22 -0
  23. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/run-qflz8r5n.wandb +0 -0
  24. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log +5 -0
  25. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log +94 -0
  26. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/config.yaml +429 -0
  27. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/output.log +2 -0
  28. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/requirements.txt +225 -0
  29. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-metadata.json +100 -0
  30. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-summary.json +1 -0
  31. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log +19 -0
  32. ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log +23 -0
  33. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/files/wandb-metadata.json +107 -0
  34. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log +7 -0
  35. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log +22 -0
  36. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/output.log +4 -0
  37. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/requirements.txt +225 -0
  38. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/wandb-metadata.json +107 -0
  39. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log +7 -0
  40. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log +22 -0
  41. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/run-qhvlkre6.wandb +0 -0
  42. ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_200229-yex1pcwt/files/config.yaml +216 -0
  43. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log +24 -0
  44. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/config.yaml +222 -0
  45. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/output.log +35 -0
  46. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/requirements.txt +225 -0
  47. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-metadata.json +104 -0
  48. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-summary.json +1 -0
  49. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log +95 -0
  50. ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log +24 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_token_acc.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_total_flos.png ADDED
BIO/sft/qwen-production-08022302/v0-20250802-230250/images/train_train_loss.png ADDED
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-29T00:07:02.130913564+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-core.log"}
2
+ {"time":"2025-06-29T00:07:16.339720801+08:00","level":"INFO","msg":"created new stream","id":"rypk39yq"}
3
+ {"time":"2025-06-29T00:07:16.340562919+08:00","level":"INFO","msg":"stream: started","id":"rypk39yq"}
4
+ {"time":"2025-06-29T00:07:16.340584288+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"rypk39yq"}
5
+ {"time":"2025-06-29T00:07:16.340617888+08:00","level":"INFO","msg":"sender: started","stream_id":"rypk39yq"}
6
+ {"time":"2025-06-29T00:07:16.340654242+08:00","level":"INFO","msg":"handler: started","stream_id":"rypk39yq"}
7
+ {"time":"2025-06-29T00:07:28.033909694+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-29T00:12:24.114755958+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52688->104.21.20.172:443: read: connection timed out"}
9
+ {"time":"2025-06-29T00:15:17.682707235+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:43992->104.21.20.172:443: read: connection timed out"}
10
+ {"time":"2025-06-29T00:16:13.20335199+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-06-29T00:16:45.525802023+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-06-29T00:17:19.98711773+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
13
+ {"time":"2025-06-29T00:18:06.642780387+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:36080->172.67.193.61:443: read: connection timed out"}
14
+ {"time":"2025-06-29T00:22:43.123257688+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:52664->172.67.193.61:443: read: connection timed out"}
15
+ {"time":"2025-06-29T00:26:08.434737599+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42534->172.67.193.61:443: read: connection timed out"}
16
+ {"time":"2025-06-29T00:27:44.454100719+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53006->104.21.20.172:443: read: connection reset by peer"}
17
+ {"time":"2025-06-29T00:29:13.211268181+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
18
+ {"time":"2025-06-29T00:29:45.68436365+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
19
+ {"time":"2025-06-29T00:30:19.759580601+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
20
+ {"time":"2025-06-29T00:30:33.650730605+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:38754->172.67.193.61:443: read: connection timed out"}
21
+ {"time":"2025-06-29T00:30:58.011093426+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
22
+ {"time":"2025-06-29T00:34:39.922752645+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:35350->172.67.193.61:443: read: connection timed out"}
23
+ {"time":"2025-06-29T00:36:41.88529828+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
24
+ {"time":"2025-06-29T00:37:20.878368218+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:46470->104.21.20.172:443: read: connection reset by peer"}
25
+ {"time":"2025-06-29T00:38:49.414424011+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
26
+ {"time":"2025-06-29T00:38:58.216757113+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
27
+ {"time":"2025-06-29T00:39:20.141003198+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:53708->104.21.20.172:443: read: connection reset by peer"}
28
+ {"time":"2025-06-29T00:41:33.299264534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44198->104.21.20.172:443: read: connection reset by peer"}
29
+ {"time":"2025-06-29T00:47:37.138754922+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39138->172.67.193.61:443: read: connection timed out"}
30
+ {"time":"2025-06-29T00:54:28.224811124+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
31
+ {"time":"2025-06-29T00:55:15.429710397+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:49584->104.21.20.172:443: read: connection reset by peer"}
32
+ {"time":"2025-06-29T00:55:36.251525534+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:55184->104.21.20.172:443: read: connection reset by peer"}
33
+ {"time":"2025-06-29T00:56:12.092902722+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": http2: client conn is closed"}
34
+ {"time":"2025-06-29T00:59:32.604209299+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:44582->172.67.193.61:443: read: connection reset by peer"}
35
+ {"time":"2025-06-29T01:00:43.231046844+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
36
+ {"time":"2025-06-29T01:05:28.234577388+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
37
+ {"time":"2025-06-29T01:06:00.428439859+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
38
+ {"time":"2025-06-29T01:06:35.403033399+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
39
+ {"time":"2025-06-29T01:07:13.835463934+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
40
+ {"time":"2025-06-29T01:12:30.014897464+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": unexpected EOF"}
41
+ {"time":"2025-06-29T01:14:58.239397356+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
42
+ {"time":"2025-06-29T01:15:30.658073848+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
43
+ {"time":"2025-06-29T01:16:05.133874663+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
44
+ {"time":"2025-06-29T01:16:43.256922452+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
45
+ {"time":"2025-06-29T01:17:07.122753765+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:42208->172.67.193.61:443: read: connection timed out"}
46
+ {"time":"2025-06-29T01:17:31.631854783+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
47
+ {"time":"2025-06-29T01:18:38.479583401+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
48
+ {"time":"2025-06-29T01:20:08.481626584+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
49
+ {"time":"2025-06-29T01:21:38.483904393+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
50
+ {"time":"2025-06-29T01:22:09.185192206+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
51
+ {"time":"2025-06-29T01:28:06.578759778+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream\": read tcp 10.1.6.17:39408->172.67.193.61:443: read: connection timed out"}
52
+ {"time":"2025-06-29T02:00:40.766530394+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.bandw.top/graphql","body":"error code: 502"}
53
+ {"time":"2025-06-29T08:45:43.611887283+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
54
+ {"time":"2025-06-29T08:45:55.061157169+08:00","level":"INFO","msg":"api: retrying HTTP error","status":520,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 520"}
55
+ {"time":"2025-06-29T08:51:23.638432293+08:00","level":"INFO","msg":"api: retrying HTTP error","status":524,"url":"https://api.bandw.top/files/gia0603yucca/stage1_06282348_ddp/rypk39yq/file_stream","body":"error code: 524"}
56
+ {"time":"2025-06-29T10:16:08.309722526+08:00","level":"INFO","msg":"stream: closing","id":"rypk39yq"}
57
+ {"time":"2025-06-29T10:16:08.309813211+08:00","level":"INFO","msg":"Stopping system monitor"}
58
+ {"time":"2025-06-29T10:16:08.311047133+08:00","level":"INFO","msg":"Stopped system monitor"}
59
+ {"time":"2025-06-29T10:16:10.887637294+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
60
+ {"time":"2025-06-29T10:16:11.831362524+08:00","level":"INFO","msg":"handler: closed","stream_id":"rypk39yq"}
61
+ {"time":"2025-06-29T10:16:11.831401295+08:00","level":"INFO","msg":"sender: closed","stream_id":"rypk39yq"}
62
+ {"time":"2025-06-29T10:16:11.831391+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rypk39yq"}
63
+ {"time":"2025-06-29T10:16:11.835883161+08:00","level":"INFO","msg":"stream: closed","id":"rypk39yq"}
ProtT3/all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Configure stats pid to 938398
3
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug.log
7
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_06282348_ddp/wandb/run-20250629_000702-rypk39yq/logs/debug-internal.log
8
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():893] starting backend
12
+ 2025-06-29 00:07:02,120 INFO MainThread:938398 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-29 00:07:02,122 INFO MainThread:938398 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-29 00:07:02,125 INFO MainThread:938398 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-29 00:07:02,126 INFO MainThread:938398 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-29 00:07:02,129 INFO MainThread:938398 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-29 00:07:27,982 INFO MainThread:938398 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-29 00:07:28,171 INFO MainThread:938398 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-29 00:07:28,172 INFO MainThread:938398 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-29 00:07:28,176 INFO MainThread:938398 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-29 00:07:28,177 INFO MainThread:938398 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-29 10:16:08,240 INFO MsgRouterThr:938398 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-metadata.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T12:54:50.854308Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_ckpt",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "32",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger",
30
+ "--strategy",
31
+ "ddp"
32
+ ],
33
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
34
+ "codePath": "stage1.py",
35
+ "email": "gia0603yucca@gmail.com",
36
+ "root": "./all_checkpoints/stage1_ckpt/",
37
+ "host": "dsw-265304-558499d597-hhhs7",
38
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
39
+ "codePathLocal": "stage1.py",
40
+ "cpu_count": 64,
41
+ "cpu_count_logical": 64,
42
+ "gpu": "NVIDIA A800-SXM4-80GB",
43
+ "gpu_count": 8,
44
+ "disk": {
45
+ "/": {
46
+ "total": "1623302262784",
47
+ "used": "11399286784"
48
+ }
49
+ },
50
+ "memory": {
51
+ "total": "549755813888"
52
+ },
53
+ "cpu": {
54
+ "count": 64,
55
+ "countLogical": 64
56
+ },
57
+ "gpu_nvidia": [
58
+ {
59
+ "name": "NVIDIA A800-SXM4-80GB",
60
+ "memoryTotal": "85198045184",
61
+ "architecture": "Ampere"
62
+ },
63
+ {
64
+ "name": "NVIDIA A800-SXM4-80GB",
65
+ "memoryTotal": "85198045184",
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A800-SXM4-80GB",
70
+ "memoryTotal": "85198045184",
71
+ "architecture": "Ampere"
72
+ },
73
+ {
74
+ "name": "NVIDIA A800-SXM4-80GB",
75
+ "memoryTotal": "85198045184",
76
+ "architecture": "Ampere"
77
+ },
78
+ {
79
+ "name": "NVIDIA A800-SXM4-80GB",
80
+ "memoryTotal": "85198045184",
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA A800-SXM4-80GB",
85
+ "memoryTotal": "85198045184",
86
+ "architecture": "Ampere"
87
+ },
88
+ {
89
+ "name": "NVIDIA A800-SXM4-80GB",
90
+ "memoryTotal": "85198045184",
91
+ "architecture": "Ampere"
92
+ },
93
+ {
94
+ "name": "NVIDIA A800-SXM4-80GB",
95
+ "memoryTotal": "85198045184",
96
+ "architecture": "Ampere"
97
+ }
98
+ ],
99
+ "cudaVersion": "12.1"
100
+ }
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":461}}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T20:54:50.857319195+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-core.log"}
2
+ {"time":"2025-06-28T20:54:52.34815986+08:00","level":"INFO","msg":"created new stream","id":"irx8yzsh"}
3
+ {"time":"2025-06-28T20:54:52.348208346+08:00","level":"INFO","msg":"stream: started","id":"irx8yzsh"}
4
+ {"time":"2025-06-28T20:54:52.348232003+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"irx8yzsh"}
5
+ {"time":"2025-06-28T20:54:52.348267908+08:00","level":"INFO","msg":"handler: started","stream_id":"irx8yzsh"}
6
+ {"time":"2025-06-28T20:54:52.348288286+08:00","level":"INFO","msg":"sender: started","stream_id":"irx8yzsh"}
7
+ {"time":"2025-06-28T20:54:53.671617933+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-28T21:02:32.638770807+08:00","level":"INFO","msg":"stream: closing","id":"irx8yzsh"}
9
+ {"time":"2025-06-28T21:02:32.638994707+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-28T21:02:32.646934986+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-06-28T21:02:34.307048324+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2025-06-28T21:02:35.437245138+08:00","level":"INFO","msg":"handler: closed","stream_id":"irx8yzsh"}
13
+ {"time":"2025-06-28T21:02:35.437313629+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"irx8yzsh"}
14
+ {"time":"2025-06-28T21:02:35.437394408+08:00","level":"INFO","msg":"sender: closed","stream_id":"irx8yzsh"}
15
+ {"time":"2025-06-28T21:02:35.441861626+08:00","level":"INFO","msg":"stream: closed","id":"irx8yzsh"}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Configure stats pid to 45186
3
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug.log
7
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_205450-irx8yzsh/logs/debug-internal.log
8
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 20:54:50,842 INFO MainThread:45186 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 20:54:50,845 INFO MainThread:45186 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 20:54:50,852 INFO MainThread:45186 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 20:54:50,855 INFO MainThread:45186 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 20:54:50,859 INFO MainThread:45186 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 20:54:53,659 INFO MainThread:45186 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-28 20:54:53,876 INFO MainThread:45186 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-28 20:54:53,876 INFO MainThread:45186 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-28 20:54:53,923 INFO MainThread:45186 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-28 20:54:53,924 INFO MainThread:45186 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-28 20:54:53,930 INFO MainThread:45186 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-28 21:02:32,610 INFO MsgRouterThr:45186 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/config.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": trainer/global_step
6
+ "6":
7
+ - 3
8
+ "7": []
9
+ python_version: 3.10.0
10
+ t:
11
+ "1":
12
+ - 1
13
+ - 5
14
+ - 9
15
+ - 11
16
+ - 33
17
+ - 41
18
+ - 49
19
+ - 53
20
+ - 55
21
+ - 63
22
+ - 103
23
+ "2":
24
+ - 1
25
+ - 5
26
+ - 9
27
+ - 11
28
+ - 33
29
+ - 41
30
+ - 49
31
+ - 53
32
+ - 55
33
+ - 63
34
+ - 103
35
+ "3":
36
+ - 7
37
+ - 23
38
+ - 33
39
+ - 55
40
+ - 66
41
+ "4": 3.10.0
42
+ "5": 0.19.11
43
+ "6": 4.52.3
44
+ "8":
45
+ - 5
46
+ "12": 0.19.11
47
+ "13": linux-x86_64
48
+ accelerator:
49
+ value: gpu
50
+ batch_size:
51
+ value: 168
52
+ bert_hidden_dim:
53
+ value: 768
54
+ bert_name:
55
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
56
+ check_val_every_n_epoch:
57
+ value: 1
58
+ cross_attention_freq:
59
+ value: 2
60
+ devices:
61
+ value: 0,1,2,3,4,5,6,7
62
+ filename:
63
+ value: stage1_ckpt
64
+ init_checkpoint:
65
+ value: ""
66
+ init_lr:
67
+ value: 0.0001
68
+ lm:
69
+ value: true
70
+ load_4bit:
71
+ value: false
72
+ lr_decay_rate:
73
+ value: 0.9
74
+ match_batch_size:
75
+ value: 64
76
+ max_epochs:
77
+ value: 20
78
+ min_lr:
79
+ value: 1e-05
80
+ mix_dataset:
81
+ value: true
82
+ mode:
83
+ value: train
84
+ num_query_token:
85
+ value: 8
86
+ num_workers:
87
+ value: 8
88
+ plm_name:
89
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
90
+ plm_tune:
91
+ value: freeze
92
+ pool_size:
93
+ value: 0
94
+ precision:
95
+ value: bf16-mixed
96
+ projection_dim:
97
+ value: 256
98
+ prot_aug:
99
+ value: None
100
+ prot_max_len:
101
+ value: 1024
102
+ ptm:
103
+ value: true
104
+ rerank_cand_num:
105
+ value: 128
106
+ retrieval_eval_epoch:
107
+ value: 10
108
+ root:
109
+ value: data_small
110
+ save_every_n_epochs:
111
+ value: 5
112
+ scheduler:
113
+ value: linear_warmup_cosine_lr
114
+ seed:
115
+ value: 42
116
+ strategy:
117
+ value: ddp
118
+ temperature:
119
+ value: 0.1
120
+ text_max_len:
121
+ value: 128
122
+ use_wandb_logger:
123
+ value: true
124
+ warmup_lr:
125
+ value: 1e-06
126
+ warmup_steps:
127
+ value: 1000
128
+ weight_decay:
129
+ value: 0.05
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/output.log ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ W0628 21:09:07.551322 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 234629 via signal SIGTERM
2
+ W0628 21:09:07.552175 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 234847 via signal SIGTERM
3
+ W0628 21:09:07.552469 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235007 via signal SIGTERM
4
+ W0628 21:09:07.552683 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235147 via signal SIGTERM
5
+ W0628 21:09:07.552918 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235302 via signal SIGTERM
6
+ W0628 21:09:07.553165 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235447 via signal SIGTERM
7
+ W0628 21:09:07.553355 234012 site-packages/torch/multiprocessing/spawn.py:169] Terminating process 235583 via signal SIGTERM
8
+ Traceback (most recent call last):
9
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 135, in <module>
10
+ main(args)
11
+ File "/nas/shared/kilab/wangyujia/ProtT3/stage1.py", line 101, in main
12
+ trainer.fit(model, datamodule=dm)
13
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in fit
14
+ call._call_and_handle_interrupt(
15
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
16
+ return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
17
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 144, in launch
18
+ while not process_context.join():
19
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 215, in join
20
+ raise ProcessRaisedException(msg, error_index, failed_process.pid)
21
+ torch.multiprocessing.spawn.ProcessRaisedException:
22
+
23
+ -- Process 1 terminated with the following error:
24
+ Traceback (most recent call last):
25
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
26
+ fn(i, *args)
27
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 173, in _wrapping_function
28
+ results = function(*args, **kwargs)
29
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 599, in _fit_impl
30
+ self._run(model, ckpt_path=ckpt_path)
31
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _run
32
+ results = self._run_stage()
33
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1056, in _run_stage
34
+ self.fit_loop.run()
35
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
36
+ self.advance()
37
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 455, in advance
38
+ self.epoch_loop.run(self._data_fetcher)
39
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 150, in run
40
+ self.advance(data_fetcher)
41
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 320, in advance
42
+ batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
43
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 192, in run
44
+ self._optimizer_step(batch_idx, closure)
45
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 270, in _optimizer_step
46
+ call._call_lightning_module_hook(
47
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 176, in _call_lightning_module_hook
48
+ output = fn(*args, **kwargs)
49
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1302, in optimizer_step
50
+ optimizer.step(closure=optimizer_closure)
51
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 154, in step
52
+ step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
53
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 270, in optimizer_step
54
+ optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
55
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 239, in optimizer_step
56
+ return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
57
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/amp.py", line 76, in optimizer_step
58
+ return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
59
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 123, in optimizer_step
60
+ return optimizer.step(closure=closure, **kwargs)
61
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/optim/optimizer.py", line 493, in wrapper
62
+ out = func(*args, **kwargs)
63
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
64
+ ret = func(self, *args, **kwargs)
65
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/optim/adamw.py", line 220, in step
66
+ loss = closure()
67
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 109, in _wrap_closure
68
+ closure_result = closure()
69
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 146, in __call__
70
+ self._result = self.closure(*args, **kwargs)
71
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
72
+ return func(*args, **kwargs)
73
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 131, in closure
74
+ step_output = self._step_fn()
75
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 319, in _training_step
76
+ training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
77
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 328, in _call_strategy_hook
78
+ output = fn(*args, **kwargs)
79
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
80
+ return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
81
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 641, in __call__
82
+ wrapper_output = wrapper_module(*args, **kwargs)
83
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
84
+ return self._call_impl(*args, **kwargs)
85
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
86
+ return forward_call(*args, **kwargs)
87
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1643, in forward
88
+ else self._run_ddp_forward(*inputs, **kwargs)
89
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1459, in _run_ddp_forward
90
+ return self.module(*inputs, **kwargs) # type: ignore[index]
91
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
92
+ return self._call_impl(*args, **kwargs)
93
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
94
+ return forward_call(*args, **kwargs)
95
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 634, in wrapped_forward
96
+ out = method(*_args, **_kwargs)
97
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2_stage1.py", line 184, in training_step
98
+ blip2_loss = self.blip2qformer(batch)
99
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
100
+ return self._call_impl(*args, **kwargs)
101
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
102
+ return forward_call(*args, **kwargs)
103
+ File "/nas/shared/kilab/wangyujia/ProtT3/model/blip2qformer.py", line 228, in forward
104
+ lm_output = self.Qformer(
105
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
106
+ return self._call_impl(*args, **kwargs)
107
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
108
+ return forward_call(*args, **kwargs)
109
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 1046, in forward
110
+ outputs = self.bert(
111
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
112
+ return self._call_impl(*args, **kwargs)
113
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
114
+ return forward_call(*args, **kwargs)
115
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 937, in forward
116
+ encoder_outputs = self.encoder(
117
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
118
+ return self._call_impl(*args, **kwargs)
119
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
120
+ return forward_call(*args, **kwargs)
121
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 550, in forward
122
+ layer_outputs = layer_module(
123
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
124
+ return self._call_impl(*args, **kwargs)
125
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
126
+ return forward_call(*args, **kwargs)
127
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 464, in forward
128
+ layer_output = apply_chunking_to_forward(
129
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/transformers/pytorch_utils.py", line 253, in apply_chunking_to_forward
130
+ return forward_fn(*input_tensors)
131
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 477, in feed_forward_chunk
132
+ intermediate_output = self.intermediate(attention_output)
133
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
134
+ return self._call_impl(*args, **kwargs)
135
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
136
+ return forward_call(*args, **kwargs)
137
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/lavis/models/blip2_models/Qformer.py", line 359, in forward
138
+ hidden_states = self.dense(hidden_states)
139
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
140
+ return self._call_impl(*args, **kwargs)
141
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
142
+ return forward_call(*args, **kwargs)
143
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 125, in forward
144
+ return F.linear(input, self.weight, self.bias)
145
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 252.00 MiB. GPU 1 has a total capacity of 79.35 GiB of which 112.19 MiB is free. Process 1747899 has 79.23 GiB memory in use. Of the allocated memory 77.42 GiB is allocated by PyTorch, and 738.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ marisa-trie==1.2.1
2
+ pydantic==2.11.5
3
+ mdurl==0.1.2
4
+ gitdb==4.0.12
5
+ scikit-image==0.25.2
6
+ async-timeout==5.0.1
7
+ blis==1.3.0
8
+ urllib3==2.4.0
9
+ spacy==3.8.7
10
+ nvidia-ml-py==12.575.51
11
+ braceexpand==0.1.7
12
+ nvidia-cufft-cu12==11.2.1.3
13
+ rich==14.0.0
14
+ setuptools==78.1.1
15
+ matplotlib==3.10.3
16
+ catalogue==2.0.10
17
+ decord==0.6.0
18
+ numpy==2.2.6
19
+ charset-normalizer==3.4.2
20
+ langcodes==3.5.0
21
+ pexpect==4.9.0
22
+ nltk==3.9.1
23
+ cachetools==5.5.2
24
+ cfgv==3.4.0
25
+ prompt_toolkit==3.0.51
26
+ srsly==2.5.1
27
+ einops==0.8.1
28
+ Jinja2==3.1.6
29
+ cloudpathlib==0.21.1
30
+ streamlit==1.45.1
31
+ pydantic_core==2.33.2
32
+ tornado==6.5.1
33
+ nvidia-curand-cu12==10.3.5.147
34
+ deepspeed==0.16.10+b666844f
35
+ networkx==3.4.2
36
+ omegaconf==2.3.0
37
+ msgpack==1.1.0
38
+ pandas==2.2.3
39
+ rouge_score==0.1.2
40
+ six==1.17.0
41
+ language_data==1.3.0
42
+ referencing==0.36.2
43
+ rpds-py==0.25.1
44
+ lazy_loader==0.4
45
+ pydeck==0.9.1
46
+ markdown-it-py==3.0.0
47
+ fonttools==4.58.0
48
+ nvidia-cuda-runtime-cu12==12.4.127
49
+ smart-open==7.1.0
50
+ identify==2.6.12
51
+ pure_eval==0.2.3
52
+ confection==0.1.5
53
+ nvidia-cublas-cu12==12.4.5.8
54
+ nvidia-cusparselt-cu12==0.6.2
55
+ decorator==5.2.1
56
+ nvidia-nccl-cu12==2.21.5
57
+ pytz==2025.2
58
+ nvidia-cudnn-cu12==9.1.0.70
59
+ plotly==6.1.1
60
+ safetensors==0.5.3
61
+ portalocker==3.1.1
62
+ toml==0.10.2
63
+ triton==3.2.0
64
+ cycler==0.12.1
65
+ torch==2.6.0
66
+ python-magic==0.4.27
67
+ ptyprocess==0.7.0
68
+ regex==2024.11.6
69
+ absl-py==2.2.2
70
+ psutil==7.0.0
71
+ murmurhash==1.0.13
72
+ wrapt==1.17.2
73
+ pycocoevalcap==1.2
74
+ python-slugify==8.0.4
75
+ stack-data==0.6.3
76
+ python-dateutil==2.9.0.post0
77
+ scipy==1.15.3
78
+ annotated-types==0.7.0
79
+ mpmath==1.3.0
80
+ ipython==8.36.0
81
+ pyparsing==3.2.3
82
+ nvidia-nvtx-cu12==12.4.127
83
+ fairscale==0.4.4
84
+ jsonschema-specifications==2025.4.1
85
+ matplotlib-inline==0.1.7
86
+ watchdog==6.0.0
87
+ thinc==8.3.6
88
+ antlr4-python3-runtime==4.9.3
89
+ webencodings==0.5.1
90
+ hjson==3.1.0
91
+ propcache==0.3.1
92
+ virtualenv==20.31.2
93
+ pytorch-lightning==2.5.1.post0
94
+ Pygments==2.19.1
95
+ pillow==11.2.1
96
+ joblib==1.5.1
97
+ tqdm==4.67.1
98
+ timm==0.4.12
99
+ nvidia-nvjitlink-cu12==12.4.127
100
+ aiosignal==1.3.2
101
+ kaggle==1.7.4.5
102
+ idna==3.10
103
+ pycocotools==2.0.8
104
+ MarkupSafe==3.0.2
105
+ traitlets==5.14.3
106
+ multidict==6.4.4
107
+ distlib==0.3.9
108
+ torchmetrics==1.7.1
109
+ pyarrow==20.0.0
110
+ tzdata==2025.2
111
+ platformdirs==4.3.8
112
+ yarl==1.20.0
113
+ tenacity==9.1.2
114
+ altair==5.5.0
115
+ wasabi==1.1.3
116
+ attrs==25.3.0
117
+ contourpy==1.3.2
118
+ kiwisolver==1.4.8
119
+ PyYAML==6.0.2
120
+ exceptiongroup==1.3.0
121
+ jedi==0.19.2
122
+ sentencepiece==0.2.0
123
+ nvidia-cusolver-cu12==11.6.1.9
124
+ requests==2.32.3
125
+ opendatasets==0.1.22
126
+ GitPython==3.1.44
127
+ bleach==6.2.0
128
+ protobuf==6.31.0
129
+ sympy==1.13.1
130
+ filelock==3.18.0
131
+ pre_commit==4.2.0
132
+ text-unidecode==1.3
133
+ wheel==0.45.1
134
+ contexttimer==0.3.3
135
+ wcwidth==0.2.13
136
+ spacy-legacy==3.0.12
137
+ aiohappyeyeballs==2.6.1
138
+ imageio==2.37.0
139
+ nodeenv==1.9.1
140
+ py-cpuinfo==9.0.0
141
+ hf-xet==1.1.2
142
+ nvidia-cuda-cupti-cu12==12.4.127
143
+ weasel==0.4.1
144
+ certifi==2025.4.26
145
+ lightning-utilities==0.14.3
146
+ typing_extensions==4.13.2
147
+ typing-inspection==0.4.1
148
+ webdataset==0.2.111
149
+ nvidia-cusparse-cu12==12.3.1.170
150
+ asttokens==3.0.0
151
+ nvidia-cufile-cu12==1.11.1.6
152
+ opencv-python-headless==4.5.5.64
153
+ smmap==5.0.2
154
+ tifffile==2025.5.10
155
+ iopath==0.1.10
156
+ packaging==24.2
157
+ cymem==2.0.11
158
+ spacy-loggers==1.0.5
159
+ ninja==1.11.1.4
160
+ ftfy==6.3.1
161
+ executing==2.2.0
162
+ nvidia-cuda-nvrtc-cu12==12.4.127
163
+ blinker==1.9.0
164
+ torchvision==0.21.0
165
+ parso==0.8.4
166
+ salesforce-lavis==1.0.2
167
+ frozenlist==1.6.0
168
+ shellingham==1.5.4
169
+ flash-attn==2.7.1.post1
170
+ pycparser==2.22
171
+ threadpoolctl==3.6.0
172
+ opencv-python==4.11.0.86
173
+ fsspec==2025.3.0
174
+ aiohttp==3.12.2
175
+ narwhals==1.41.0
176
+ opendelta==0.3.2
177
+ pycryptodome==3.23.0
178
+ crcmod==1.7
179
+ delta-center-client==0.0.4
180
+ tokenizers==0.21.1
181
+ aliyun-python-sdk-kms==2.16.5
182
+ more-itertools==10.7.0
183
+ yacs==0.1.8
184
+ bigmodelvis==0.0.1
185
+ jmespath==0.10.0
186
+ docker-pycreds==0.4.0
187
+ web.py==0.62
188
+ scikit-learn==1.6.1
189
+ pip==25.1.1
190
+ cheroot==10.0.1
191
+ setproctitle==1.3.6
192
+ huggingface-hub==0.32.1
193
+ oss2==2.15.0
194
+ cryptography==45.0.3
195
+ typer==0.16.0
196
+ xxhash==3.5.0
197
+ jsonschema==4.24.0
198
+ click==8.2.1
199
+ preshed==3.0.10
200
+ sentry-sdk==2.29.1
201
+ wandb==0.19.11
202
+ dill==0.3.8
203
+ aliyun-python-sdk-core==2.16.0
204
+ transformers==4.52.3
205
+ cffi==1.17.1
206
+ pathlib==1.0.1
207
+ jaraco.functools==4.1.0
208
+ datasets==3.6.0
209
+ multiprocess==0.70.16
210
+ backports.tarfile==1.2.0
211
+ tomli==2.0.1
212
+ autocommand==2.2.2
213
+ zipp==3.19.2
214
+ jaraco.text==3.12.1
215
+ jaraco.collections==5.1.0
216
+ platformdirs==4.2.2
217
+ typeguard==4.3.0
218
+ typing_extensions==4.12.2
219
+ jaraco.functools==4.0.1
220
+ inflect==7.3.1
221
+ wheel==0.45.1
222
+ more-itertools==10.3.0
223
+ importlib_metadata==8.0.0
224
+ jaraco.context==5.3.0
225
+ packaging==24.2
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-metadata.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T13:03:20.280479Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_ckpt",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "168",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger",
30
+ "--strategy",
31
+ "ddp"
32
+ ],
33
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
34
+ "codePath": "stage1.py",
35
+ "email": "gia0603yucca@gmail.com",
36
+ "root": "./all_checkpoints/stage1_ckpt/",
37
+ "host": "dsw-265304-558499d597-hhhs7",
38
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
39
+ "codePathLocal": "stage1.py",
40
+ "cpu_count": 64,
41
+ "cpu_count_logical": 64,
42
+ "gpu": "NVIDIA A800-SXM4-80GB",
43
+ "gpu_count": 8,
44
+ "disk": {
45
+ "/": {
46
+ "total": "1623302262784",
47
+ "used": "11399290880"
48
+ }
49
+ },
50
+ "memory": {
51
+ "total": "549755813888"
52
+ },
53
+ "cpu": {
54
+ "count": 64,
55
+ "countLogical": 64
56
+ },
57
+ "gpu_nvidia": [
58
+ {
59
+ "name": "NVIDIA A800-SXM4-80GB",
60
+ "memoryTotal": "85198045184",
61
+ "architecture": "Ampere"
62
+ },
63
+ {
64
+ "name": "NVIDIA A800-SXM4-80GB",
65
+ "memoryTotal": "85198045184",
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A800-SXM4-80GB",
70
+ "memoryTotal": "85198045184",
71
+ "architecture": "Ampere"
72
+ },
73
+ {
74
+ "name": "NVIDIA A800-SXM4-80GB",
75
+ "memoryTotal": "85198045184",
76
+ "architecture": "Ampere"
77
+ },
78
+ {
79
+ "name": "NVIDIA A800-SXM4-80GB",
80
+ "memoryTotal": "85198045184",
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA A800-SXM4-80GB",
85
+ "memoryTotal": "85198045184",
86
+ "architecture": "Ampere"
87
+ },
88
+ {
89
+ "name": "NVIDIA A800-SXM4-80GB",
90
+ "memoryTotal": "85198045184",
91
+ "architecture": "Ampere"
92
+ },
93
+ {
94
+ "name": "NVIDIA A800-SXM4-80GB",
95
+ "memoryTotal": "85198045184",
96
+ "architecture": "Ampere"
97
+ }
98
+ ],
99
+ "cudaVersion": "12.1"
100
+ }
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":349}}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T21:03:20.284878435+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-core.log"}
2
+ {"time":"2025-06-28T21:03:21.431223505+08:00","level":"INFO","msg":"created new stream","id":"rrhzb5iq"}
3
+ {"time":"2025-06-28T21:03:21.431273053+08:00","level":"INFO","msg":"stream: started","id":"rrhzb5iq"}
4
+ {"time":"2025-06-28T21:03:21.431320218+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"rrhzb5iq"}
5
+ {"time":"2025-06-28T21:03:21.431348318+08:00","level":"INFO","msg":"handler: started","stream_id":"rrhzb5iq"}
6
+ {"time":"2025-06-28T21:03:21.431364756+08:00","level":"INFO","msg":"sender: started","stream_id":"rrhzb5iq"}
7
+ {"time":"2025-06-28T21:03:22.981144572+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-28T21:09:09.647087317+08:00","level":"INFO","msg":"stream: closing","id":"rrhzb5iq"}
9
+ {"time":"2025-06-28T21:09:09.647148642+08:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2025-06-28T21:09:09.648142179+08:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2025-06-28T21:09:11.22982496+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2025-06-28T21:09:12.511227827+08:00","level":"INFO","msg":"handler: closed","stream_id":"rrhzb5iq"}
13
+ {"time":"2025-06-28T21:09:12.51128659+08:00","level":"INFO","msg":"sender: closed","stream_id":"rrhzb5iq"}
14
+ {"time":"2025-06-28T21:09:12.511284269+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"rrhzb5iq"}
15
+ {"time":"2025-06-28T21:09:12.519090125+08:00","level":"INFO","msg":"stream: closed","id":"rrhzb5iq"}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Configure stats pid to 234012
3
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug.log
7
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/logs/debug-internal.log
8
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 21:03:20,274 INFO MainThread:234012 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 21:03:20,276 INFO MainThread:234012 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 21:03:20,280 INFO MainThread:234012 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 21:03:20,281 INFO MainThread:234012 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 21:03:20,281 INFO MainThread:234012 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 21:03:22,930 INFO MainThread:234012 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-28 21:03:23,114 INFO MainThread:234012 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-28 21:03:23,114 INFO MainThread:234012 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-28 21:03:23,118 INFO MainThread:234012 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-28 21:03:23,118 INFO MainThread:234012 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-28 21:03:23,120 INFO MainThread:234012 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-28 21:09:09,645 INFO MsgRouterThr:234012 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_210320-rrhzb5iq/run-rrhzb5iq.wandb ADDED
Binary file (95.4 kB). View file
 
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/output.log ADDED
File without changes
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ marisa-trie==1.2.1
2
+ pydantic==2.11.5
3
+ mdurl==0.1.2
4
+ gitdb==4.0.12
5
+ scikit-image==0.25.2
6
+ async-timeout==5.0.1
7
+ blis==1.3.0
8
+ urllib3==2.4.0
9
+ spacy==3.8.7
10
+ nvidia-ml-py==12.575.51
11
+ braceexpand==0.1.7
12
+ nvidia-cufft-cu12==11.2.1.3
13
+ rich==14.0.0
14
+ setuptools==78.1.1
15
+ matplotlib==3.10.3
16
+ catalogue==2.0.10
17
+ decord==0.6.0
18
+ numpy==2.2.6
19
+ charset-normalizer==3.4.2
20
+ langcodes==3.5.0
21
+ pexpect==4.9.0
22
+ nltk==3.9.1
23
+ cachetools==5.5.2
24
+ cfgv==3.4.0
25
+ prompt_toolkit==3.0.51
26
+ srsly==2.5.1
27
+ einops==0.8.1
28
+ Jinja2==3.1.6
29
+ cloudpathlib==0.21.1
30
+ streamlit==1.45.1
31
+ pydantic_core==2.33.2
32
+ tornado==6.5.1
33
+ nvidia-curand-cu12==10.3.5.147
34
+ deepspeed==0.16.10+b666844f
35
+ networkx==3.4.2
36
+ omegaconf==2.3.0
37
+ msgpack==1.1.0
38
+ pandas==2.2.3
39
+ rouge_score==0.1.2
40
+ six==1.17.0
41
+ language_data==1.3.0
42
+ referencing==0.36.2
43
+ rpds-py==0.25.1
44
+ lazy_loader==0.4
45
+ pydeck==0.9.1
46
+ markdown-it-py==3.0.0
47
+ fonttools==4.58.0
48
+ nvidia-cuda-runtime-cu12==12.4.127
49
+ smart-open==7.1.0
50
+ identify==2.6.12
51
+ pure_eval==0.2.3
52
+ confection==0.1.5
53
+ nvidia-cublas-cu12==12.4.5.8
54
+ nvidia-cusparselt-cu12==0.6.2
55
+ decorator==5.2.1
56
+ nvidia-nccl-cu12==2.21.5
57
+ pytz==2025.2
58
+ nvidia-cudnn-cu12==9.1.0.70
59
+ plotly==6.1.1
60
+ safetensors==0.5.3
61
+ portalocker==3.1.1
62
+ toml==0.10.2
63
+ triton==3.2.0
64
+ cycler==0.12.1
65
+ torch==2.6.0
66
+ python-magic==0.4.27
67
+ ptyprocess==0.7.0
68
+ regex==2024.11.6
69
+ absl-py==2.2.2
70
+ psutil==7.0.0
71
+ murmurhash==1.0.13
72
+ wrapt==1.17.2
73
+ pycocoevalcap==1.2
74
+ python-slugify==8.0.4
75
+ stack-data==0.6.3
76
+ python-dateutil==2.9.0.post0
77
+ scipy==1.15.3
78
+ annotated-types==0.7.0
79
+ mpmath==1.3.0
80
+ ipython==8.36.0
81
+ pyparsing==3.2.3
82
+ nvidia-nvtx-cu12==12.4.127
83
+ fairscale==0.4.4
84
+ jsonschema-specifications==2025.4.1
85
+ matplotlib-inline==0.1.7
86
+ watchdog==6.0.0
87
+ thinc==8.3.6
88
+ antlr4-python3-runtime==4.9.3
89
+ webencodings==0.5.1
90
+ hjson==3.1.0
91
+ propcache==0.3.1
92
+ virtualenv==20.31.2
93
+ pytorch-lightning==2.5.1.post0
94
+ Pygments==2.19.1
95
+ pillow==11.2.1
96
+ joblib==1.5.1
97
+ tqdm==4.67.1
98
+ timm==0.4.12
99
+ nvidia-nvjitlink-cu12==12.4.127
100
+ aiosignal==1.3.2
101
+ kaggle==1.7.4.5
102
+ idna==3.10
103
+ pycocotools==2.0.8
104
+ MarkupSafe==3.0.2
105
+ traitlets==5.14.3
106
+ multidict==6.4.4
107
+ distlib==0.3.9
108
+ torchmetrics==1.7.1
109
+ pyarrow==20.0.0
110
+ tzdata==2025.2
111
+ platformdirs==4.3.8
112
+ yarl==1.20.0
113
+ tenacity==9.1.2
114
+ altair==5.5.0
115
+ wasabi==1.1.3
116
+ attrs==25.3.0
117
+ contourpy==1.3.2
118
+ kiwisolver==1.4.8
119
+ PyYAML==6.0.2
120
+ exceptiongroup==1.3.0
121
+ jedi==0.19.2
122
+ sentencepiece==0.2.0
123
+ nvidia-cusolver-cu12==11.6.1.9
124
+ requests==2.32.3
125
+ opendatasets==0.1.22
126
+ GitPython==3.1.44
127
+ bleach==6.2.0
128
+ protobuf==6.31.0
129
+ sympy==1.13.1
130
+ filelock==3.18.0
131
+ pre_commit==4.2.0
132
+ text-unidecode==1.3
133
+ wheel==0.45.1
134
+ contexttimer==0.3.3
135
+ wcwidth==0.2.13
136
+ spacy-legacy==3.0.12
137
+ aiohappyeyeballs==2.6.1
138
+ imageio==2.37.0
139
+ nodeenv==1.9.1
140
+ py-cpuinfo==9.0.0
141
+ hf-xet==1.1.2
142
+ nvidia-cuda-cupti-cu12==12.4.127
143
+ weasel==0.4.1
144
+ certifi==2025.4.26
145
+ lightning-utilities==0.14.3
146
+ typing_extensions==4.13.2
147
+ typing-inspection==0.4.1
148
+ webdataset==0.2.111
149
+ nvidia-cusparse-cu12==12.3.1.170
150
+ asttokens==3.0.0
151
+ nvidia-cufile-cu12==1.11.1.6
152
+ opencv-python-headless==4.5.5.64
153
+ smmap==5.0.2
154
+ tifffile==2025.5.10
155
+ iopath==0.1.10
156
+ packaging==24.2
157
+ cymem==2.0.11
158
+ spacy-loggers==1.0.5
159
+ ninja==1.11.1.4
160
+ ftfy==6.3.1
161
+ executing==2.2.0
162
+ nvidia-cuda-nvrtc-cu12==12.4.127
163
+ blinker==1.9.0
164
+ torchvision==0.21.0
165
+ parso==0.8.4
166
+ salesforce-lavis==1.0.2
167
+ frozenlist==1.6.0
168
+ shellingham==1.5.4
169
+ flash-attn==2.7.1.post1
170
+ pycparser==2.22
171
+ threadpoolctl==3.6.0
172
+ opencv-python==4.11.0.86
173
+ fsspec==2025.3.0
174
+ aiohttp==3.12.2
175
+ narwhals==1.41.0
176
+ opendelta==0.3.2
177
+ pycryptodome==3.23.0
178
+ crcmod==1.7
179
+ delta-center-client==0.0.4
180
+ tokenizers==0.21.1
181
+ aliyun-python-sdk-kms==2.16.5
182
+ more-itertools==10.7.0
183
+ yacs==0.1.8
184
+ bigmodelvis==0.0.1
185
+ jmespath==0.10.0
186
+ docker-pycreds==0.4.0
187
+ web.py==0.62
188
+ scikit-learn==1.6.1
189
+ pip==25.1.1
190
+ cheroot==10.0.1
191
+ setproctitle==1.3.6
192
+ huggingface-hub==0.32.1
193
+ oss2==2.15.0
194
+ cryptography==45.0.3
195
+ typer==0.16.0
196
+ xxhash==3.5.0
197
+ jsonschema==4.24.0
198
+ click==8.2.1
199
+ preshed==3.0.10
200
+ sentry-sdk==2.29.1
201
+ wandb==0.19.11
202
+ dill==0.3.8
203
+ aliyun-python-sdk-core==2.16.0
204
+ transformers==4.52.3
205
+ cffi==1.17.1
206
+ pathlib==1.0.1
207
+ jaraco.functools==4.1.0
208
+ datasets==3.6.0
209
+ multiprocess==0.70.16
210
+ backports.tarfile==1.2.0
211
+ tomli==2.0.1
212
+ autocommand==2.2.2
213
+ zipp==3.19.2
214
+ jaraco.text==3.12.1
215
+ jaraco.collections==5.1.0
216
+ platformdirs==4.2.2
217
+ typeguard==4.3.0
218
+ typing_extensions==4.12.2
219
+ jaraco.functools==4.0.1
220
+ inflect==7.3.1
221
+ wheel==0.45.1
222
+ more-itertools==10.3.0
223
+ importlib_metadata==8.0.0
224
+ jaraco.context==5.3.0
225
+ packaging==24.2
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/files/wandb-metadata.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T13:10:12.808868Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_ckpt",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "64",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger",
30
+ "--strategy",
31
+ "ddp"
32
+ ],
33
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
34
+ "codePath": "stage1.py",
35
+ "email": "gia0603yucca@gmail.com",
36
+ "root": "./all_checkpoints/stage1_ckpt/",
37
+ "host": "dsw-265304-558499d597-hhhs7",
38
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
39
+ "codePathLocal": "stage1.py",
40
+ "cpu_count": 64,
41
+ "cpu_count_logical": 64,
42
+ "gpu": "NVIDIA A800-SXM4-80GB",
43
+ "gpu_count": 8,
44
+ "disk": {
45
+ "/": {
46
+ "total": "1623302262784",
47
+ "used": "11399299072"
48
+ }
49
+ },
50
+ "memory": {
51
+ "total": "549755813888"
52
+ },
53
+ "cpu": {
54
+ "count": 64,
55
+ "countLogical": 64
56
+ },
57
+ "gpu_nvidia": [
58
+ {
59
+ "name": "NVIDIA A800-SXM4-80GB",
60
+ "memoryTotal": "85198045184",
61
+ "architecture": "Ampere"
62
+ },
63
+ {
64
+ "name": "NVIDIA A800-SXM4-80GB",
65
+ "memoryTotal": "85198045184",
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A800-SXM4-80GB",
70
+ "memoryTotal": "85198045184",
71
+ "architecture": "Ampere"
72
+ },
73
+ {
74
+ "name": "NVIDIA A800-SXM4-80GB",
75
+ "memoryTotal": "85198045184",
76
+ "architecture": "Ampere"
77
+ },
78
+ {
79
+ "name": "NVIDIA A800-SXM4-80GB",
80
+ "memoryTotal": "85198045184",
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA A800-SXM4-80GB",
85
+ "memoryTotal": "85198045184",
86
+ "architecture": "Ampere"
87
+ },
88
+ {
89
+ "name": "NVIDIA A800-SXM4-80GB",
90
+ "memoryTotal": "85198045184",
91
+ "architecture": "Ampere"
92
+ },
93
+ {
94
+ "name": "NVIDIA A800-SXM4-80GB",
95
+ "memoryTotal": "85198045184",
96
+ "architecture": "Ampere"
97
+ }
98
+ ],
99
+ "cudaVersion": "12.1"
100
+ }
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T21:10:12.812155856+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-core.log"}
2
+ {"time":"2025-06-28T21:10:14.053332929+08:00","level":"INFO","msg":"created new stream","id":"qflz8r5n"}
3
+ {"time":"2025-06-28T21:10:14.053381793+08:00","level":"INFO","msg":"stream: started","id":"qflz8r5n"}
4
+ {"time":"2025-06-28T21:10:14.053445291+08:00","level":"INFO","msg":"sender: started","stream_id":"qflz8r5n"}
5
+ {"time":"2025-06-28T21:10:14.053438231+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"qflz8r5n"}
6
+ {"time":"2025-06-28T21:10:14.053511614+08:00","level":"INFO","msg":"handler: started","stream_id":"qflz8r5n"}
7
+ {"time":"2025-06-28T21:10:15.239272048+08:00","level":"INFO","msg":"Starting system monitor"}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Configure stats pid to 407544
3
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug.log
7
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/logs/debug-internal.log
8
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 21:10:12,802 INFO MainThread:407544 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 21:10:12,804 INFO MainThread:407544 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 21:10:12,806 INFO MainThread:407544 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 21:10:12,809 INFO MainThread:407544 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 21:10:12,813 INFO MainThread:407544 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 21:10:15,229 INFO MainThread:407544 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-28 21:10:15,370 INFO MainThread:407544 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-28 21:10:15,370 INFO MainThread:407544 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-28 21:10:15,373 INFO MainThread:407544 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-28 21:10:15,373 INFO MainThread:407544 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-28 21:10:15,375 INFO MainThread:407544 [wandb_init.py:init():1150] run started, returning control to user process
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_211012-qflz8r5n/run-qflz8r5n.wandb ADDED
Binary file (98.3 kB). View file
 
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"time":"2025-06-28T22:19:45.153786975+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-core.log"}
2
+ {"time":"2025-06-28T22:20:15.259922664+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
3
+ {"time":"2025-06-28T22:20:47.297526211+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
4
+ {"time":"2025-06-28T22:21:21.358317867+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
5
+ {"time":"2025-06-28T22:22:00.174480175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Configure stats pid to 2313
3
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug.log
7
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_221945-g3zjvi79/logs/debug-internal.log
8
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 22:19:45,142 INFO MainThread:2313 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 22:19:45,145 INFO MainThread:2313 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 22:19:45,146 INFO MainThread:2313 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 22:19:45,153 INFO MainThread:2313 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 22:19:45,155 INFO MainThread:2313 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 22:20:16,340 INFO Thread-3 (wrapped_target):2313 [retry.py:__call__():175] [no run ID] Retry attempt failed:
18
+ Traceback (most recent call last):
19
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 198, in _new_conn
20
+ sock = connection.create_connection(
21
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
22
+ raise err
23
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
24
+ sock.connect(sa)
25
+ TimeoutError: timed out
26
+
27
+ The above exception was the direct cause of the following exception:
28
+
29
+ Traceback (most recent call last):
30
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
31
+ response = self._make_request(
32
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request
33
+ raise new_e
34
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request
35
+ self._validate_conn(conn)
36
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
37
+ conn.connect()
38
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 704, in connect
39
+ self.sock = sock = self._new_conn()
40
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connection.py", line 207, in _new_conn
41
+ raise ConnectTimeoutError(
42
+ urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f7346272530>, 'Connection to api.wandb.ai timed out. (connect timeout=20)')
43
+
44
+ The above exception was the direct cause of the following exception:
45
+
46
+ Traceback (most recent call last):
47
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
48
+ resp = conn.urlopen(
49
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen
50
+ retries = retries.increment(
51
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
52
+ raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
53
+ urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f7346272530>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
54
+
55
+ During handling of the above exception, another exception occurred:
56
+
57
+ Traceback (most recent call last):
58
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/retry.py", line 134, in __call__
59
+ result = self._call_fn(*args, **kwargs)
60
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/internal/internal_api.py", line 398, in execute
61
+ return self.client.execute(*args, **kwargs) # type: ignore
62
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 52, in execute
63
+ result = self._get_result(document, *args, **kwargs)
64
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/vendor/gql-0.2.0/wandb_gql/client.py", line 60, in _get_result
65
+ return self.transport.execute(document, *args, **kwargs)
66
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/gql_request.py", line 58, in execute
67
+ request = self.session.post(self.url, **post_args)
68
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 637, in post
69
+ return self.request("POST", url, data=data, json=json, **kwargs)
70
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
71
+ resp = self.send(prep, **send_kwargs)
72
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
73
+ r = adapter.send(request, **kwargs)
74
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/requests/adapters.py", line 688, in send
75
+ raise ConnectTimeout(e, request=request)
76
+ requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.wandb.ai', port=443): Max retries exceeded with url: /graphql (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f7346272530>, 'Connection to api.wandb.ai timed out. (connect timeout=20)'))
77
+ 2025-06-28 22:20:20,942 WARNING MainThread:2313 [wandb_init.py:init():1681] [no run ID] interrupted
78
+ Traceback (most recent call last):
79
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1677, in init
80
+ return wi.init(run_settings, run_config, run_printer)
81
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1055, in init
82
+ result = wait_with_progress(
83
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
84
+ return wait_all_with_progress(
85
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
86
+ return asyncio_compat.run(progress_loop_with_timeout)
87
+ File "/root/miniconda3/envs/protT3/lib/python3.10/site-packages/wandb/sdk/lib/asyncio_compat.py", line 30, in run
88
+ return future.result()
89
+ File "/root/miniconda3/envs/protT3/lib/python3.10/concurrent/futures/_base.py", line 440, in result
90
+ self._condition.wait(timeout)
91
+ File "/root/miniconda3/envs/protT3/lib/python3.10/threading.py", line 320, in wait
92
+ waiter.acquire()
93
+ KeyboardInterrupt
94
+ 2025-06-28 22:20:21,092 INFO MsgRouterThr:2313 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/config.yaml ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": onto_test_fullset_p2t_rec20
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": trainer/global_step
12
+ "6":
13
+ - 3
14
+ "7": []
15
+ - "1": loader2/val_loss_lm/dataloader_idx_2
16
+ "5": 2
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": epoch
22
+ "5": 2
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": lr
28
+ "5": 2
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": swiss_test_rerank_fullset_p2t_rec20
34
+ "5": 2
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": swiss_test_inbatch_t2p_acc
40
+ "5": 2
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": onto_test_inbatch_p2t_acc
46
+ "5": 2
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": swiss_test_rerank_inbatch_t2p_acc
52
+ "5": 2
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": swiss_test_rerank_inbatch_p2t_rec20
58
+ "5": 2
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loader2/val_loss/dataloader_idx_2
64
+ "5": 2
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": loader1/val_loss_ptc/dataloader_idx_1
70
+ "5": 2
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": loader0/val_loss_lm/dataloader_idx_0
76
+ "5": 2
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ - "1": loader1/val_loss_lm/dataloader_idx_1
82
+ "5": 2
83
+ "6":
84
+ - 1
85
+ - 3
86
+ "7": []
87
+ - "1": swiss_test_rerank_fullset_t2p_acc
88
+ "5": 2
89
+ "6":
90
+ - 1
91
+ - 3
92
+ "7": []
93
+ - "1": swiss_test_rerank_fullset_t2p_rec20
94
+ "5": 2
95
+ "6":
96
+ - 1
97
+ - 3
98
+ "7": []
99
+ - "1": onto_test_rerank_inbatch_p2t_rec20
100
+ "5": 2
101
+ "6":
102
+ - 1
103
+ - 3
104
+ "7": []
105
+ - "1": swiss_test_fullset_t2p_rec20
106
+ "5": 2
107
+ "6":
108
+ - 1
109
+ - 3
110
+ "7": []
111
+ - "1": onto_test_inbatch_p2t_rec20
112
+ "5": 2
113
+ "6":
114
+ - 1
115
+ - 3
116
+ "7": []
117
+ - "1": onto_test_fullset_t2p_acc
118
+ "5": 2
119
+ "6":
120
+ - 1
121
+ - 3
122
+ "7": []
123
+ - "1": onto_test_rerank_inbatch_t2p_acc
124
+ "5": 2
125
+ "6":
126
+ - 1
127
+ - 3
128
+ "7": []
129
+ - "1": loader2/val_loss_ptc/dataloader_idx_2
130
+ "5": 2
131
+ "6":
132
+ - 1
133
+ - 3
134
+ "7": []
135
+ - "1": swiss_test_rerank_inbatch_p2t_acc
136
+ "5": 2
137
+ "6":
138
+ - 1
139
+ - 3
140
+ "7": []
141
+ - "1": onto_test_fullset_p2t_acc
142
+ "5": 2
143
+ "6":
144
+ - 1
145
+ - 3
146
+ "7": []
147
+ - "1": train_loss_ptm
148
+ "5": 2
149
+ "6":
150
+ - 1
151
+ - 3
152
+ "7": []
153
+ - "1": onto_test_rerank_inbatch_t2p_rec20
154
+ "5": 2
155
+ "6":
156
+ - 1
157
+ - 3
158
+ "7": []
159
+ - "1": swiss_test_inbatch_p2t_rec20
160
+ "5": 2
161
+ "6":
162
+ - 1
163
+ - 3
164
+ "7": []
165
+ - "1": onto_test_rerank_fullset_t2p_rec20
166
+ "5": 2
167
+ "6":
168
+ - 1
169
+ - 3
170
+ "7": []
171
+ - "1": loader0/val_loss_ptm/dataloader_idx_0
172
+ "5": 2
173
+ "6":
174
+ - 1
175
+ - 3
176
+ "7": []
177
+ - "1": loader0/val_loss_ptc/dataloader_idx_0
178
+ "5": 2
179
+ "6":
180
+ - 1
181
+ - 3
182
+ "7": []
183
+ - "1": train_loss_ptc
184
+ "5": 2
185
+ "6":
186
+ - 1
187
+ - 3
188
+ "7": []
189
+ - "1": swiss_test_rerank_fullset_p2t_acc
190
+ "5": 2
191
+ "6":
192
+ - 1
193
+ - 3
194
+ "7": []
195
+ - "1": onto_test_rerank_fullset_p2t_rec20
196
+ "5": 2
197
+ "6":
198
+ - 1
199
+ - 3
200
+ "7": []
201
+ - "1": onto_test_rerank_fullset_p2t_acc
202
+ "5": 2
203
+ "6":
204
+ - 1
205
+ - 3
206
+ "7": []
207
+ - "1": onto_test_rerank_inbatch_p2t_acc
208
+ "5": 2
209
+ "6":
210
+ - 1
211
+ - 3
212
+ "7": []
213
+ - "1": swiss_test_inbatch_p2t_acc
214
+ "5": 2
215
+ "6":
216
+ - 1
217
+ - 3
218
+ "7": []
219
+ - "1": loader1/val_loss_ptm/dataloader_idx_1
220
+ "5": 2
221
+ "6":
222
+ - 1
223
+ - 3
224
+ "7": []
225
+ - "1": loader0/val_loss/dataloader_idx_0
226
+ "5": 2
227
+ "6":
228
+ - 1
229
+ - 3
230
+ "7": []
231
+ - "1": train_loss
232
+ "5": 2
233
+ "6":
234
+ - 1
235
+ - 3
236
+ "7": []
237
+ - "1": train_loss_lm
238
+ "5": 2
239
+ "6":
240
+ - 1
241
+ - 3
242
+ "7": []
243
+ - "1": onto_test_rerank_fullset_t2p_acc
244
+ "5": 2
245
+ "6":
246
+ - 1
247
+ - 3
248
+ "7": []
249
+ - "1": onto_test_fullset_t2p_rec20
250
+ "5": 2
251
+ "6":
252
+ - 1
253
+ - 3
254
+ "7": []
255
+ - "1": swiss_test_inbatch_t2p_rec20
256
+ "5": 2
257
+ "6":
258
+ - 1
259
+ - 3
260
+ "7": []
261
+ - "1": swiss_test_rerank_inbatch_t2p_rec20
262
+ "5": 2
263
+ "6":
264
+ - 1
265
+ - 3
266
+ "7": []
267
+ - "1": loader1/val_loss/dataloader_idx_1
268
+ "5": 2
269
+ "6":
270
+ - 1
271
+ - 3
272
+ "7": []
273
+ - "1": loader2/val_loss_ptm/dataloader_idx_2
274
+ "5": 2
275
+ "6":
276
+ - 1
277
+ - 3
278
+ "7": []
279
+ - "1": onto_test_inbatch_t2p_rec20
280
+ "5": 2
281
+ "6":
282
+ - 1
283
+ - 3
284
+ "7": []
285
+ - "1": swiss_test_fullset_p2t_acc
286
+ "5": 2
287
+ "6":
288
+ - 1
289
+ - 3
290
+ "7": []
291
+ - "1": swiss_test_fullset_t2p_acc
292
+ "5": 2
293
+ "6":
294
+ - 1
295
+ - 3
296
+ "7": []
297
+ - "1": swiss_test_fullset_p2t_rec20
298
+ "5": 2
299
+ "6":
300
+ - 1
301
+ - 3
302
+ "7": []
303
+ - "1": onto_test_inbatch_t2p_acc
304
+ "5": 2
305
+ "6":
306
+ - 1
307
+ - 3
308
+ "7": []
309
+ python_version: 3.10.0
310
+ t:
311
+ "1":
312
+ - 1
313
+ - 5
314
+ - 9
315
+ - 11
316
+ - 33
317
+ - 41
318
+ - 49
319
+ - 53
320
+ - 55
321
+ - 63
322
+ - 103
323
+ "2":
324
+ - 1
325
+ - 5
326
+ - 9
327
+ - 11
328
+ - 33
329
+ - 41
330
+ - 49
331
+ - 53
332
+ - 55
333
+ - 63
334
+ - 103
335
+ "3":
336
+ - 7
337
+ - 23
338
+ - 33
339
+ - 55
340
+ - 66
341
+ "4": 3.10.0
342
+ "5": 0.19.11
343
+ "6": 4.52.3
344
+ "8":
345
+ - 5
346
+ "12": 0.19.11
347
+ "13": linux-x86_64
348
+ accelerator:
349
+ value: gpu
350
+ batch_size:
351
+ value: 96
352
+ bert_hidden_dim:
353
+ value: 768
354
+ bert_name:
355
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
356
+ check_val_every_n_epoch:
357
+ value: 1
358
+ cross_attention_freq:
359
+ value: 2
360
+ devices:
361
+ value: 0,1,2,3,4,5,6,7
362
+ filename:
363
+ value: stage1_ckpt
364
+ init_checkpoint:
365
+ value: ""
366
+ init_lr:
367
+ value: 0.0001
368
+ lm:
369
+ value: true
370
+ load_4bit:
371
+ value: false
372
+ lr_decay_rate:
373
+ value: 0.9
374
+ match_batch_size:
375
+ value: 64
376
+ max_epochs:
377
+ value: 20
378
+ min_lr:
379
+ value: 1e-05
380
+ mix_dataset:
381
+ value: true
382
+ mode:
383
+ value: train
384
+ num_query_token:
385
+ value: 8
386
+ num_workers:
387
+ value: 8
388
+ plm_name:
389
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
390
+ plm_tune:
391
+ value: freeze
392
+ pool_size:
393
+ value: 0
394
+ precision:
395
+ value: bf16-mixed
396
+ projection_dim:
397
+ value: 256
398
+ prot_aug:
399
+ value: None
400
+ prot_max_len:
401
+ value: 1024
402
+ ptm:
403
+ value: true
404
+ rerank_cand_num:
405
+ value: 128
406
+ retrieval_eval_epoch:
407
+ value: 10
408
+ root:
409
+ value: data_small
410
+ save_every_n_epochs:
411
+ value: 5
412
+ scheduler:
413
+ value: linear_warmup_cosine_lr
414
+ seed:
415
+ value: 42
416
+ strategy:
417
+ value: ddp
418
+ temperature:
419
+ value: 0.1
420
+ text_max_len:
421
+ value: 128
422
+ use_wandb_logger:
423
+ value: true
424
+ warmup_lr:
425
+ value: 1e-06
426
+ warmup_steps:
427
+ value: 1000
428
+ weight_decay:
429
+ value: 0.05
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/output.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ Detected KeyboardInterrupt, attempting graceful shutdown ...
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ stack-data==0.6.3
2
+ yarl==1.20.0
3
+ setuptools==78.1.1
4
+ cloudpathlib==0.21.1
5
+ pytz==2025.2
6
+ nvidia-cufile-cu12==1.11.1.6
7
+ shellingham==1.5.4
8
+ nvidia-cusolver-cu12==11.6.1.9
9
+ Jinja2==3.1.6
10
+ pycocotools==2.0.8
11
+ pandas==2.2.3
12
+ scipy==1.15.3
13
+ tenacity==9.1.2
14
+ lightning-utilities==0.14.3
15
+ cfgv==3.4.0
16
+ hf-xet==1.1.2
17
+ platformdirs==4.3.8
18
+ smart-open==7.1.0
19
+ text-unidecode==1.3
20
+ nvidia-cublas-cu12==12.4.5.8
21
+ omegaconf==2.3.0
22
+ referencing==0.36.2
23
+ mdurl==0.1.2
24
+ gitdb==4.0.12
25
+ identify==2.6.12
26
+ ipython==8.36.0
27
+ spacy-loggers==1.0.5
28
+ distlib==0.3.9
29
+ typing-inspection==0.4.1
30
+ antlr4-python3-runtime==4.9.3
31
+ multidict==6.4.4
32
+ nvidia-curand-cu12==10.3.5.147
33
+ prompt_toolkit==3.0.51
34
+ Pygments==2.19.1
35
+ numpy==2.2.6
36
+ decord==0.6.0
37
+ srsly==2.5.1
38
+ watchdog==6.0.0
39
+ pure_eval==0.2.3
40
+ virtualenv==20.31.2
41
+ altair==5.5.0
42
+ matplotlib-inline==0.1.7
43
+ bleach==6.2.0
44
+ exceptiongroup==1.3.0
45
+ fairscale==0.4.4
46
+ confection==0.1.5
47
+ fonttools==4.58.0
48
+ nvidia-cuda-nvrtc-cu12==12.4.127
49
+ ptyprocess==0.7.0
50
+ pytorch-lightning==2.5.1.post0
51
+ nodeenv==1.9.1
52
+ nvidia-cudnn-cu12==9.1.0.70
53
+ requests==2.32.3
54
+ marisa-trie==1.2.1
55
+ cachetools==5.5.2
56
+ matplotlib==3.10.3
57
+ typing_extensions==4.13.2
58
+ asttokens==3.0.0
59
+ torch==2.6.0
60
+ PyYAML==6.0.2
61
+ tifffile==2025.5.10
62
+ spacy==3.8.7
63
+ braceexpand==0.1.7
64
+ plotly==6.1.1
65
+ attrs==25.3.0
66
+ py-cpuinfo==9.0.0
67
+ frozenlist==1.6.0
68
+ catalogue==2.0.10
69
+ nvidia-cusparselt-cu12==0.6.2
70
+ traitlets==5.14.3
71
+ annotated-types==0.7.0
72
+ language_data==1.3.0
73
+ thinc==8.3.6
74
+ imageio==2.37.0
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ certifi==2025.4.26
77
+ smmap==5.0.2
78
+ python-magic==0.4.27
79
+ triton==3.2.0
80
+ weasel==0.4.1
81
+ async-timeout==5.0.1
82
+ wcwidth==0.2.13
83
+ pillow==11.2.1
84
+ torchmetrics==1.7.1
85
+ kaggle==1.7.4.5
86
+ regex==2024.11.6
87
+ aiosignal==1.3.2
88
+ nvidia-cusparse-cu12==12.3.1.170
89
+ scikit-image==0.25.2
90
+ nvidia-nvtx-cu12==12.4.127
91
+ opendatasets==0.1.22
92
+ iopath==0.1.10
93
+ pyparsing==3.2.3
94
+ portalocker==3.1.1
95
+ executing==2.2.0
96
+ contexttimer==0.3.3
97
+ lazy_loader==0.4
98
+ wrapt==1.17.2
99
+ webdataset==0.2.111
100
+ blis==1.3.0
101
+ idna==3.10
102
+ timm==0.4.12
103
+ einops==0.8.1
104
+ packaging==24.2
105
+ decorator==5.2.1
106
+ filelock==3.18.0
107
+ python-slugify==8.0.4
108
+ cycler==0.12.1
109
+ charset-normalizer==3.4.2
110
+ pydantic==2.11.5
111
+ pydeck==0.9.1
112
+ tzdata==2025.2
113
+ jedi==0.19.2
114
+ aiohappyeyeballs==2.6.1
115
+ nvidia-nvjitlink-cu12==12.4.127
116
+ salesforce-lavis==1.0.2
117
+ parso==0.8.4
118
+ nvidia-nccl-cu12==2.21.5
119
+ toml==0.10.2
120
+ python-dateutil==2.9.0.post0
121
+ rich==14.0.0
122
+ tqdm==4.67.1
123
+ rpds-py==0.25.1
124
+ opencv-python-headless==4.5.5.64
125
+ tornado==6.5.1
126
+ propcache==0.3.1
127
+ webencodings==0.5.1
128
+ murmurhash==1.0.13
129
+ contourpy==1.3.2
130
+ joblib==1.5.1
131
+ networkx==3.4.2
132
+ six==1.17.0
133
+ markdown-it-py==3.0.0
134
+ nvidia-cuda-cupti-cu12==12.4.127
135
+ msgpack==1.1.0
136
+ sentencepiece==0.2.0
137
+ cymem==2.0.11
138
+ nvidia-cufft-cu12==11.2.1.3
139
+ absl-py==2.2.2
140
+ hjson==3.1.0
141
+ mpmath==1.3.0
142
+ pydantic_core==2.33.2
143
+ psutil==7.0.0
144
+ nvidia-ml-py==12.575.51
145
+ pyarrow==20.0.0
146
+ kiwisolver==1.4.8
147
+ sympy==1.13.1
148
+ ninja==1.11.1.4
149
+ rouge_score==0.1.2
150
+ deepspeed==0.16.10+b666844f
151
+ spacy-legacy==3.0.12
152
+ pycocoevalcap==1.2
153
+ pexpect==4.9.0
154
+ ftfy==6.3.1
155
+ protobuf==6.31.0
156
+ urllib3==2.4.0
157
+ wheel==0.45.1
158
+ nltk==3.9.1
159
+ streamlit==1.45.1
160
+ wasabi==1.1.3
161
+ pre_commit==4.2.0
162
+ safetensors==0.5.3
163
+ jsonschema-specifications==2025.4.1
164
+ langcodes==3.5.0
165
+ GitPython==3.1.44
166
+ blinker==1.9.0
167
+ torchvision==0.21.0
168
+ MarkupSafe==3.0.2
169
+ dill==0.3.8
170
+ yacs==0.1.8
171
+ pathlib==1.0.1
172
+ scikit-learn==1.6.1
173
+ cffi==1.17.1
174
+ pycparser==2.22
175
+ flash-attn==2.7.1.post1
176
+ cryptography==45.0.3
177
+ pycryptodome==3.23.0
178
+ cheroot==10.0.1
179
+ more-itertools==10.7.0
180
+ setproctitle==1.3.6
181
+ delta-center-client==0.0.4
182
+ jmespath==0.10.0
183
+ xxhash==3.5.0
184
+ pip==25.1.1
185
+ aliyun-python-sdk-core==2.16.0
186
+ jaraco.functools==4.1.0
187
+ bigmodelvis==0.0.1
188
+ aiohttp==3.12.2
189
+ multiprocess==0.70.16
190
+ opendelta==0.3.2
191
+ docker-pycreds==0.4.0
192
+ threadpoolctl==3.6.0
193
+ click==8.2.1
194
+ oss2==2.15.0
195
+ crcmod==1.7
196
+ transformers==4.52.3
197
+ datasets==3.6.0
198
+ jsonschema==4.24.0
199
+ opencv-python==4.11.0.86
200
+ wandb==0.19.11
201
+ fsspec==2025.3.0
202
+ tokenizers==0.21.1
203
+ sentry-sdk==2.29.1
204
+ preshed==3.0.10
205
+ aliyun-python-sdk-kms==2.16.5
206
+ huggingface-hub==0.32.1
207
+ typer==0.16.0
208
+ narwhals==1.41.0
209
+ web.py==0.62
210
+ autocommand==2.2.2
211
+ importlib_metadata==8.0.0
212
+ zipp==3.19.2
213
+ jaraco.context==5.3.0
214
+ typeguard==4.3.0
215
+ jaraco.collections==5.1.0
216
+ typing_extensions==4.12.2
217
+ backports.tarfile==1.2.0
218
+ jaraco.functools==4.0.1
219
+ more-itertools==10.3.0
220
+ platformdirs==4.2.2
221
+ packaging==24.2
222
+ tomli==2.0.1
223
+ jaraco.text==3.12.1
224
+ wheel==0.45.1
225
+ inflect==7.3.1
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-metadata.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-06-28T14:23:55.239654Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage1_ckpt",
12
+ "--num_query_token",
13
+ "8",
14
+ "--plm_name",
15
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
16
+ "--bert_name",
17
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
18
+ "--save_every_n_epochs",
19
+ "5",
20
+ "--max_epochs",
21
+ "20",
22
+ "--batch_size",
23
+ "96",
24
+ "--precision",
25
+ "bf16-mixed",
26
+ "--mix_dataset",
27
+ "--num_workers",
28
+ "8",
29
+ "--use_wandb_logger",
30
+ "--strategy",
31
+ "ddp"
32
+ ],
33
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage1.py",
34
+ "codePath": "stage1.py",
35
+ "email": "gia0603yucca@gmail.com",
36
+ "root": "./all_checkpoints/stage1_ckpt/",
37
+ "host": "dsw-265304-57b7b77cbc-vwbwc",
38
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
39
+ "codePathLocal": "stage1.py",
40
+ "cpu_count": 64,
41
+ "cpu_count_logical": 64,
42
+ "gpu": "NVIDIA A800-SXM4-80GB",
43
+ "gpu_count": 8,
44
+ "disk": {
45
+ "/": {
46
+ "total": "1623302262784",
47
+ "used": "1285099520"
48
+ }
49
+ },
50
+ "memory": {
51
+ "total": "549755813888"
52
+ },
53
+ "cpu": {
54
+ "count": 64,
55
+ "countLogical": 64
56
+ },
57
+ "gpu_nvidia": [
58
+ {
59
+ "name": "NVIDIA A800-SXM4-80GB",
60
+ "memoryTotal": "85198045184",
61
+ "architecture": "Ampere"
62
+ },
63
+ {
64
+ "name": "NVIDIA A800-SXM4-80GB",
65
+ "memoryTotal": "85198045184",
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A800-SXM4-80GB",
70
+ "memoryTotal": "85198045184",
71
+ "architecture": "Ampere"
72
+ },
73
+ {
74
+ "name": "NVIDIA A800-SXM4-80GB",
75
+ "memoryTotal": "85198045184",
76
+ "architecture": "Ampere"
77
+ },
78
+ {
79
+ "name": "NVIDIA A800-SXM4-80GB",
80
+ "memoryTotal": "85198045184",
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA A800-SXM4-80GB",
85
+ "memoryTotal": "85198045184",
86
+ "architecture": "Ampere"
87
+ },
88
+ {
89
+ "name": "NVIDIA A800-SXM4-80GB",
90
+ "memoryTotal": "85198045184",
91
+ "architecture": "Ampere"
92
+ },
93
+ {
94
+ "name": "NVIDIA A800-SXM4-80GB",
95
+ "memoryTotal": "85198045184",
96
+ "architecture": "Ampere"
97
+ }
98
+ ],
99
+ "cudaVersion": "12.1"
100
+ }
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"swiss_test_inbatch_t2p_acc":30,"train_loss_ptc":5.0676679611206055,"_step":14,"onto_test_rerank_fullset_t2p_rec20":49.5,"train_loss_ptm":0.6344426870346069,"swiss_test_rerank_fullset_t2p_acc":15,"onto_test_fullset_p2t_rec20":56,"loader1/val_loss/dataloader_idx_1":9.734654426574707,"swiss_test_rerank_fullset_t2p_rec20":85,"swiss_test_inbatch_p2t_acc":25,"train_loss":9.90674877166748,"onto_test_rerank_inbatch_t2p_acc":10.5,"onto_test_inbatch_t2p_acc":11,"onto_test_rerank_inbatch_p2t_rec20":83,"onto_test_rerank_inbatch_t2p_rec20":82.5,"swiss_test_rerank_inbatch_t2p_acc":22,"swiss_test_rerank_inbatch_t2p_rec20":93,"epoch":13,"train_loss_lm":4.204638481140137,"loader1/val_loss_ptm/dataloader_idx_1":0.6397265791893005,"loader2/val_loss/dataloader_idx_2":10.584230422973633,"loader1/val_loss_ptc/dataloader_idx_1":4.00998067855835,"swiss_test_rerank_inbatch_p2t_acc":16,"onto_test_rerank_fullset_p2t_rec20":41.5,"onto_test_fullset_t2p_rec20":50.5,"_wandb":{"runtime":4652},"loader0/val_loss/dataloader_idx_0":7.924057483673096,"onto_test_rerank_fullset_t2p_acc":5,"onto_test_rerank_fullset_p2t_acc":3,"loader0/val_loss_ptc/dataloader_idx_0":2.931434154510498,"swiss_test_fullset_t2p_rec20":85,"onto_test_fullset_p2t_acc":11,"_runtime":4531.685308266,"_timestamp":1.751125166924585e+09,"loader2/val_loss_ptc/dataloader_idx_2":4.096363544464111,"loader1/val_loss_lm/dataloader_idx_1":5.084947109222412,"onto_test_inbatch_p2t_acc":19,"loader0/val_loss_ptm/dataloader_idx_0":0.6371256709098816,"onto_test_rerank_inbatch_p2t_acc":9.5,"loader0/val_loss_lm/dataloader_idx_0":4.355497360229492,"swiss_test_fullset_t2p_acc":18,"swiss_test_inbatch_t2p_rec20":93,"loader2/val_loss_ptm/dataloader_idx_2":0.641204297542572,"swiss_test_fullset_p2t_acc":21,"onto_test_fullset_t2p_acc":4.5,"swiss_test_fullset_p2t_rec20":85,"swiss_test_rerank_fullset_p2t_rec20":73,"swiss_test_rerank_fullset_p2t_acc":7,"lr":6.203955126693472e-05,"onto_test_inbatch_t2p_rec20":83,"swiss_test_rerank_inbatch_p2t_rec20":92,"loader2/val_loss_lm/dataloader_idx_2":5.846662521362305,"swiss_test_inbatch_p2t_rec20":95,"onto_test_inbatch_p2t_rec20":91,"trainer/global_step":69}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-06-28T22:23:55.245053432+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-core.log"}
2
+ {"time":"2025-06-28T22:23:56.442748363+08:00","level":"INFO","msg":"created new stream","id":"e9wtzwz1"}
3
+ {"time":"2025-06-28T22:23:56.442786823+08:00","level":"INFO","msg":"stream: started","id":"e9wtzwz1"}
4
+ {"time":"2025-06-28T22:23:56.44283909+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"e9wtzwz1"}
5
+ {"time":"2025-06-28T22:23:56.44287811+08:00","level":"INFO","msg":"sender: started","stream_id":"e9wtzwz1"}
6
+ {"time":"2025-06-28T22:23:56.442850569+08:00","level":"INFO","msg":"handler: started","stream_id":"e9wtzwz1"}
7
+ {"time":"2025-06-28T22:23:57.657067842+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-06-28T23:13:08.786733475+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:41088->172.67.193.61:443: read: connection timed out"}
9
+ {"time":"2025-06-28T23:16:23.858735046+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:58168->104.21.20.172:443: read: connection timed out"}
10
+ {"time":"2025-06-28T23:20:12.333412842+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:50886->172.67.193.61:443: read: connection reset by peer"}
11
+ {"time":"2025-06-28T23:28:29.895934993+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": unexpected EOF"}
12
+ {"time":"2025-06-28T23:32:39.731699923+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:54496->104.21.20.172:443: read: connection timed out"}
13
+ {"time":"2025-06-28T23:35:17.938724051+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:33406->172.67.193.61:443: read: connection timed out"}
14
+ {"time":"2025-06-28T23:38:54.515701632+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:59930->172.67.193.61:443: read: connection timed out"}
15
+ {"time":"2025-06-28T23:41:28.010949965+08:00","level":"INFO","msg":"stream: closing","id":"e9wtzwz1"}
16
+ {"time":"2025-06-28T23:41:28.011132748+08:00","level":"INFO","msg":"Stopping system monitor"}
17
+ {"time":"2025-06-28T23:41:28.066664522+08:00","level":"INFO","msg":"Stopped system monitor"}
18
+ {"time":"2025-06-28T23:41:37.996261564+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
19
+ {"time":"2025-06-28T23:42:36.21077519+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage1_ckpt/e9wtzwz1/file_stream\": read tcp 10.1.6.17:36806->172.67.193.61:443: read: connection timed out"}
ProtT3/all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Configure stats pid to 3589
3
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug.log
7
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage1_ckpt/wandb/run-20250628_222355-e9wtzwz1/logs/debug-internal.log
8
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():852] calling init triggers
9
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():893] starting backend
12
+ 2025-06-28 22:23:55,235 INFO MainThread:3589 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-06-28 22:23:55,237 INFO MainThread:3589 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-06-28 22:23:55,239 INFO MainThread:3589 [wandb_init.py:init():907] backend started and connected
15
+ 2025-06-28 22:23:55,240 INFO MainThread:3589 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-06-28 22:23:55,249 INFO MainThread:3589 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-06-28 22:23:57,648 INFO MainThread:3589 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-06-28 22:23:57,823 INFO MainThread:3589 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-06-28 22:23:57,827 INFO MainThread:3589 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-06-28 22:23:57,829 INFO MainThread:3589 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-06-28 23:41:27,993 INFO MsgRouterThr:3589 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 2 handles.
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/files/wandb-metadata.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-07-07T11:01:45.766473Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage2.5_mol_instruction",
12
+ "--num_query_token",
13
+ "8",
14
+ "--save_every_n_epochs",
15
+ "1",
16
+ "--max_epochs",
17
+ "10",
18
+ "--batch_size",
19
+ "1",
20
+ "--precision",
21
+ "bf16-mixed",
22
+ "--num_workers",
23
+ "8",
24
+ "--plm_model",
25
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
26
+ "--bert_name",
27
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
28
+ "--llm_name",
29
+ "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300",
30
+ "--llm_tune",
31
+ "mid_lora",
32
+ "--stage1_path",
33
+ "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt",
34
+ "--use_wandb_logger",
35
+ "--text_max_len",
36
+ "1024",
37
+ "--prot_max_len",
38
+ "2048"
39
+ ],
40
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
41
+ "codePath": "stage2.py",
42
+ "email": "gia0603yucca@gmail.com",
43
+ "root": "./all_checkpoints/stage2.5_mol_instruction/",
44
+ "host": "dsw-265304-7f6db6b4bb-g4b9r",
45
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
46
+ "codePathLocal": "stage2.py",
47
+ "cpu_count": 64,
48
+ "cpu_count_logical": 64,
49
+ "gpu": "NVIDIA A800-SXM4-80GB",
50
+ "gpu_count": 8,
51
+ "disk": {
52
+ "/": {
53
+ "total": "1623302262784",
54
+ "used": "1260933120"
55
+ }
56
+ },
57
+ "memory": {
58
+ "total": "549755813888"
59
+ },
60
+ "cpu": {
61
+ "count": 64,
62
+ "countLogical": 64
63
+ },
64
+ "gpu_nvidia": [
65
+ {
66
+ "name": "NVIDIA A800-SXM4-80GB",
67
+ "memoryTotal": "85198045184",
68
+ "architecture": "Ampere"
69
+ },
70
+ {
71
+ "name": "NVIDIA A800-SXM4-80GB",
72
+ "memoryTotal": "85198045184",
73
+ "architecture": "Ampere"
74
+ },
75
+ {
76
+ "name": "NVIDIA A800-SXM4-80GB",
77
+ "memoryTotal": "85198045184",
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A800-SXM4-80GB",
82
+ "memoryTotal": "85198045184",
83
+ "architecture": "Ampere"
84
+ },
85
+ {
86
+ "name": "NVIDIA A800-SXM4-80GB",
87
+ "memoryTotal": "85198045184",
88
+ "architecture": "Ampere"
89
+ },
90
+ {
91
+ "name": "NVIDIA A800-SXM4-80GB",
92
+ "memoryTotal": "85198045184",
93
+ "architecture": "Ampere"
94
+ },
95
+ {
96
+ "name": "NVIDIA A800-SXM4-80GB",
97
+ "memoryTotal": "85198045184",
98
+ "architecture": "Ampere"
99
+ },
100
+ {
101
+ "name": "NVIDIA A800-SXM4-80GB",
102
+ "memoryTotal": "85198045184",
103
+ "architecture": "Ampere"
104
+ }
105
+ ],
106
+ "cudaVersion": "12.1"
107
+ }
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-07T19:01:45.769106543+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-core.log"}
2
+ {"time":"2025-07-07T19:01:46.836668426+08:00","level":"INFO","msg":"created new stream","id":"vu5mgolt"}
3
+ {"time":"2025-07-07T19:01:46.836704412+08:00","level":"INFO","msg":"stream: started","id":"vu5mgolt"}
4
+ {"time":"2025-07-07T19:01:46.836753246+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vu5mgolt"}
5
+ {"time":"2025-07-07T19:01:46.836778842+08:00","level":"INFO","msg":"handler: started","stream_id":"vu5mgolt"}
6
+ {"time":"2025-07-07T19:01:46.836807537+08:00","level":"INFO","msg":"sender: started","stream_id":"vu5mgolt"}
7
+ {"time":"2025-07-07T19:01:48.143178072+08:00","level":"INFO","msg":"Starting system monitor"}
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Configure stats pid to 121602
3
+ 2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-07 19:01:45,758 INFO MainThread:121602 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug.log
7
+ 2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_190145-vu5mgolt/logs/debug-internal.log
8
+ 2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():893] starting backend
12
+ 2025-07-07 19:01:45,759 INFO MainThread:121602 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-07 19:01:45,760 INFO MainThread:121602 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-07 19:01:45,762 INFO MainThread:121602 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-07 19:01:45,767 INFO MainThread:121602 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-07 19:01:45,771 INFO MainThread:121602 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-07 19:01:48,130 INFO MainThread:121602 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-07 19:01:48,276 INFO MainThread:121602 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-07 19:01:48,276 INFO MainThread:121602 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-07 19:01:48,292 INFO MainThread:121602 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-07 19:01:48,292 INFO MainThread:121602 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-07 19:01:48,293 INFO MainThread:121602 [wandb_init.py:init():1150] run started, returning control to user process
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/output.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2.5_mol_instruction exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+ [rank: 5] Child process with PID 126090 terminated with code 1. Forcefully terminating all other processes to avoid zombies 🧟
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pydantic_core==2.33.2
2
+ psutil==7.0.0
3
+ nvidia-cuda-nvrtc-cu12==12.4.127
4
+ mpmath==1.3.0
5
+ tzdata==2025.2
6
+ contexttimer==0.3.3
7
+ cycler==0.12.1
8
+ python-magic==0.4.27
9
+ pexpect==4.9.0
10
+ sympy==1.13.1
11
+ wrapt==1.17.2
12
+ marisa-trie==1.2.1
13
+ langcodes==3.5.0
14
+ nvidia-nvtx-cu12==12.4.127
15
+ ipython==8.36.0
16
+ opencv-python-headless==4.5.5.64
17
+ MarkupSafe==3.0.2
18
+ jsonschema-specifications==2025.4.1
19
+ wasabi==1.1.3
20
+ blinker==1.9.0
21
+ cfgv==3.4.0
22
+ numpy==2.2.6
23
+ idna==3.10
24
+ nvidia-cufile-cu12==1.11.1.6
25
+ ninja==1.11.1.4
26
+ nvidia-nccl-cu12==2.21.5
27
+ networkx==3.4.2
28
+ certifi==2025.4.26
29
+ deepspeed==0.16.10+b666844f
30
+ pure_eval==0.2.3
31
+ packaging==24.2
32
+ nltk==3.9.1
33
+ contourpy==1.3.2
34
+ pre_commit==4.2.0
35
+ nodeenv==1.9.1
36
+ setuptools==78.1.1
37
+ annotated-types==0.7.0
38
+ multidict==6.4.4
39
+ requests==2.32.3
40
+ tornado==6.5.1
41
+ triton==3.2.0
42
+ pillow==11.2.1
43
+ decord==0.6.0
44
+ shellingham==1.5.4
45
+ streamlit==1.45.1
46
+ pydeck==0.9.1
47
+ confection==0.1.5
48
+ exceptiongroup==1.3.0
49
+ prompt_toolkit==3.0.51
50
+ text-unidecode==1.3
51
+ nvidia-cufft-cu12==11.2.1.3
52
+ antlr4-python3-runtime==4.9.3
53
+ fairscale==0.4.4
54
+ rouge_score==0.1.2
55
+ nvidia-cudnn-cu12==9.1.0.70
56
+ tqdm==4.67.1
57
+ rich==14.0.0
58
+ frozenlist==1.6.0
59
+ webencodings==0.5.1
60
+ altair==5.5.0
61
+ opendatasets==0.1.22
62
+ nvidia-curand-cu12==10.3.5.147
63
+ protobuf==6.31.0
64
+ asttokens==3.0.0
65
+ wheel==0.45.1
66
+ hf-xet==1.1.2
67
+ weasel==0.4.1
68
+ aiosignal==1.3.2
69
+ absl-py==2.2.2
70
+ thinc==8.3.6
71
+ torchvision==0.21.0
72
+ pandas==2.2.3
73
+ fonttools==4.58.0
74
+ bleach==6.2.0
75
+ typing-inspection==0.4.1
76
+ ftfy==6.3.1
77
+ typing_extensions==4.13.2
78
+ nvidia-ml-py==12.575.51
79
+ python-slugify==8.0.4
80
+ lightning-utilities==0.14.3
81
+ py-cpuinfo==9.0.0
82
+ smmap==5.0.2
83
+ regex==2024.11.6
84
+ scikit-image==0.25.2
85
+ iopath==0.1.10
86
+ spacy-legacy==3.0.12
87
+ hjson==3.1.0
88
+ executing==2.2.0
89
+ kiwisolver==1.4.8
90
+ scipy==1.15.3
91
+ aiohappyeyeballs==2.6.1
92
+ toml==0.10.2
93
+ jedi==0.19.2
94
+ GitPython==3.1.44
95
+ ptyprocess==0.7.0
96
+ kaggle==1.7.4.5
97
+ braceexpand==0.1.7
98
+ wcwidth==0.2.13
99
+ nvidia-cuda-runtime-cu12==12.4.127
100
+ pytorch-lightning==2.5.1.post0
101
+ Jinja2==3.1.6
102
+ urllib3==2.4.0
103
+ watchdog==6.0.0
104
+ filelock==3.18.0
105
+ propcache==0.3.1
106
+ torch==2.6.0
107
+ nvidia-cusparse-cu12==12.3.1.170
108
+ cymem==2.0.11
109
+ nvidia-cusolver-cu12==11.6.1.9
110
+ murmurhash==1.0.13
111
+ catalogue==2.0.10
112
+ yarl==1.20.0
113
+ charset-normalizer==3.4.2
114
+ gitdb==4.0.12
115
+ matplotlib==3.10.3
116
+ portalocker==3.1.1
117
+ platformdirs==4.3.8
118
+ async-timeout==5.0.1
119
+ parso==0.8.4
120
+ markdown-it-py==3.0.0
121
+ omegaconf==2.3.0
122
+ cloudpathlib==0.21.1
123
+ nvidia-cusparselt-cu12==0.6.2
124
+ spacy-loggers==1.0.5
125
+ srsly==2.5.1
126
+ identify==2.6.12
127
+ rpds-py==0.25.1
128
+ spacy==3.8.7
129
+ matplotlib-inline==0.1.7
130
+ smart-open==7.1.0
131
+ pydantic==2.11.5
132
+ mdurl==0.1.2
133
+ virtualenv==20.31.2
134
+ pytz==2025.2
135
+ pycocotools==2.0.8
136
+ six==1.17.0
137
+ decorator==5.2.1
138
+ referencing==0.36.2
139
+ sentencepiece==0.2.0
140
+ PyYAML==6.0.2
141
+ pycocoevalcap==1.2
142
+ imageio==2.37.0
143
+ distlib==0.3.9
144
+ pyarrow==20.0.0
145
+ tenacity==9.1.2
146
+ language_data==1.3.0
147
+ nvidia-cuda-cupti-cu12==12.4.127
148
+ blis==1.3.0
149
+ Pygments==2.19.1
150
+ tifffile==2025.5.10
151
+ pyparsing==3.2.3
152
+ cachetools==5.5.2
153
+ safetensors==0.5.3
154
+ attrs==25.3.0
155
+ webdataset==0.2.111
156
+ plotly==6.1.1
157
+ nvidia-cublas-cu12==12.4.5.8
158
+ timm==0.4.12
159
+ torchmetrics==1.7.1
160
+ nvidia-nvjitlink-cu12==12.4.127
161
+ stack-data==0.6.3
162
+ python-dateutil==2.9.0.post0
163
+ lazy_loader==0.4
164
+ traitlets==5.14.3
165
+ einops==0.8.1
166
+ salesforce-lavis==1.0.2
167
+ joblib==1.5.1
168
+ msgpack==1.1.0
169
+ tokenizers==0.21.1
170
+ sentry-sdk==2.29.1
171
+ oss2==2.15.0
172
+ setproctitle==1.3.6
173
+ pip==25.1.1
174
+ cffi==1.17.1
175
+ transformers==4.52.3
176
+ narwhals==1.41.0
177
+ aliyun-python-sdk-core==2.16.0
178
+ jsonschema==4.24.0
179
+ flash-attn==2.7.1.post1
180
+ preshed==3.0.10
181
+ multiprocess==0.70.16
182
+ cryptography==45.0.3
183
+ aliyun-python-sdk-kms==2.16.5
184
+ scikit-learn==1.6.1
185
+ huggingface-hub==0.32.1
186
+ crcmod==1.7
187
+ typer==0.16.0
188
+ web.py==0.62
189
+ docker-pycreds==0.4.0
190
+ xxhash==3.5.0
191
+ bigmodelvis==0.0.1
192
+ datasets==3.6.0
193
+ more-itertools==10.7.0
194
+ yacs==0.1.8
195
+ jmespath==0.10.0
196
+ aiohttp==3.12.2
197
+ opencv-python==4.11.0.86
198
+ pycparser==2.22
199
+ threadpoolctl==3.6.0
200
+ jaraco.functools==4.1.0
201
+ click==8.2.1
202
+ wandb==0.19.11
203
+ opendelta==0.3.2
204
+ pycryptodome==3.23.0
205
+ pathlib==1.0.1
206
+ dill==0.3.8
207
+ fsspec==2025.3.0
208
+ delta-center-client==0.0.4
209
+ cheroot==10.0.1
210
+ typing_extensions==4.12.2
211
+ platformdirs==4.2.2
212
+ jaraco.text==3.12.1
213
+ packaging==24.2
214
+ inflect==7.3.1
215
+ jaraco.context==5.3.0
216
+ wheel==0.45.1
217
+ typeguard==4.3.0
218
+ more-itertools==10.3.0
219
+ tomli==2.0.1
220
+ importlib_metadata==8.0.0
221
+ backports.tarfile==1.2.0
222
+ zipp==3.19.2
223
+ jaraco.collections==5.1.0
224
+ autocommand==2.2.2
225
+ jaraco.functools==4.0.1
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/files/wandb-metadata.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-07-07T11:53:53.846220Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage2.5_mol_instruction",
12
+ "--num_query_token",
13
+ "8",
14
+ "--save_every_n_epochs",
15
+ "1",
16
+ "--max_epochs",
17
+ "10",
18
+ "--batch_size",
19
+ "1",
20
+ "--precision",
21
+ "bf16-mixed",
22
+ "--num_workers",
23
+ "8",
24
+ "--plm_model",
25
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
26
+ "--bert_name",
27
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
28
+ "--llm_name",
29
+ "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300",
30
+ "--llm_tune",
31
+ "mid_lora",
32
+ "--stage1_path",
33
+ "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt",
34
+ "--use_wandb_logger",
35
+ "--text_max_len",
36
+ "1024",
37
+ "--prot_max_len",
38
+ "2048"
39
+ ],
40
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
41
+ "codePath": "stage2.py",
42
+ "email": "gia0603yucca@gmail.com",
43
+ "root": "./all_checkpoints/stage2.5_mol_instruction/",
44
+ "host": "dsw-265304-7f6db6b4bb-g4b9r",
45
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
46
+ "codePathLocal": "stage2.py",
47
+ "cpu_count": 64,
48
+ "cpu_count_logical": 64,
49
+ "gpu": "NVIDIA A800-SXM4-80GB",
50
+ "gpu_count": 8,
51
+ "disk": {
52
+ "/": {
53
+ "total": "1623302262784",
54
+ "used": "1260957696"
55
+ }
56
+ },
57
+ "memory": {
58
+ "total": "549755813888"
59
+ },
60
+ "cpu": {
61
+ "count": 64,
62
+ "countLogical": 64
63
+ },
64
+ "gpu_nvidia": [
65
+ {
66
+ "name": "NVIDIA A800-SXM4-80GB",
67
+ "memoryTotal": "85198045184",
68
+ "architecture": "Ampere"
69
+ },
70
+ {
71
+ "name": "NVIDIA A800-SXM4-80GB",
72
+ "memoryTotal": "85198045184",
73
+ "architecture": "Ampere"
74
+ },
75
+ {
76
+ "name": "NVIDIA A800-SXM4-80GB",
77
+ "memoryTotal": "85198045184",
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A800-SXM4-80GB",
82
+ "memoryTotal": "85198045184",
83
+ "architecture": "Ampere"
84
+ },
85
+ {
86
+ "name": "NVIDIA A800-SXM4-80GB",
87
+ "memoryTotal": "85198045184",
88
+ "architecture": "Ampere"
89
+ },
90
+ {
91
+ "name": "NVIDIA A800-SXM4-80GB",
92
+ "memoryTotal": "85198045184",
93
+ "architecture": "Ampere"
94
+ },
95
+ {
96
+ "name": "NVIDIA A800-SXM4-80GB",
97
+ "memoryTotal": "85198045184",
98
+ "architecture": "Ampere"
99
+ },
100
+ {
101
+ "name": "NVIDIA A800-SXM4-80GB",
102
+ "memoryTotal": "85198045184",
103
+ "architecture": "Ampere"
104
+ }
105
+ ],
106
+ "cudaVersion": "12.1"
107
+ }
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-07T19:53:53.851667884+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-core.log"}
2
+ {"time":"2025-07-07T19:53:55.917977588+08:00","level":"INFO","msg":"created new stream","id":"qhvlkre6"}
3
+ {"time":"2025-07-07T19:53:55.918022111+08:00","level":"INFO","msg":"stream: started","id":"qhvlkre6"}
4
+ {"time":"2025-07-07T19:53:55.918057138+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"qhvlkre6"}
5
+ {"time":"2025-07-07T19:53:55.918080879+08:00","level":"INFO","msg":"handler: started","stream_id":"qhvlkre6"}
6
+ {"time":"2025-07-07T19:53:55.918147188+08:00","level":"INFO","msg":"sender: started","stream_id":"qhvlkre6"}
7
+ {"time":"2025-07-07T19:53:57.234333371+08:00","level":"INFO","msg":"Starting system monitor"}
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Configure stats pid to 125661
3
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug.log
7
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/logs/debug-internal.log
8
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():893] starting backend
12
+ 2025-07-07 19:53:53,840 INFO MainThread:125661 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-07 19:53:53,842 INFO MainThread:125661 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-07 19:53:53,846 INFO MainThread:125661 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-07 19:53:53,847 INFO MainThread:125661 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-07 19:53:53,852 INFO MainThread:125661 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-07 19:53:57,184 INFO MainThread:125661 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-07 19:53:57,375 INFO MainThread:125661 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-07 19:53:57,375 INFO MainThread:125661 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-07 19:53:57,379 INFO MainThread:125661 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-07 19:53:57,379 INFO MainThread:125661 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-07 19:53:57,380 INFO MainThread:125661 [wandb_init.py:init():1150] run started, returning control to user process
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_195353-qhvlkre6/run-qhvlkre6.wandb ADDED
File without changes
ProtT3/all_checkpoints/stage2.5_mol_instruction/wandb/run-20250707_200229-yex1pcwt/files/config.yaml ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": epoch
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": trainer/global_step
12
+ "6":
13
+ - 3
14
+ "7": []
15
+ - "1": dataset0/rouge_2
16
+ "5": 2
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": dataset0/bleu4
22
+ "5": 2
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": dataset0/rouge_1
28
+ "5": 2
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": dataloader0/val loss/dataloader_idx_0
34
+ "5": 2
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": dataset0/acc
40
+ "5": 2
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": dataset0/rouge_l
46
+ "5": 2
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": dataset0/bleu2
52
+ "5": 2
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": dataset0/meteor_score
58
+ "5": 2
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": loss
64
+ "5": 2
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": lr
70
+ "5": 2
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ python_version: 3.10.0
76
+ t:
77
+ "1":
78
+ - 1
79
+ - 5
80
+ - 9
81
+ - 11
82
+ - 33
83
+ - 41
84
+ - 49
85
+ - 53
86
+ - 55
87
+ - 63
88
+ - 103
89
+ "2":
90
+ - 1
91
+ - 5
92
+ - 9
93
+ - 11
94
+ - 33
95
+ - 41
96
+ - 49
97
+ - 53
98
+ - 55
99
+ - 63
100
+ - 103
101
+ "3":
102
+ - 7
103
+ - 23
104
+ - 55
105
+ - 66
106
+ "4": 3.10.0
107
+ "5": 0.19.11
108
+ "6": 4.52.3
109
+ "8":
110
+ - 5
111
+ "12": 0.19.11
112
+ "13": linux-x86_64
113
+ a_max_len:
114
+ value: 36
115
+ accelerator:
116
+ value: gpu
117
+ accumulate_grad_batches:
118
+ value: 1
119
+ batch_size:
120
+ value: 2
121
+ bert_name:
122
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
123
+ caption_eval_epoch:
124
+ value: 10
125
+ check_val_every_n_epoch:
126
+ value: 1
127
+ cross_attention_freq:
128
+ value: 2
129
+ devices:
130
+ value: 0,1,2,3,4,5,6,7
131
+ do_sample:
132
+ value: false
133
+ enable_flash:
134
+ value: false
135
+ enbale_gradient_checkpointing:
136
+ value: false
137
+ filename:
138
+ value: stage2.5_mol_instruction
139
+ filter_side_qa:
140
+ value: false
141
+ inference_batch_size:
142
+ value: 4
143
+ init_checkpoint:
144
+ value: ""
145
+ init_lr:
146
+ value: 0.0001
147
+ llm_name:
148
+ value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300
149
+ llm_tune:
150
+ value: mid_lora
151
+ lora_alpha:
152
+ value: 16
153
+ lora_dropout:
154
+ value: 0.1
155
+ lora_r:
156
+ value: 8
157
+ lr_decay_rate:
158
+ value: 0.9
159
+ max_epochs:
160
+ value: 10
161
+ max_inference_len:
162
+ value: 128
163
+ min_inference_len:
164
+ value: 1
165
+ min_lr:
166
+ value: 1e-05
167
+ mix_dataset:
168
+ value: false
169
+ mode:
170
+ value: train
171
+ num_beams:
172
+ value: 5
173
+ num_query_token:
174
+ value: 8
175
+ num_workers:
176
+ value: 8
177
+ peft_config:
178
+ value: ""
179
+ peft_dir:
180
+ value: ""
181
+ plm_model:
182
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
183
+ plm_tune:
184
+ value: freeze
185
+ precision:
186
+ value: bf16-mixed
187
+ prompt:
188
+ value: 'The protein has the following properties: '
189
+ prot_max_len:
190
+ value: 1024
191
+ q_max_len:
192
+ value: 29
193
+ root:
194
+ value: data
195
+ save_every_n_epochs:
196
+ value: 1
197
+ scheduler:
198
+ value: linear_warmup_cosine_lr
199
+ seed:
200
+ value: 42
201
+ stage1_path:
202
+ value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt
203
+ stage2_path:
204
+ value: ""
205
+ strategy:
206
+ value: deepspeed
207
+ text_max_len:
208
+ value: 1024
209
+ use_wandb_logger:
210
+ value: true
211
+ warmup_lr:
212
+ value: 1e-06
213
+ warmup_steps:
214
+ value: 1000
215
+ weight_decay:
216
+ value: 0.05
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Configure stats pid to 50671
3
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug.log
7
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_153250-690krh73/logs/debug-internal.log
8
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-04 15:32:50,807 INFO MainThread:50671 [wandb_init.py:init():893] starting backend
12
+ 2025-07-04 15:32:50,808 INFO MainThread:50671 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-04 15:32:50,809 INFO MainThread:50671 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-04 15:32:50,809 INFO MainThread:50671 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-04 15:32:50,810 INFO MainThread:50671 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-04 15:32:50,811 INFO MainThread:50671 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-04 15:32:53,521 INFO MainThread:50671 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-04 15:32:53,737 INFO MainThread:50671 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-04 15:32:53,737 INFO MainThread:50671 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-04 15:32:53,744 INFO MainThread:50671 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-04 15:32:53,744 INFO MainThread:50671 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-04 15:32:53,764 INFO MainThread:50671 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-04 15:33:02,588 INFO MainThread:50671 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07041521', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 15, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 128, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
24
+ 2025-07-04 15:33:07,743 INFO MsgRouterThr:50671 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/config.yaml ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.11
4
+ m:
5
+ - "1": dataloader2/val loss/dataloader_idx_2
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": trainer/global_step
12
+ "6":
13
+ - 3
14
+ "7": []
15
+ - "1": dataloader0/val loss/dataloader_idx_0
16
+ "5": 2
17
+ "6":
18
+ - 1
19
+ - 3
20
+ "7": []
21
+ - "1": dataset0/meteor_score
22
+ "5": 2
23
+ "6":
24
+ - 1
25
+ - 3
26
+ "7": []
27
+ - "1": dataset0/rouge_l
28
+ "5": 2
29
+ "6":
30
+ - 1
31
+ - 3
32
+ "7": []
33
+ - "1": dataset0/acc
34
+ "5": 2
35
+ "6":
36
+ - 1
37
+ - 3
38
+ "7": []
39
+ - "1": lr
40
+ "5": 2
41
+ "6":
42
+ - 1
43
+ - 3
44
+ "7": []
45
+ - "1": dataset0/rouge_2
46
+ "5": 2
47
+ "6":
48
+ - 1
49
+ - 3
50
+ "7": []
51
+ - "1": dataset0/bleu2
52
+ "5": 2
53
+ "6":
54
+ - 1
55
+ - 3
56
+ "7": []
57
+ - "1": dataset0/bleu4
58
+ "5": 2
59
+ "6":
60
+ - 1
61
+ - 3
62
+ "7": []
63
+ - "1": dataset0/rouge_1
64
+ "5": 2
65
+ "6":
66
+ - 1
67
+ - 3
68
+ "7": []
69
+ - "1": loss
70
+ "5": 2
71
+ "6":
72
+ - 1
73
+ - 3
74
+ "7": []
75
+ - "1": epoch
76
+ "5": 2
77
+ "6":
78
+ - 1
79
+ - 3
80
+ "7": []
81
+ python_version: 3.10.0
82
+ t:
83
+ "1":
84
+ - 1
85
+ - 5
86
+ - 9
87
+ - 11
88
+ - 33
89
+ - 41
90
+ - 49
91
+ - 53
92
+ - 55
93
+ - 63
94
+ - 103
95
+ "2":
96
+ - 1
97
+ - 5
98
+ - 9
99
+ - 11
100
+ - 33
101
+ - 41
102
+ - 49
103
+ - 53
104
+ - 55
105
+ - 63
106
+ - 103
107
+ "3":
108
+ - 7
109
+ - 23
110
+ - 55
111
+ - 66
112
+ "4": 3.10.0
113
+ "5": 0.19.11
114
+ "6": 4.52.3
115
+ "8":
116
+ - 5
117
+ "12": 0.19.11
118
+ "13": linux-x86_64
119
+ a_max_len:
120
+ value: 36
121
+ accelerator:
122
+ value: gpu
123
+ accumulate_grad_batches:
124
+ value: 1
125
+ batch_size:
126
+ value: 32
127
+ bert_name:
128
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft
129
+ caption_eval_epoch:
130
+ value: 10
131
+ check_val_every_n_epoch:
132
+ value: 1
133
+ cross_attention_freq:
134
+ value: 2
135
+ devices:
136
+ value: 0,1,2,3,4,5,6,7
137
+ do_sample:
138
+ value: false
139
+ enable_flash:
140
+ value: false
141
+ enbale_gradient_checkpointing:
142
+ value: false
143
+ filename:
144
+ value: stage2_07041521
145
+ filter_side_qa:
146
+ value: false
147
+ inference_batch_size:
148
+ value: 4
149
+ init_checkpoint:
150
+ value: ""
151
+ init_lr:
152
+ value: 0.0001
153
+ llm_name:
154
+ value: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300
155
+ llm_tune:
156
+ value: mid_lora
157
+ lora_alpha:
158
+ value: 16
159
+ lora_dropout:
160
+ value: 0.1
161
+ lora_r:
162
+ value: 8
163
+ lr_decay_rate:
164
+ value: 0.9
165
+ max_epochs:
166
+ value: 15
167
+ max_inference_len:
168
+ value: 128
169
+ min_inference_len:
170
+ value: 1
171
+ min_lr:
172
+ value: 1e-05
173
+ mix_dataset:
174
+ value: true
175
+ mode:
176
+ value: train
177
+ num_beams:
178
+ value: 5
179
+ num_query_token:
180
+ value: 8
181
+ num_workers:
182
+ value: 8
183
+ peft_config:
184
+ value: ""
185
+ peft_dir:
186
+ value: ""
187
+ plm_model:
188
+ value: /nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m
189
+ plm_tune:
190
+ value: freeze
191
+ precision:
192
+ value: bf16-mixed
193
+ prompt:
194
+ value: 'The protein has the following properties: '
195
+ prot_max_len:
196
+ value: 1024
197
+ q_max_len:
198
+ value: 29
199
+ root:
200
+ value: data
201
+ save_every_n_epochs:
202
+ value: 5
203
+ scheduler:
204
+ value: linear_warmup_cosine_lr
205
+ seed:
206
+ value: 42
207
+ stage1_path:
208
+ value: /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt
209
+ stage2_path:
210
+ value: ""
211
+ strategy:
212
+ value: deepspeed
213
+ text_max_len:
214
+ value: 128
215
+ use_wandb_logger:
216
+ value: true
217
+ warmup_lr:
218
+ value: 1e-06
219
+ warmup_steps:
220
+ value: 1000
221
+ weight_decay:
222
+ value: 0.05
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/output.log ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage2_07041521 exists and is not empty.
2
+ Enabling DeepSpeed BF16. Model parameters and inputs will be cast to `bfloat16`.
3
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
4
+
5
+ | Name | Type | Params | Mode
6
+ -------------------------------------------
7
+ 0 | blip2 | Blip2OPT | 7.9 B | train
8
+ -------------------------------------------
9
+ 104 M Trainable params
10
+ 7.8 B Non-trainable params
11
+ 7.9 B Total params
12
+ 31,459.025Total estimated model params size (MB)
13
+ 174 Modules in train mode
14
+ 1203 Modules in eval mode
15
+ Epoch 9: 100%|██████████████████████████████████████████| 1682/1682 [34:15<00:00, 0.82it/s, v_num=rt6r]BLEU-2 score: 14.521351656885983
16
+ BLEU-4 score: 12.023162430085268██████████████████████████████████████| 313/313 [20:52<00:00, 0.25it/s]
17
+ /nas/shared/kilab/wangyujia/ProtT3/model/dist_funs.py:18: FutureWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/main/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
18
+ sd = self.module.state_dict(destination, prefix, keep_vars)
19
+ 20000it [01:44, 191.35it/s]
20
+ 20000it [00:23, 848.56it/s]
21
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
22
+ Average Meteor score: 20.348670863196457
23
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
24
+ ROUGE score:
25
+ rouge1: 20.24471429685868
26
+ rouge2: 13.338771592014881
27
+ rougeL: 18.14378222211823
28
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/bleu4', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
29
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_1', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
30
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_2', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
31
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/rouge_l', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
32
+ /root/miniconda3/envs/protT3/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:434: It is recommended to use `self.log('dataset0/meteor_score', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
33
+ Epoch 14: 100%|█████████████████████████████████████████| 1682/1682 [35:23<00:00, 0.79it/s, v_num=rt6r]
34
+
35
+ `Trainer.fit` stopped: `max_epochs=15` reached.
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pydantic_core==2.33.2
2
+ psutil==7.0.0
3
+ nvidia-cuda-nvrtc-cu12==12.4.127
4
+ mpmath==1.3.0
5
+ tzdata==2025.2
6
+ contexttimer==0.3.3
7
+ cycler==0.12.1
8
+ python-magic==0.4.27
9
+ pexpect==4.9.0
10
+ sympy==1.13.1
11
+ wrapt==1.17.2
12
+ marisa-trie==1.2.1
13
+ langcodes==3.5.0
14
+ nvidia-nvtx-cu12==12.4.127
15
+ ipython==8.36.0
16
+ opencv-python-headless==4.5.5.64
17
+ MarkupSafe==3.0.2
18
+ jsonschema-specifications==2025.4.1
19
+ wasabi==1.1.3
20
+ blinker==1.9.0
21
+ cfgv==3.4.0
22
+ numpy==2.2.6
23
+ idna==3.10
24
+ nvidia-cufile-cu12==1.11.1.6
25
+ ninja==1.11.1.4
26
+ nvidia-nccl-cu12==2.21.5
27
+ networkx==3.4.2
28
+ certifi==2025.4.26
29
+ deepspeed==0.16.10+b666844f
30
+ pure_eval==0.2.3
31
+ packaging==24.2
32
+ nltk==3.9.1
33
+ contourpy==1.3.2
34
+ pre_commit==4.2.0
35
+ nodeenv==1.9.1
36
+ setuptools==78.1.1
37
+ annotated-types==0.7.0
38
+ multidict==6.4.4
39
+ requests==2.32.3
40
+ tornado==6.5.1
41
+ triton==3.2.0
42
+ pillow==11.2.1
43
+ decord==0.6.0
44
+ shellingham==1.5.4
45
+ streamlit==1.45.1
46
+ pydeck==0.9.1
47
+ confection==0.1.5
48
+ exceptiongroup==1.3.0
49
+ prompt_toolkit==3.0.51
50
+ text-unidecode==1.3
51
+ nvidia-cufft-cu12==11.2.1.3
52
+ antlr4-python3-runtime==4.9.3
53
+ fairscale==0.4.4
54
+ rouge_score==0.1.2
55
+ nvidia-cudnn-cu12==9.1.0.70
56
+ tqdm==4.67.1
57
+ rich==14.0.0
58
+ frozenlist==1.6.0
59
+ webencodings==0.5.1
60
+ altair==5.5.0
61
+ opendatasets==0.1.22
62
+ nvidia-curand-cu12==10.3.5.147
63
+ protobuf==6.31.0
64
+ asttokens==3.0.0
65
+ wheel==0.45.1
66
+ hf-xet==1.1.2
67
+ weasel==0.4.1
68
+ aiosignal==1.3.2
69
+ absl-py==2.2.2
70
+ thinc==8.3.6
71
+ torchvision==0.21.0
72
+ pandas==2.2.3
73
+ fonttools==4.58.0
74
+ bleach==6.2.0
75
+ typing-inspection==0.4.1
76
+ ftfy==6.3.1
77
+ typing_extensions==4.13.2
78
+ nvidia-ml-py==12.575.51
79
+ python-slugify==8.0.4
80
+ lightning-utilities==0.14.3
81
+ py-cpuinfo==9.0.0
82
+ smmap==5.0.2
83
+ regex==2024.11.6
84
+ scikit-image==0.25.2
85
+ iopath==0.1.10
86
+ spacy-legacy==3.0.12
87
+ hjson==3.1.0
88
+ executing==2.2.0
89
+ kiwisolver==1.4.8
90
+ scipy==1.15.3
91
+ aiohappyeyeballs==2.6.1
92
+ toml==0.10.2
93
+ jedi==0.19.2
94
+ GitPython==3.1.44
95
+ ptyprocess==0.7.0
96
+ kaggle==1.7.4.5
97
+ braceexpand==0.1.7
98
+ wcwidth==0.2.13
99
+ nvidia-cuda-runtime-cu12==12.4.127
100
+ pytorch-lightning==2.5.1.post0
101
+ Jinja2==3.1.6
102
+ urllib3==2.4.0
103
+ watchdog==6.0.0
104
+ filelock==3.18.0
105
+ propcache==0.3.1
106
+ torch==2.6.0
107
+ nvidia-cusparse-cu12==12.3.1.170
108
+ cymem==2.0.11
109
+ nvidia-cusolver-cu12==11.6.1.9
110
+ murmurhash==1.0.13
111
+ catalogue==2.0.10
112
+ yarl==1.20.0
113
+ charset-normalizer==3.4.2
114
+ gitdb==4.0.12
115
+ matplotlib==3.10.3
116
+ portalocker==3.1.1
117
+ platformdirs==4.3.8
118
+ async-timeout==5.0.1
119
+ parso==0.8.4
120
+ markdown-it-py==3.0.0
121
+ omegaconf==2.3.0
122
+ cloudpathlib==0.21.1
123
+ nvidia-cusparselt-cu12==0.6.2
124
+ spacy-loggers==1.0.5
125
+ srsly==2.5.1
126
+ identify==2.6.12
127
+ rpds-py==0.25.1
128
+ spacy==3.8.7
129
+ matplotlib-inline==0.1.7
130
+ smart-open==7.1.0
131
+ pydantic==2.11.5
132
+ mdurl==0.1.2
133
+ virtualenv==20.31.2
134
+ pytz==2025.2
135
+ pycocotools==2.0.8
136
+ six==1.17.0
137
+ decorator==5.2.1
138
+ referencing==0.36.2
139
+ sentencepiece==0.2.0
140
+ PyYAML==6.0.2
141
+ pycocoevalcap==1.2
142
+ imageio==2.37.0
143
+ distlib==0.3.9
144
+ pyarrow==20.0.0
145
+ tenacity==9.1.2
146
+ language_data==1.3.0
147
+ nvidia-cuda-cupti-cu12==12.4.127
148
+ blis==1.3.0
149
+ Pygments==2.19.1
150
+ tifffile==2025.5.10
151
+ pyparsing==3.2.3
152
+ cachetools==5.5.2
153
+ safetensors==0.5.3
154
+ attrs==25.3.0
155
+ webdataset==0.2.111
156
+ plotly==6.1.1
157
+ nvidia-cublas-cu12==12.4.5.8
158
+ timm==0.4.12
159
+ torchmetrics==1.7.1
160
+ nvidia-nvjitlink-cu12==12.4.127
161
+ stack-data==0.6.3
162
+ python-dateutil==2.9.0.post0
163
+ lazy_loader==0.4
164
+ traitlets==5.14.3
165
+ einops==0.8.1
166
+ salesforce-lavis==1.0.2
167
+ joblib==1.5.1
168
+ msgpack==1.1.0
169
+ tokenizers==0.21.1
170
+ sentry-sdk==2.29.1
171
+ oss2==2.15.0
172
+ setproctitle==1.3.6
173
+ pip==25.1.1
174
+ cffi==1.17.1
175
+ transformers==4.52.3
176
+ narwhals==1.41.0
177
+ aliyun-python-sdk-core==2.16.0
178
+ jsonschema==4.24.0
179
+ flash-attn==2.7.1.post1
180
+ preshed==3.0.10
181
+ multiprocess==0.70.16
182
+ cryptography==45.0.3
183
+ aliyun-python-sdk-kms==2.16.5
184
+ scikit-learn==1.6.1
185
+ huggingface-hub==0.32.1
186
+ crcmod==1.7
187
+ typer==0.16.0
188
+ web.py==0.62
189
+ docker-pycreds==0.4.0
190
+ xxhash==3.5.0
191
+ bigmodelvis==0.0.1
192
+ datasets==3.6.0
193
+ more-itertools==10.7.0
194
+ yacs==0.1.8
195
+ jmespath==0.10.0
196
+ aiohttp==3.12.2
197
+ opencv-python==4.11.0.86
198
+ pycparser==2.22
199
+ threadpoolctl==3.6.0
200
+ jaraco.functools==4.1.0
201
+ click==8.2.1
202
+ wandb==0.19.11
203
+ opendelta==0.3.2
204
+ pycryptodome==3.23.0
205
+ pathlib==1.0.1
206
+ dill==0.3.8
207
+ fsspec==2025.3.0
208
+ delta-center-client==0.0.4
209
+ cheroot==10.0.1
210
+ typing_extensions==4.12.2
211
+ platformdirs==4.2.2
212
+ jaraco.text==3.12.1
213
+ packaging==24.2
214
+ inflect==7.3.1
215
+ jaraco.context==5.3.0
216
+ wheel==0.45.1
217
+ typeguard==4.3.0
218
+ more-itertools==10.3.0
219
+ tomli==2.0.1
220
+ importlib_metadata==8.0.0
221
+ backports.tarfile==1.2.0
222
+ zipp==3.19.2
223
+ jaraco.collections==5.1.0
224
+ autocommand==2.2.2
225
+ jaraco.functools==4.0.1
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-metadata.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.0",
4
+ "startedAt": "2025-07-04T07:46:08.916822Z",
5
+ "args": [
6
+ "--devices",
7
+ "0,1,2,3,4,5,6,7",
8
+ "--mode",
9
+ "train",
10
+ "--filename",
11
+ "stage2_07041521",
12
+ "--num_query_token",
13
+ "8",
14
+ "--save_every_n_epochs",
15
+ "5",
16
+ "--max_epochs",
17
+ "15",
18
+ "--batch_size",
19
+ "32",
20
+ "--precision",
21
+ "bf16-mixed",
22
+ "--num_workers",
23
+ "8",
24
+ "--plm_model",
25
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m",
26
+ "--bert_name",
27
+ "/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft",
28
+ "--llm_name",
29
+ "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300",
30
+ "--llm_tune",
31
+ "mid_lora",
32
+ "--mix_dataset",
33
+ "--stage1_path",
34
+ "/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt",
35
+ "--use_wandb_logger"
36
+ ],
37
+ "program": "/nas/shared/kilab/wangyujia/ProtT3/stage2.py",
38
+ "codePath": "stage2.py",
39
+ "email": "gia0603yucca@gmail.com",
40
+ "root": "./all_checkpoints/stage2_07041521/",
41
+ "host": "dsw-265304-b8d7644bb-bs7r7",
42
+ "executable": "/root/miniconda3/envs/protT3/bin/python",
43
+ "codePathLocal": "stage2.py",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 64,
46
+ "gpu": "NVIDIA A800-SXM4-80GB",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "1623302262784",
51
+ "used": "1266618368"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "549755813888"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 64
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A800-SXM4-80GB",
64
+ "memoryTotal": "85198045184",
65
+ "architecture": "Ampere"
66
+ },
67
+ {
68
+ "name": "NVIDIA A800-SXM4-80GB",
69
+ "memoryTotal": "85198045184",
70
+ "architecture": "Ampere"
71
+ },
72
+ {
73
+ "name": "NVIDIA A800-SXM4-80GB",
74
+ "memoryTotal": "85198045184",
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA A800-SXM4-80GB",
79
+ "memoryTotal": "85198045184",
80
+ "architecture": "Ampere"
81
+ },
82
+ {
83
+ "name": "NVIDIA A800-SXM4-80GB",
84
+ "memoryTotal": "85198045184",
85
+ "architecture": "Ampere"
86
+ },
87
+ {
88
+ "name": "NVIDIA A800-SXM4-80GB",
89
+ "memoryTotal": "85198045184",
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A800-SXM4-80GB",
94
+ "memoryTotal": "85198045184",
95
+ "architecture": "Ampere"
96
+ },
97
+ {
98
+ "name": "NVIDIA A800-SXM4-80GB",
99
+ "memoryTotal": "85198045184",
100
+ "architecture": "Ampere"
101
+ }
102
+ ],
103
+ "cudaVersion": "12.1"
104
+ }
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"dataset0/rouge_2":13.33877182006836,"epoch":14,"dataset0/bleu4":12.023162841796875,"dataset0/bleu2":14.52135181427002,"dataset0/acc":0,"dataset0/rouge_l":18.143781661987305,"dataloader2/val loss/dataloader_idx_2":2.210709571838379,"_step":518,"loss":0.12754811346530914,"dataset0/meteor_score":20.348670959472656,"_wandb":{"runtime":34542},"lr":1.0983357242366765e-05,"dataset0/rouge_1":20.244714736938477,"trainer/global_step":25229,"_timestamp":1.7516496969635456e+09,"dataloader0/val loss/dataloader_idx_0":0.3717030882835388,"_runtime":34528.047018257}
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-04T15:46:08.966654664+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-core.log"}
2
+ {"time":"2025-07-04T15:46:10.62729617+08:00","level":"INFO","msg":"created new stream","id":"ds7lrt6r"}
3
+ {"time":"2025-07-04T15:46:10.627339189+08:00","level":"INFO","msg":"stream: started","id":"ds7lrt6r"}
4
+ {"time":"2025-07-04T15:46:10.627374947+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ds7lrt6r"}
5
+ {"time":"2025-07-04T15:46:10.627404904+08:00","level":"INFO","msg":"handler: started","stream_id":"ds7lrt6r"}
6
+ {"time":"2025-07-04T15:46:10.627398441+08:00","level":"INFO","msg":"sender: started","stream_id":"ds7lrt6r"}
7
+ {"time":"2025-07-04T15:46:12.482815718+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-07-04T21:58:22.739226816+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:36512->172.67.193.61:443: read: connection timed out"}
9
+ {"time":"2025-07-04T22:01:34.73927758+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56674->172.67.193.61:443: read: connection timed out"}
10
+ {"time":"2025-07-04T22:01:43.328655381+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
11
+ {"time":"2025-07-04T22:02:15.686026868+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-07-04T22:02:50.214808897+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
13
+ {"time":"2025-07-04T22:03:28.950188225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
14
+ {"time":"2025-07-04T22:04:14.982810813+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
15
+ {"time":"2025-07-04T22:05:22.734102314+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
16
+ {"time":"2025-07-04T22:06:52.750047504+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
17
+ {"time":"2025-07-04T22:08:22.775414283+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
18
+ {"time":"2025-07-04T22:08:43.307492899+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
19
+ {"time":"2025-07-04T22:09:52.781192276+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
20
+ {"time":"2025-07-04T22:11:13.328660156+08:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000386663,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"2nedcn0bl5yp\" connection_id:\"127.0.0.1:57318\")"}
21
+ {"time":"2025-07-04T22:11:22.78819562+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
22
+ {"time":"2025-07-04T22:12:52.789417986+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
23
+ {"time":"2025-07-04T22:13:05.234783312+08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000850585,"work":"WorkRecord(*service_go_proto.Record_OutputRaw); Control(connection_id:\"127.0.0.1:57318\")"}
24
+ {"time":"2025-07-04T22:13:12.48514539+08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.000329899,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
25
+ {"time":"2025-07-04T22:13:12.516285255+08:00","level":"WARN","msg":"runwork: taking a long time","seconds":600.00042574,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
26
+ {"time":"2025-07-04T22:14:22.815202117+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
27
+ {"time":"2025-07-04T22:15:52.82570124+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
28
+ {"time":"2025-07-04T22:17:22.890129793+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
29
+ {"time":"2025-07-04T22:18:29.532924447+08:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":1036.204664967,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"2nedcn0bl5yp\" connection_id:\"127.0.0.1:57318\")"}
30
+ {"time":"2025-07-04T22:18:29.532969506+08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":924.299032896,"work":"WorkRecord(*service_go_proto.Record_OutputRaw); Control(connection_id:\"127.0.0.1:57318\")"}
31
+ {"time":"2025-07-04T22:18:29.532991216+08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":917.048223115,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
32
+ {"time":"2025-07-04T22:18:29.532997449+08:00","level":"INFO","msg":"runwork: succeeded after taking longer than expected","seconds":917.017162756,"work":"WorkRecord(*service_go_proto.Record_Stats); Control(always_send:true)"}
33
+ {"time":"2025-07-04T22:22:02.242147225+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:53384->104.21.20.172:443: read: connection reset by peer"}
34
+ {"time":"2025-07-04T22:26:53.14669432+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
35
+ {"time":"2025-07-04T22:27:48.780316277+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
36
+ {"time":"2025-07-04T22:31:07.795211328+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:59586->104.21.20.172:443: read: connection timed out"}
37
+ {"time":"2025-07-04T22:34:24.403211244+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:36792->172.67.193.61:443: read: connection timed out"}
38
+ {"time":"2025-07-04T22:42:22.611231819+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:52098->172.67.193.61:443: read: connection timed out"}
39
+ {"time":"2025-07-04T22:43:30.389313147+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:48310->104.21.20.172:443: read: connection reset by peer"}
40
+ {"time":"2025-07-04T22:44:01.015059936+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
41
+ {"time":"2025-07-04T22:46:14.846692259+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
42
+ {"time":"2025-07-04T22:48:31.349968366+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39256->104.21.20.172:443: read: connection reset by peer"}
43
+ {"time":"2025-07-04T22:49:38.015743829+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38292->172.67.193.61:443: read: connection reset by peer"}
44
+ {"time":"2025-07-04T22:50:09.683679573+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": http2: client conn is closed"}
45
+ {"time":"2025-07-04T22:52:08.558045187+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:59988->104.21.20.172:443: read: connection reset by peer"}
46
+ {"time":"2025-07-04T22:54:22.824301514+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38894->172.67.193.61:443: read: connection reset by peer"}
47
+ {"time":"2025-07-04T22:59:26.307679579+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:51684->104.21.20.172:443: read: connection reset by peer"}
48
+ {"time":"2025-07-04T23:02:32.979198883+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:58210->172.67.193.61:443: read: connection timed out"}
49
+ {"time":"2025-07-04T23:10:21.459215862+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:35172->172.67.193.61:443: read: connection timed out"}
50
+ {"time":"2025-07-04T23:11:15.167490198+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:54942->104.21.20.172:443: read: connection reset by peer"}
51
+ {"time":"2025-07-04T23:16:10.863292487+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:43428->104.21.20.172:443: read: connection reset by peer"}
52
+ {"time":"2025-07-04T23:20:22.264503678+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
53
+ {"time":"2025-07-04T23:23:20.723212059+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38048->104.21.20.172:443: read: connection timed out"}
54
+ {"time":"2025-07-04T23:25:26.885991394+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:42070->172.67.193.61:443: read: connection reset by peer"}
55
+ {"time":"2025-07-04T23:26:43.632088162+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
56
+ {"time":"2025-07-04T23:29:40.115213912+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39740->104.21.20.172:443: read: connection timed out"}
57
+ {"time":"2025-07-04T23:30:43.633929839+08:00","level":"ERROR","msg":"sender: sendStopStatus: failed to get run stopped status: net/http: request canceled (Client.Timeout or context cancellation while reading body)"}
58
+ {"time":"2025-07-04T23:31:28.634946467+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
59
+ {"time":"2025-07-04T23:32:54.67521322+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:51016->172.67.193.61:443: read: connection timed out"}
60
+ {"time":"2025-07-04T23:39:42.739210995+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:58242->104.21.20.172:443: read: connection timed out"}
61
+ {"time":"2025-07-04T23:45:14.003206141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:41730->172.67.193.61:443: read: connection timed out"}
62
+ {"time":"2025-07-04T23:48:41.876197491+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39056->172.67.193.61:443: read: connection timed out"}
63
+ {"time":"2025-07-04T23:52:10.25916891+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56968->172.67.193.61:443: read: connection timed out"}
64
+ {"time":"2025-07-04T23:55:53.491192209+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:52384->172.67.193.61:443: read: connection timed out"}
65
+ {"time":"2025-07-04T23:58:33.235181122+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:41192->172.67.193.61:443: read: connection timed out"}
66
+ {"time":"2025-07-05T00:01:39.09118175+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:54870->172.67.193.61:443: read: connection timed out"}
67
+ {"time":"2025-07-05T00:03:30.660362141+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
68
+ {"time":"2025-07-05T00:06:22.739197171+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56300->104.21.20.172:443: read: connection timed out"}
69
+ {"time":"2025-07-05T00:07:01.140306954+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:33830->172.67.193.61:443: read: connection reset by peer"}
70
+ {"time":"2025-07-05T00:10:11.603177492+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38990->104.21.20.172:443: read: connection timed out"}
71
+ {"time":"2025-07-05T00:17:09.907203145+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:58654->104.21.20.172:443: read: connection timed out"}
72
+ {"time":"2025-07-05T00:20:10.131225125+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:39570->172.67.193.61:443: read: connection timed out"}
73
+ {"time":"2025-07-05T00:23:22.643197817+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:57940->172.67.193.61:443: read: connection timed out"}
74
+ {"time":"2025-07-05T00:26:13.651201419+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:37792->172.67.193.61:443: read: connection timed out"}
75
+ {"time":"2025-07-05T00:28:56.467221564+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:38640->104.21.20.172:443: read: connection timed out"}
76
+ {"time":"2025-07-05T00:32:27.924195852+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:33266->104.21.20.172:443: read: connection timed out"}
77
+ {"time":"2025-07-05T00:33:00.356828932+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:56794->104.21.20.172:443: read: connection reset by peer"}
78
+ {"time":"2025-07-05T00:36:52.115188168+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:44060->104.21.20.172:443: read: connection timed out"}
79
+ {"time":"2025-07-05T00:40:28.83076072+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/graphql\": context deadline exceeded"}
80
+ {"time":"2025-07-05T00:41:16.304855216+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
81
+ {"time":"2025-07-05T00:44:59.539204741+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:54050->104.21.20.172:443: read: connection timed out"}
82
+ {"time":"2025-07-05T00:46:47.38055068+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:47426->172.67.193.61:443: read: connection reset by peer"}
83
+ {"time":"2025-07-05T00:48:09.874069624+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:60378->104.21.20.172:443: read: connection reset by peer"}
84
+ {"time":"2025-07-05T00:50:24.818784704+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": unexpected EOF"}
85
+ {"time":"2025-07-05T01:02:13.78019116+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:44352->104.21.20.172:443: read: connection timed out"}
86
+ {"time":"2025-07-05T01:10:57.377024443+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": context deadline exceeded"}
87
+ {"time":"2025-07-05T01:20:40.211190451+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.bandw.top/files/gia0603yucca/stage2_07041521/ds7lrt6r/file_stream\": read tcp 10.1.8.17:50414->104.21.20.172:443: read: connection timed out"}
88
+ {"time":"2025-07-05T01:21:51.097875981+08:00","level":"INFO","msg":"stream: closing","id":"ds7lrt6r"}
89
+ {"time":"2025-07-05T01:21:51.097937445+08:00","level":"INFO","msg":"Stopping system monitor"}
90
+ {"time":"2025-07-05T01:21:51.099273597+08:00","level":"INFO","msg":"Stopped system monitor"}
91
+ {"time":"2025-07-05T01:21:58.380081154+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
92
+ {"time":"2025-07-05T01:22:06.296069471+08:00","level":"INFO","msg":"handler: closed","stream_id":"ds7lrt6r"}
93
+ {"time":"2025-07-05T01:22:06.296102451+08:00","level":"INFO","msg":"sender: closed","stream_id":"ds7lrt6r"}
94
+ {"time":"2025-07-05T01:22:06.296100202+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ds7lrt6r"}
95
+ {"time":"2025-07-05T01:22:06.302257653+08:00","level":"INFO","msg":"stream: closed","id":"ds7lrt6r"}
ProtT3/all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
2
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Configure stats pid to 56865
3
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
4
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Loading settings from /nas/shared/kilab/wangyujia/ProtT3/wandb/settings
5
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_setup.py:_flush():70] Loading settings from environment variables
6
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:setup_run_log_directory():724] Logging user logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug.log
7
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to ./all_checkpoints/stage2_07041521/wandb/run-20250704_154608-ds7lrt6r/logs/debug-internal.log
8
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():852] calling init triggers
9
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():893] starting backend
12
+ 2025-07-04 15:46:08,903 INFO MainThread:56865 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-07-04 15:46:08,914 INFO MainThread:56865 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-07-04 15:46:08,916 INFO MainThread:56865 [wandb_init.py:init():907] backend started and connected
15
+ 2025-07-04 15:46:08,917 INFO MainThread:56865 [wandb_init.py:init():1005] updated telemetry
16
+ 2025-07-04 15:46:08,922 INFO MainThread:56865 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
17
+ 2025-07-04 15:46:12,402 INFO MainThread:56865 [wandb_init.py:init():1104] starting run threads in backend
18
+ 2025-07-04 15:46:12,679 INFO MainThread:56865 [wandb_run.py:_console_start():2573] atexit reg
19
+ 2025-07-04 15:46:12,680 INFO MainThread:56865 [wandb_run.py:_redirect():2421] redirect: wrap_raw
20
+ 2025-07-04 15:46:12,685 INFO MainThread:56865 [wandb_run.py:_redirect():2490] Wrapping output streams.
21
+ 2025-07-04 15:46:12,686 INFO MainThread:56865 [wandb_run.py:_redirect():2513] Redirects installed.
22
+ 2025-07-04 15:46:12,697 INFO MainThread:56865 [wandb_init.py:init():1150] run started, returning control to user process
23
+ 2025-07-04 15:46:21,744 INFO MainThread:56865 [wandb_run.py:_config_callback():1436] config_cb None None {'filename': 'stage2_07041521', 'seed': 42, 'mode': 'train', 'strategy': 'deepspeed', 'accelerator': 'gpu', 'devices': '0,1,2,3,4,5,6,7', 'precision': 'bf16-mixed', 'max_epochs': 15, 'accumulate_grad_batches': 1, 'check_val_every_n_epoch': 1, 'enable_flash': False, 'use_wandb_logger': True, 'mix_dataset': True, 'save_every_n_epochs': 5, 'bert_name': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/microsoft', 'cross_attention_freq': 2, 'num_query_token': 8, 'llm_name': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'num_beams': 5, 'do_sample': False, 'max_inference_len': 128, 'min_inference_len': 1, 'llm_tune': 'mid_lora', 'peft_config': '', 'peft_dir': '', 'plm_model': '/nas/shared/kilab/wangyujia/ProtT3/plm_model/esm2-150m', 'plm_tune': 'freeze', 'lora_r': 8, 'lora_alpha': 16, 'lora_dropout': 0.1, 'enbale_gradient_checkpointing': False, 'weight_decay': 0.05, 'init_lr': 0.0001, 'min_lr': 1e-05, 'warmup_lr': 1e-06, 'warmup_steps': 1000, 'lr_decay_rate': 0.9, 'scheduler': 'linear_warmup_cosine_lr', 'stage1_path': '/nas/shared/kilab/wangyujia/ProtT3/all_checkpoints/stage1_06290009_deepspeed/epoch=19.ckpt/converted.ckpt', 'stage2_path': '', 'init_checkpoint': '', 'caption_eval_epoch': 10, 'num_workers': 8, 'batch_size': 32, 'inference_batch_size': 4, 'root': 'data', 'text_max_len': 128, 'q_max_len': 29, 'a_max_len': 36, 'prot_max_len': 1024, 'prompt': 'The protein has the following properties: ', 'filter_side_qa': False}
24
+ 2025-07-05 01:21:51,095 INFO MsgRouterThr:56865 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.